In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
from ssfeature import get_ssfeature
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
from functions import *
import torch

# Load data

In [3]:
# Read the train/test CSV files of each task (Topt, pHopt, Tm), one task per statement.
topt_train, topt_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/Topt/train_os.csv', '../data/Topt/test.csv')
)
phopt_train, phopt_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/pHopt/train_pH.csv', '../data/pHopt/test_pH.csv')
)
tm_train, tm_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/Tm/Tm_Train.csv', '../data/Tm/Tm_Test.csv')
)

# Compute SSFs

In [4]:
def get_ssf_table(table, target_column, seq_column='sequence', feature_fn=None):
    """Build a feature table with the target value as the first column.

    Parameters
    ----------
    table : pandas.DataFrame
        Input rows; must contain ``target_column`` and ``seq_column``.
    target_column : str
        Name of the column holding the target value. It is copied into the
        result under the same name.
    seq_column : str, default 'sequence'
        Name of the column whose values are passed to the featuriser.
    feature_fn : callable, optional
        Called once per row with the ``seq_column`` value; its return value is
        merged into the row with ``dict.update``. Defaults to ``get_ssfeature``.

    Returns
    -------
    pandas.DataFrame
        One row per input row: ``target_column`` followed by the features.
        If a feature shares its name with ``target_column`` it overwrites it.
    """
    if feature_fn is None:
        feature_fn = get_ssfeature

    records = []
    # Iterate both columns once in lockstep rather than materialising
    # list(column) for every row, which made the loop quadratic in len(table).
    for target, sequence in zip(table[target_column], table[seq_column]):
        row = {target_column: target}
        row.update(feature_fn(sequence))
        records.append(row)
    return pd.DataFrame(records)

In [5]:
# Featurise the train and test split of each task; the second argument names
# the target column that get_ssf_table copies into the result.
topt_ssf_train, topt_ssf_test = (
    get_ssf_table(split, 'topt') for split in (topt_train, topt_test)
)

phopt_ssf_train, phopt_ssf_test = (
    get_ssf_table(split, 'pHopt') for split in (phopt_train, phopt_test)
)

tm_ssf_train, tm_ssf_test = (
    get_ssf_table(split, 'tm') for split in (tm_train, tm_test)
)

In [6]:
# Preview the first rows of the Tm test feature table (target column first, then features).
tm_ssf_test.head()

Unnamed: 0,tm,AAC_A,AAC_C,AAC_D,AAC_E,AAC_F,AAC_G,AAC_H,AAC_I,AAC_K,...,_HydrophobicityD2001,_HydrophobicityD2025,_HydrophobicityD2050,_HydrophobicityD2075,_HydrophobicityD2100,_HydrophobicityD3001,_HydrophobicityD3025,_HydrophobicityD3050,_HydrophobicityD3075,_HydrophobicityD3100
0,66.5,0.064706,0.014706,0.05,0.076471,0.020588,0.073529,0.035294,0.076471,0.05,...,1.471,26.765,52.941,77.059,100.0,0.294,23.824,49.706,70.0,98.824
1,62.1,0.105263,0.026316,0.078947,0.078947,0.026316,0.036842,0.021053,0.031579,0.031579,...,4.211,28.947,56.316,71.053,99.474,0.526,21.579,47.368,75.789,100.0
2,69.2,0.096154,0.0,0.038462,0.141026,0.00641,0.044872,0.00641,0.057692,0.108974,...,1.282,11.538,44.231,68.59,99.359,9.615,33.333,63.462,84.615,98.718
3,67.3,0.042471,0.007722,0.057915,0.073359,0.046332,0.073359,0.011583,0.07722,0.081081,...,1.544,26.255,49.035,75.676,98.842,0.386,16.216,46.332,73.745,100.0
4,63.7,0.071429,0.005495,0.071429,0.082418,0.049451,0.043956,0.021978,0.120879,0.087912,...,2.747,19.231,50.0,76.923,96.703,0.549,24.176,48.352,71.429,99.451


# Pearson Correlation

In [7]:
# Stack the train and test feature tables of each task row-wise into one frame.
# The original row labels are kept, so index values repeat across the two parts.
topt_ssf, phopt_ssf, tm_ssf = (
    pd.concat(parts, axis=0)
    for parts in (
        [topt_ssf_train, topt_ssf_test],
        [phopt_ssf_train, phopt_ssf_test],
        [tm_ssf_train, tm_ssf_test],
    )
)

In [10]:
# Significance threshold applied to the Pearson p-value of each feature.
pv_cut = 0.01


def _significant_columns(table, target, cutoff):
    """Return the columns of ``table`` that correlate significantly with ``target``.

    Every column except the first is tested with ``pearsonr`` against
    ``table[target]``; a column is kept when its p-value is <= ``cutoff``.
    A NaN p-value never passes the comparison, so such columns are dropped.
    """
    target_values = table[target].to_numpy()
    selected = []
    # Column 0 is skipped: get_ssf_table places the target there.
    for column in table.columns[1:]:
        if pearsonr(table[column].to_numpy(), target_values).pvalue <= cutoff:
            selected.append(column)
    return selected


# NOTE(review): topt_ssf / phopt_ssf / tm_ssf pool train and test rows, so this
# selection also sees the test targets — confirm that this is intended.
sigf_topt = _significant_columns(topt_ssf, 'topt', pv_cut)
sigf_phopt = _significant_columns(phopt_ssf, 'pHopt', pv_cut)
sigf_tm = _significant_columns(tm_ssf, 'tm', pv_cut)

# Keep only the features that are significant for all three targets; sorting
# gives the list a stable order.
sigfeatures = sorted(set(sigf_topt) & set(sigf_phopt) & set(sigf_tm))
print(len(sigfeatures))
# dump_pickle(sigfeatures,'../data/sig_ssfs.pkl')



503


# Save ssf as ndarray

In [15]:
# Reload the saved feature names and restrict every split to exactly those
# columns, in that order; all other columns are dropped.
sigfeatures = list(load_pickle('../data/sig_ssfs.pkl'))
topt_ssf_train, topt_ssf_test = topt_ssf_train[sigfeatures], topt_ssf_test[sigfeatures]
phopt_ssf_train, phopt_ssf_test = phopt_ssf_train[sigfeatures], phopt_ssf_test[sigfeatures]
tm_ssf_train, tm_ssf_test = tm_ssf_train[sigfeatures], tm_ssf_test[sigfeatures]

# Disabled: write each split's feature matrix (as a plain array) to disk.
# dump_pickle(topt_ssf_train.values,'../data/Topt/train_ssf.pkl')
# dump_pickle(topt_ssf_test.values,'../data/Topt/test_ssf.pkl')
# dump_pickle(phopt_ssf_train.values,'../data/pHopt/train_ssf.pkl')
# dump_pickle(phopt_ssf_test.values,'../data/pHopt/test_ssf.pkl')
# dump_pickle(tm_ssf_train.values,'../data/Tm/train_ssf.pkl')
# dump_pickle(tm_ssf_test.values,'../data/Tm/test_ssf.pkl')

# PCA

In [44]:
sigfeatures = list(load_pickle('../data/sig_ssfs.pkl'))
# Keep the target as the first column, followed by the selected features.
topt_ssf = topt_ssf[['topt', *sigfeatures]]
phopt_ssf = phopt_ssf[['pHopt', *sigfeatures]]
tm_ssf = tm_ssf[['tm', *sigfeatures]]

In [45]:
# Standardise the feature columns of each pooled table (everything after the
# leading target column); each task gets its own independently fitted scaler.
X_topt = StandardScaler().fit_transform(topt_ssf.iloc[:, 1:].to_numpy())

X_ph = StandardScaler().fit_transform(phopt_ssf.iloc[:, 1:].to_numpy())

X_tm = StandardScaler().fit_transform(tm_ssf.iloc[:, 1:].to_numpy())

In [46]:
# Fit a separate two-component PCA per task and report the fraction of
# variance each component explains.
pca_topt = PCA(n_components=2)
topt_pc = pca_topt.fit_transform(X_topt)
print('Topt:', pca_topt.explained_variance_ratio_, sep='\n')

pca_ph = PCA(n_components=2)
ph_pc = pca_ph.fit_transform(X_ph)
print('pHopt:', pca_ph.explained_variance_ratio_, sep='\n')

pca_tm = PCA(n_components=2)
tm_pc = pca_tm.fit_transform(X_tm)
print('Tm:', pca_tm.explained_variance_ratio_, sep='\n')

Topt:
[0.08140898 0.05864385]
pHopt:
[0.06641301 0.04667219]
Tm:
[0.05993128 0.04868127]


In [None]:
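# Low explained variance: the first two principal components together capture only about 11–14% of the total variance for each data set, so a 2-D PCA projection discards most of the feature information.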

# code test

In [16]:
# Reload the saved Topt feature tables and the list of significant feature names.
train_ssf = load_pickle('../data/Topt/train_ssf.pkl')
test_ssf = load_pickle('../data/Topt/test_ssf.pkl')
sig_ssfs = list(load_pickle('../data/sig_ssfs.pkl'))
# NOTE(review): selecting columns by name needs DataFrames, but the commented-out
# dump calls earlier in this notebook write `.values` arrays — confirm what these
# pickle files actually contain.
train_ssf, test_ssf = train_ssf[sig_ssfs], test_ssf[sig_ssfs]
train_data = pd.read_csv('../data/Topt/train_os.csv')

In [22]:
# Bundle ids, sequences, the feature matrix and the rescaled targets as four
# parallel arrays. 120 and 0 are passed straight to rescale_targets
# (presumably the bounds of the target range — confirm against its definition).
train_pack = [
    np.array(train_data['uniprot_id']),
    np.array(train_data['sequence']),
    train_ssf.values,
    np.array(rescale_targets(list(train_data['topt']), 120, 0)),
]

In [23]:
def split_data(data, ratio=0.1, seed=None):
    """Randomly partition a pack of parallel arrays into two parts.

    Parameters
    ----------
    data : sequence of numpy.ndarray
        Arrays that share the same length along their first axis; each one is
        indexed with an integer index array. The length is taken from
        ``data[0]``.
    ratio : float, default 0.1
        Fraction of the rows placed in the second part; the row count is
        ``int(len(data[0]) * ratio)``, i.e. truncated towards zero.
    seed : int, optional
        When given, a dedicated generator seeded with this value shuffles the
        rows, so the split is reproducible. When ``None`` the global NumPy
        random state is used, exactly as before.

    Returns
    -------
    (data_0, data_1) : tuple of lists
        ``data_1`` holds the selected rows of every array and ``data_0`` the
        remaining rows, both in shuffled order.
    """
    n_rows = len(data[0])
    order = np.arange(n_rows)
    if seed is None:
        np.random.shuffle(order)
    else:
        np.random.default_rng(seed).shuffle(order)

    n_selected = int(n_rows * ratio)
    selected, remaining = order[:n_selected], order[n_selected:]
    # Apply the same row permutation to every array so they stay aligned.
    data_0 = [field[remaining] for field in data]
    data_1 = [field[selected] for field in data]
    return data_0, data_1

In [24]:
# Move 10% of the rows into a dev set. NOTE: train_pack is rebound to the
# remaining rows, so re-running this cell shrinks the training set again.
train_pack, dev_pack = split_data(train_pack, ratio=0.1)

In [33]:
# Slice out mini-batch number `i` (rows i*min_size .. (i+1)*min_size - 1) from
# every array in train_pack.
idx = np.arange(len(train_pack[0]))
i = 10
min_size = 4
batch_data = [field[idx[i * min_size:(i + 1) * min_size]] for field in train_pack]

In [35]:
# Unpack the batch in the order train_pack was built: ids, sequences, features, targets.
ids, seqs, ssfs, targets = batch_data

In [38]:
# Wrap each target in its own one-element row so the tensor has one column.
target_values = torch.FloatTensor(np.array([[value] for value in targets]))

In [44]:
# Check the shape of the target tensor.
target_values.size()

torch.Size([4, 1])

In [40]:
# Convert the batch's feature rows to a 32-bit float tensor.
temp = torch.FloatTensor( ssfs )