In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
from ssfeature import get_ssfeature
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
from functions import load_pickle, dump_pickle

# Load data

In [2]:
# Read the train/test CSV splits for each of the three targets (Topt, pHopt, Tm).
topt_train, topt_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/Topt/train_os.csv', '../data/Topt/test.csv')
)
phopt_train, phopt_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/pHopt/train_pH.csv', '../data/pHopt/test_pH.csv')
)
tm_train, tm_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/Tm/Tm_Train.csv', '../data/Tm/Tm_Test.csv')
)

# Compute SSFs

In [4]:
def get_ssf_table(table, target_column, seq_column='sequence'):
    """Build a table with the target value and the SSFs of every sequence.

    Parameters
    ----------
    table : pandas.DataFrame
        Input rows; must contain `target_column` and `seq_column`.
    target_column : str
        Column whose value is copied into the output under the same name.
    seq_column : str, default 'sequence'
        Column whose values are passed, one at a time, to `get_ssfeature`.

    Returns
    -------
    pandas.DataFrame
        One row per input row: the target value first, followed by whatever
        entries `get_ssfeature` returned for that row's sequence. An entry
        keyed like `target_column` would overwrite the target value.
    """
    records = []
    # Walk both columns in lockstep so each is read once; converting the
    # columns to lists inside the loop made the function quadratic in rows.
    for target, sequence in zip(table[target_column], table[seq_column]):
        record = {target_column: target}
        record.update(get_ssfeature(sequence))
        records.append(record)
    return pd.DataFrame(records)

In [5]:
# One-off SSF extraction, left commented out. Each train/test split is run
# through get_ssf_table and the result is pickled; the 'Pearson Correlation'
# section loads these same pickle paths instead of recomputing.
# Uncomment to regenerate the cached tables (presumably slow -- TODO confirm).

# topt_ssf_train = get_ssf_table(topt_train,'topt')
# dump_pickle(topt_ssf_train,'../data/Topt/train_ssf.pkl')
# topt_ssf_test = get_ssf_table(topt_test,'topt')
# dump_pickle(topt_ssf_test,'../data/Topt/test_ssf.pkl')

# phopt_ssf_train = get_ssf_table( phopt_train,'pHopt')
# dump_pickle( phopt_ssf_train,'../data/pHopt/train_ssf.pkl')
# phopt_ssf_test = get_ssf_table( phopt_test,'pHopt')
# dump_pickle( phopt_ssf_test,'../data/pHopt/test_ssf.pkl')

# tm_ssf_train = get_ssf_table( tm_train,'tm')
# dump_pickle( tm_ssf_train,'../data/Tm/train_ssf.pkl')
# tm_ssf_test = get_ssf_table( tm_test,'tm')
# dump_pickle( tm_ssf_test,'../data/Tm/test_ssf.pkl')

# Pearson Correlation

In [42]:
# Reload the cached SSF tables: one train and one test pickle per target.
topt_ssf_train = load_pickle('../data/Topt/train_ssf.pkl')
topt_ssf_test = load_pickle('../data/Topt/test_ssf.pkl')

phopt_ssf_train = load_pickle('../data/pHopt/train_ssf.pkl')
phopt_ssf_test = load_pickle('../data/pHopt/test_ssf.pkl')

tm_ssf_train = load_pickle('../data/Tm/train_ssf.pkl')
tm_ssf_test = load_pickle('../data/Tm/test_ssf.pkl')

In [43]:
# Pool the train and test rows of each target into a single table.
# Row labels are kept as-is, so labels from the two splits may repeat.
topt_ssf = pd.concat((topt_ssf_train, topt_ssf_test), axis=0)
phopt_ssf = pd.concat((phopt_ssf_train, phopt_ssf_test), axis=0)
tm_ssf = pd.concat((tm_ssf_train, tm_ssf_test), axis=0)

In [32]:
# Pearson-based feature selection, left commented out; the recorded output of
# this cell is shown below it.
# For each pooled table, every column from index 1 onward is correlated with
# the target column (column 0 is skipped -- get_ssf_table puts the target
# first), and a column is kept when its p-value is <= pv_cut.
# sigfeatures is the intersection of the three per-target selections, i.e.
# the features that pass the cut for Topt, pHopt and Tm simultaneously.

# pv_cut = 0.01
# sigf_topt = []; sigf_phopt = []; sigf_tm = []
# for i in range(1,len(topt_ssf.columns)):
#     res = pearsonr( list(topt_ssf[topt_ssf.columns[i]]), list(topt_ssf.topt) )
#     if res.pvalue <= pv_cut:
#         sigf_topt.append(topt_ssf.columns[i])

# for i in range(1,len(phopt_ssf.columns)):
#     res = pearsonr( list(phopt_ssf[phopt_ssf.columns[i]]), list(phopt_ssf.pHopt) )
#     if res.pvalue <= pv_cut:
#         sigf_phopt.append(phopt_ssf.columns[i])

# for i in range(1,len(tm_ssf.columns)):
#     res = pearsonr( list(tm_ssf[tm_ssf.columns[i]]), list(tm_ssf.tm) )
#     if res.pvalue <= pv_cut:
#         sigf_tm.append(tm_ssf.columns[i])
# sigfeatures = set(sigf_topt)&set(sigf_phopt)&set(sigf_tm)
# print(len(sigfeatures))



503


In [34]:
# Caches the selected feature set (left commented out); the following cell
# reloads it from this same path.
# dump_pickle(sigfeatures,'../data/sig_ssfs.pkl')

In [44]:
# Keep only the target column plus the shared significant features in each table.
# The commented-out selection cell builds the cached collection as a set, and
# iterating a set of strings can give a different order in every kernel session
# (string hash randomization). Sorting pins the column order so reruns produce
# identically laid-out tables. key=str keeps the sort safe if names are not all
# strings.
sigfeatures = sorted(load_pickle('../data/sig_ssfs.pkl'), key=str)

topt_ssf = topt_ssf[['topt'] + sigfeatures]
phopt_ssf = phopt_ssf[['pHopt'] + sigfeatures]
tm_ssf = tm_ssf[['tm'] + sigfeatures]

# PCA

In [45]:
# Standardize the feature columns of each table (everything after column 0,
# which holds the target) with a separate scaler fitted per dataset.
X_topt, X_ph, X_tm = (
    StandardScaler().fit_transform(ssf_table.iloc[:, 1:].values)
    for ssf_table in (topt_ssf, phopt_ssf, tm_ssf)
)

In [46]:
# Project each standardized feature matrix onto its first two principal
# components and report the fraction of variance each component explains.
# random_state is pinned because PCA's default svd_solver='auto' may choose the
# randomized solver for large inputs, which would make reruns differ.
pca_topt = PCA(n_components=2, random_state=0)
topt_pc = pca_topt.fit_transform(X_topt)
print('Topt:')
print(pca_topt.explained_variance_ratio_)

pca_ph = PCA(n_components=2, random_state=0)
ph_pc = pca_ph.fit_transform(X_ph)
print('pHopt:')
print(pca_ph.explained_variance_ratio_)

pca_tm = PCA(n_components=2, random_state=0)
tm_pc = pca_tm.fit_transform(X_tm)
print('Tm:')
print(pca_tm.explained_variance_ratio_)

Topt:
[0.08140898 0.05864385]
pHopt:
[0.06641301 0.04667219]
Tm:
[0.05993128 0.04868127]


In [None]:
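# Low explained variance: the first two PCs together capture only ~11-14% of the
# total variance in each dataset (see the ratios printed above), so a 2-D PCA
# projection retains little of the information in the selected SSFs.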