In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler


In [2]:
# Switch the working directory to the parent folder; the relative paths used by the
# loading cells that follow are resolved from there.
# Note: this is not idempotent - running the cell again moves up one more level.
%cd ..

/home/jan/misc/copenhagen-hack/optimal-ph/aux/model-playground


In [3]:
# Load the tab-separated table stored in data/1-mers.tsv and display it.
frequencies = pd.read_csv("data/1-mers.tsv", delimiter="\t")
frequencies

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.069395,0.008897,0.081851,0.055160,0.039146,0.083630,0.014235,0.062278,0.087189,0.072954,0.024911,0.069395,0.039146,0.017794,0.019573,0.058719,0.072954,0.042705,0.012456,0.067616
1,0.100739,0.003358,0.069174,0.046340,0.034923,0.089993,0.006716,0.040967,0.076561,0.058428,0.014103,0.061115,0.030893,0.024849,0.022834,0.069174,0.101410,0.083949,0.016790,0.047683
2,0.090598,0.008547,0.046154,0.015385,0.034188,0.099145,0.006838,0.058120,0.047863,0.068376,0.018803,0.061538,0.061538,0.022222,0.013675,0.102564,0.114530,0.059829,0.013675,0.056410
3,0.102216,0.000715,0.064332,0.038599,0.027162,0.071480,0.010007,0.060043,0.064332,0.075768,0.010722,0.054325,0.068620,0.018585,0.012866,0.105790,0.090064,0.075054,0.001430,0.047891
4,0.117509,0.008226,0.059929,0.023502,0.024677,0.088132,0.015276,0.041128,0.041128,0.057579,0.023502,0.070505,0.034078,0.029377,0.025852,0.108108,0.089307,0.065805,0.024677,0.051704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104996,0.125954,0.007634,0.061069,0.022901,0.061069,0.099237,0.019084,0.030534,0.034351,0.068702,0.026718,0.030534,0.053435,0.019084,0.099237,0.091603,0.053435,0.038168,0.019084,0.038168
104997,0.140044,0.000000,0.061269,0.070022,0.028446,0.089716,0.013129,0.037199,0.006565,0.096280,0.017505,0.008753,0.070022,0.021882,0.098468,0.045952,0.070022,0.098468,0.010941,0.015317
104998,0.086253,0.000000,0.075472,0.061995,0.035040,0.110512,0.013477,0.067385,0.032345,0.075472,0.024259,0.026954,0.059299,0.029650,0.043127,0.075472,0.078167,0.097035,0.002695,0.005391
104999,0.146667,0.015238,0.076190,0.043810,0.032381,0.102857,0.030476,0.064762,0.026667,0.074286,0.034286,0.030476,0.066667,0.038095,0.059048,0.041905,0.040000,0.051429,0.001905,0.022857


In [4]:
# Read the train/validation metadata tables; the first CSV column becomes the row index,
# which get_xy later uses to look up rows of the feature table.
train_meta, valid_meta = (
    pd.read_csv(csv_path, index_col=0) for csv_path in ("../train.csv", "../valid.csv")
)
train_meta

Unnamed: 0,mean_growth_PH,sequence,representative,is7
99973,7.40,MEANHGMNNYIKLAFVFGITTMATSYADTVAPPTLLTAQKLPQLQQ...,True,False
84793,7.00,MEFFKKTALAALVMGFSGAALALPNITILATGGTIAGGGDSATKSN...,False,True
27864,6.50,MSPLGILRRHRVAALLGAALIISPVVVSFAQSANSTGVSKIVATTQ...,True,False
46228,7.80,MKKQYWYVIITYVAMQLSSLVGVPLLAHSGFINASNKDIAISIASG...,False,False
6028,7.00,MSKKKMAITLSAMLSATIIPSFTMDVHAEKKEETKNTKIELENGMT...,False,True
...,...,...,...,...
69002,7.00,MEIIMRNLCFLLTLVATLLLHGRLIAAALPQDEKLITGQLDNGLRY...,False,True
75674,7.00,MSKHPKLLVLALACLACAGRASAAPASDEVARLAQRCAPDVSPLTM...,False,True
50377,7.45,MSRAGSLMLVLGTALWLCGCSGMNSENKRVAPVAEKRPHTMSLHGV...,True,False
87875,7.00,MSAGRLNKKSLGIVMLLSVGLLLAGCSGSKSSDTGTYSGSVYTVKR...,False,True


In [5]:
# Load the physico-chemical property table, discard its "ID" column and replace the
# index read from the file with a fresh 0..n-1 positional index.
phychem = (
    pd.read_csv("../../data/physchem/properties.csv", index_col=0)
    .drop(columns="ID")
    .reset_index(drop=True)
)
phychem

Unnamed: 0,isoelectricity,length,hydrophobicity,weight,aliphatic
0,4.860335,562,-0.510676,62824.47114,72.064057
1,5.032207,1489,-0.343586,160513.00714,73.183345
2,7.378459,585,-0.076068,61639.23834,75.743590
3,4.722769,1399,-0.150322,147316.17644,84.953538
4,5.476988,851,-0.269330,90706.29954,69.330200
...,...,...,...,...,...
104996,10.461661,262,-0.385496,28757.43674,62.366412
104997,4.959857,457,-0.029103,48805.45654,94.617068
104998,4.262404,371,-0.009164,38776.69354,92.479784
104999,5.295376,525,-0.120381,55572.00124,83.809524


In [19]:
# Load the comma-separated table stored in data/AK_LE_2.csv and display it.
gap_pairs = pd.read_csv("data/AK_LE_2.csv", sep=",")
gap_pairs

Unnamed: 0,AK_#,LE_#
0,0.000000,0.000000
1,0.000212,0.000133
2,0.000094,0.000040
3,0.000137,0.000041
4,0.000098,0.000043
...,...,...
104997,0.000261,0.000087
104998,0.000043,0.000325
104999,0.000181,0.000268
105000,0.000145,0.000120


In [20]:
# Combine the three feature tables column-wise, aligned on the row index.
# join="inner" keeps only the rows that exist in every table. gap_pairs has one row more
# (index 105000) than frequencies/phychem, and the previous hard-coded `[:105001]` slice
# did not remove it: that row stayed in `features` with missing values for all other
# columns (which is also why the integer `length` column was displayed as float).
# NOTE(review): gap_pairs starts with an all-zero row and has one extra row - confirm that
# it is not shifted by one position relative to the other two tables.
features = pd.concat([frequencies, phychem, gap_pairs], axis=1, join="inner")
features

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,V,W,Y,isoelectricity,length,hydrophobicity,weight,aliphatic,AK_#,LE_#
0,0.069395,0.008897,0.081851,0.055160,0.039146,0.083630,0.014235,0.062278,0.087189,0.072954,...,0.042705,0.012456,0.067616,4.860335,562.0,-0.510676,62824.47114,72.064057,0.000000,0.000000
1,0.100739,0.003358,0.069174,0.046340,0.034923,0.089993,0.006716,0.040967,0.076561,0.058428,...,0.083949,0.016790,0.047683,5.032207,1489.0,-0.343586,160513.00714,73.183345,0.000212,0.000133
2,0.090598,0.008547,0.046154,0.015385,0.034188,0.099145,0.006838,0.058120,0.047863,0.068376,...,0.059829,0.013675,0.056410,7.378459,585.0,-0.076068,61639.23834,75.743590,0.000094,0.000040
3,0.102216,0.000715,0.064332,0.038599,0.027162,0.071480,0.010007,0.060043,0.064332,0.075768,...,0.075054,0.001430,0.047891,4.722769,1399.0,-0.150322,147316.17644,84.953538,0.000137,0.000041
4,0.117509,0.008226,0.059929,0.023502,0.024677,0.088132,0.015276,0.041128,0.041128,0.057579,...,0.065805,0.024677,0.051704,5.476988,851.0,-0.269330,90706.29954,69.330200,0.000098,0.000043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104996,0.125954,0.007634,0.061069,0.022901,0.061069,0.099237,0.019084,0.030534,0.034351,0.068702,...,0.038168,0.019084,0.038168,10.461661,262.0,-0.385496,28757.43674,62.366412,0.000707,0.000111
104997,0.140044,0.000000,0.061269,0.070022,0.028446,0.089716,0.013129,0.037199,0.006565,0.096280,...,0.098468,0.010941,0.015317,4.959857,457.0,-0.029103,48805.45654,94.617068,0.000261,0.000087
104998,0.086253,0.000000,0.075472,0.061995,0.035040,0.110512,0.013477,0.067385,0.032345,0.075472,...,0.097035,0.002695,0.005391,4.262404,371.0,-0.009164,38776.69354,92.479784,0.000043,0.000325
104999,0.146667,0.015238,0.076190,0.043810,0.032381,0.102857,0.030476,0.064762,0.026667,0.074286,...,0.051429,0.001905,0.022857,5.295376,525.0,-0.120381,55572.00124,83.809524,0.000181,0.000268


In [21]:
def get_xy(metadata, features, only_repr = False, only_not7 = False):
    """Build the design matrix and target vector for the rows listed in ``metadata``.

    Parameters
    ----------
    metadata : DataFrame with a ``mean_growth_PH`` column; the boolean columns
        ``representative`` / ``is7`` are read only when the matching filter is enabled.
        Its index labels are used to select rows of ``features``.
    features : DataFrame whose index contains every label of the (filtered) metadata.
    only_repr : keep only rows where ``representative`` is True.
    only_not7 : keep only rows where ``is7`` is False.

    Returns
    -------
    (X, y) : ``X`` holds the selected feature rows and ``y`` the ``mean_growth_PH``
        values, both as NumPy arrays in the row order of the filtered metadata.
    """
    selected = metadata
    if only_repr:
        selected = selected[selected.representative]
    if only_not7:
        selected = selected[~selected.is7]

    # Label-based lookup: rows of `features` are matched by the metadata index.
    feature_rows = features.loc[selected.index]
    targets = selected.mean_growth_PH
    return feature_rows.to_numpy(), targets.to_numpy()

# Train/validation matrices restricted to rows whose `is7` flag is False.
X_train, y_train = get_xy(train_meta, features, only_not7=True)
X_valid, y_valid = get_xy(valid_meta, features, only_not7=True)

# Standardisation statistics come from the training rows only and are then applied
# unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_valid = scaler.transform(X_train), scaler.transform(X_valid)

In [22]:
# Shapes of the training design matrix and target vector.
(X_train.shape, y_train.shape)

((28168, 27), (28168,))

In [23]:
# Shapes of the validation design matrix and target vector.
(X_valid.shape, y_valid.shape)

((7042, 27), (7042,))

In [24]:
# Same splits as before, additionally restricted to rows flagged `representative`.
X_train_repr, y_train_repr = get_xy(train_meta, features, only_repr=True, only_not7=True)
X_valid_repr, y_valid_repr = get_xy(valid_meta, features, only_repr=True, only_not7=True)

# Reuse the already-fitted scaler so these subsets share the scaling of X_train / X_valid.
X_train_repr, X_valid_repr = map(scaler.transform, (X_train_repr, X_valid_repr))

(X_train_repr.shape, X_valid_repr.shape)

((14209, 27), (3552, 27))

In [26]:
# SVR candidates that differ only in the regularisation constant C.
models = [
#     ("SVR C0.1", SVR(C=0.1, epsilon=0.1)),
    ("SVR C1", SVR(C=1.0, epsilon=0.1)),
    ("SVR C10", SVR(C=10.0, epsilon=0.1)),
    ("SVR C50", SVR(C=50.0, epsilon=0.1)),
]

for name, model in models:
    # Each model is trained on the representative training subset only.
    model.fit(X_train_repr, y_train_repr)
    print("fit!")
    train_pred = model.predict(X_train_repr)
    valid_pred = model.predict(X_valid)
    valid_repr_pred = model.predict(X_valid_repr)

    print(">>", name)
    # One report line per evaluated split: (format string, predictions, targets).
    report_rows = [
        ("Train | Spearman {:.4f} RMSE {:.4f}", train_pred, y_train_repr),
        ("Valid | Spearman {:.4f} RMSE {:.4f}", valid_pred, y_valid),
        ("ValRe | Spearman {:.4f} RMSE {:.4f}", valid_repr_pred, y_valid_repr),
    ]
    for template, predicted, expected in report_rows:
        rank_corr = spearmanr(predicted, expected)[0]
        rmse = np.sqrt(mean_squared_error(predicted, expected))
        print(template.format(rank_corr, rmse))

fit!
>> SVR C1
Train | Spearman 0.6809 RMSE 0.9163
Valid | Spearman 0.5533 RMSE 0.9014
ValRe | Spearman 0.5405 RMSE 1.0121
fit!
>> SVR C10
Train | Spearman 0.8457 RMSE 0.6740
Valid | Spearman 0.5501 RMSE 0.8726
ValRe | Spearman 0.5033 RMSE 1.0179
fit!
>> SVR C50
Train | Spearman 0.9380 RMSE 0.4437
Valid | Spearman 0.4986 RMSE 0.9353
ValRe | Spearman 0.4314 RMSE 1.1114
