In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from scipy.stats import spearmanr
from sklearn.metrics import mean_squared_error


In [2]:
# Step up to the parent directory so the relative paths used by the loading
# cells ("data/1-mers.tsv", "../train.csv", "../valid.csv") resolve from there.
# NOTE(review): not idempotent — re-running this cell climbs one more level.
%cd ..

/home/jan/misc/copenhagen-hack/optimal-ph/aux/model-playground


In [3]:
# Tab-separated table of 1-mer frequencies; the rendered output shows twenty
# single-letter columns (A..Y) — presumably amino-acid fractions per sequence.
frequencies = pd.read_csv("data/1-mers.tsv", delimiter="\t")
frequencies

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,M,N,P,Q,R,S,T,V,W,Y
0,0.069395,0.008897,0.081851,0.055160,0.039146,0.083630,0.014235,0.062278,0.087189,0.072954,0.024911,0.069395,0.039146,0.017794,0.019573,0.058719,0.072954,0.042705,0.012456,0.067616
1,0.100739,0.003358,0.069174,0.046340,0.034923,0.089993,0.006716,0.040967,0.076561,0.058428,0.014103,0.061115,0.030893,0.024849,0.022834,0.069174,0.101410,0.083949,0.016790,0.047683
2,0.090598,0.008547,0.046154,0.015385,0.034188,0.099145,0.006838,0.058120,0.047863,0.068376,0.018803,0.061538,0.061538,0.022222,0.013675,0.102564,0.114530,0.059829,0.013675,0.056410
3,0.102216,0.000715,0.064332,0.038599,0.027162,0.071480,0.010007,0.060043,0.064332,0.075768,0.010722,0.054325,0.068620,0.018585,0.012866,0.105790,0.090064,0.075054,0.001430,0.047891
4,0.117509,0.008226,0.059929,0.023502,0.024677,0.088132,0.015276,0.041128,0.041128,0.057579,0.023502,0.070505,0.034078,0.029377,0.025852,0.108108,0.089307,0.065805,0.024677,0.051704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104996,0.125954,0.007634,0.061069,0.022901,0.061069,0.099237,0.019084,0.030534,0.034351,0.068702,0.026718,0.030534,0.053435,0.019084,0.099237,0.091603,0.053435,0.038168,0.019084,0.038168
104997,0.140044,0.000000,0.061269,0.070022,0.028446,0.089716,0.013129,0.037199,0.006565,0.096280,0.017505,0.008753,0.070022,0.021882,0.098468,0.045952,0.070022,0.098468,0.010941,0.015317
104998,0.086253,0.000000,0.075472,0.061995,0.035040,0.110512,0.013477,0.067385,0.032345,0.075472,0.024259,0.026954,0.059299,0.029650,0.043127,0.075472,0.078167,0.097035,0.002695,0.005391
104999,0.146667,0.015238,0.076190,0.043810,0.032381,0.102857,0.030476,0.064762,0.026667,0.074286,0.034286,0.030476,0.066667,0.038095,0.059048,0.041905,0.040000,0.051429,0.001905,0.022857


In [13]:
# Load both metadata splits the same way: the first CSV column becomes the
# row index, which is later used to look rows up in `frequencies`.
train_meta, valid_meta = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../train.csv", "../valid.csv")
)
train_meta

Unnamed: 0,mean_growth_PH,sequence,representative,is7
99973,7.40,MEANHGMNNYIKLAFVFGITTMATSYADTVAPPTLLTAQKLPQLQQ...,True,False
84793,7.00,MEFFKKTALAALVMGFSGAALALPNITILATGGTIAGGGDSATKSN...,False,True
27864,6.50,MSPLGILRRHRVAALLGAALIISPVVVSFAQSANSTGVSKIVATTQ...,True,False
46228,7.80,MKKQYWYVIITYVAMQLSSLVGVPLLAHSGFINASNKDIAISIASG...,False,False
6028,7.00,MSKKKMAITLSAMLSATIIPSFTMDVHAEKKEETKNTKIELENGMT...,False,True
...,...,...,...,...
69002,7.00,MEIIMRNLCFLLTLVATLLLHGRLIAAALPQDEKLITGQLDNGLRY...,False,True
75674,7.00,MSKHPKLLVLALACLACAGRASAAPASDEVARLAQRCAPDVSPLTM...,False,True
50377,7.45,MSRAGSLMLVLGTALWLCGCSGMNSENKRVAPVAEKRPHTMSLHGV...,True,False
87875,7.00,MSAGRLNKKSLGIVMLLSVGLLLAGCSGSKSSDTGTYSGSVYTVKR...,False,True


In [18]:
def get_xy(metadata, features, only_repr=False, only_not7=False):
    """Build a (features, target) pair of NumPy arrays for a metadata split.

    Parameters
    ----------
    metadata : DataFrame with `mean_growth_PH`, `representative` and `is7`
        columns; its index labels are used to look rows up in `features`.
    features : DataFrame whose index contains every label of `metadata`.
    only_repr : keep only rows where `representative` is true.
    only_not7 : drop rows where `is7` is true.

    Returns
    -------
    (X, y) : `X` holds the selected rows of `features`, `y` the matching
        `mean_growth_PH` values, both in the order of the filtered metadata.
    """
    selected = metadata
    if only_repr:
        selected = selected[selected["representative"]]
    if only_not7:
        selected = selected[~selected["is7"]]

    targets = selected["mean_growth_PH"].to_numpy()
    # Label-based lookup keeps the feature rows aligned with the targets.
    design = features.loc[selected.index].to_numpy()
    return design, targets

# Default train/validation arrays: rows flagged `is7` are dropped from both.
(X_train, y_train), (X_valid, y_valid) = (
    get_xy(meta, frequencies, only_not7=True)
    for meta in (train_meta, valid_meta)
)

In [21]:
# Sanity check: X_train and y_train should have the same number of rows.
X_train.shape, y_train.shape

((28168, 20), (28168,))

In [20]:
# Sanity check: X_valid and y_valid should have the same number of rows.
X_valid.shape, y_valid.shape

((7042, 20), (7042,))

In [78]:
# Representative-only variants of both splits (rows flagged `is7` are still
# dropped), followed by a quick look at how many rows survive the filter.
X_train_repr, y_train_repr = get_xy(
    train_meta, frequencies, only_repr=True, only_not7=True
)
X_valid_repr, y_valid_repr = get_xy(
    valid_meta, frequencies, only_repr=True, only_not7=True
)

(X_train_repr.shape, X_valid_repr.shape)

((14209, 20), (3552, 20))

In [79]:
# Linear and shallow-forest baselines fitted on the full filtered training set
# (X_train / y_train) and scored on the training split, the validation split
# and the representative-only validation split.
# 31415 is the fixed seed that keeps the fits reproducible.
models = [
    ("Ridge a0.5", Ridge(alpha=0.5, random_state=31415)),
    ("Ridge a1", Ridge(alpha=1.0, random_state=31415)),
    ("Ridge a2", Ridge(alpha=2.0, random_state=31415)),
    ("RandomForest d2", RandomForestRegressor(max_depth=2, random_state=31415)),
    ("RandomForest d3", RandomForestRegressor(max_depth=3, random_state=31415)),
    ("RandomForest d4", RandomForestRegressor(max_depth=4, random_state=31415)),
]

for name, model in models:
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    valid_pred = model.predict(X_valid)
    valid_repr_pred = model.predict(X_valid_repr)

    print(">>", name)
    # One report line per split instead of three copy-pasted print calls.
    for split_name, split_true, split_pred in (
        ("Train", y_train, train_pred),
        ("Valid", y_valid, valid_pred),
        ("ValRe", y_valid_repr, valid_repr_pred),
    ):
        rho = spearmanr(split_pred, split_true)[0]
        # scikit-learn metrics expect (y_true, y_pred), in that order.
        rmse = np.sqrt(mean_squared_error(split_true, split_pred))
        print("{} | Spearman {:.4f} RMSE {:.4f}".format(split_name, rho, rmse))

>> Ridge a0.5
Train | Spearman 0.2987 RMSE 1.0255
Valid | Spearman 0.2786 RMSE 1.0118
ValRe | Spearman 0.3202 RMSE 1.1250
>> Ridge a1
Train | Spearman 0.2978 RMSE 1.0262
Valid | Spearman 0.2778 RMSE 1.0120
ValRe | Spearman 0.3216 RMSE 1.1257
>> Ridge a2
Train | Spearman 0.2956 RMSE 1.0280
Valid | Spearman 0.2756 RMSE 1.0129
ValRe | Spearman 0.3222 RMSE 1.1277
>> RandomForest d2
Train | Spearman 0.3021 RMSE 1.0391
Valid | Spearman 0.2926 RMSE 1.0225
ValRe | Spearman 0.3083 RMSE 1.1460
>> RandomForest d3
Train | Spearman 0.3310 RMSE 1.0148
Valid | Spearman 0.3159 RMSE 1.0034
ValRe | Spearman 0.3334 RMSE 1.1263
>> RandomForest d4
Train | Spearman 0.3512 RMSE 0.9917
Valid | Spearman 0.3316 RMSE 0.9863
ValRe | Spearman 0.3393 RMSE 1.1118


In [80]:
# Same baselines as before, but fitted on the representative-only training
# set (X_train_repr / y_train_repr). Validation is reported on both the full
# validation split and its representative-only subset.
# 31415 is the fixed seed that keeps the fits reproducible.
models = [
    ("Ridge a0.5", Ridge(alpha=0.5, random_state=31415)),
    ("Ridge a1", Ridge(alpha=1.0, random_state=31415)),
    ("Ridge a2", Ridge(alpha=2.0, random_state=31415)),
    ("RandomForest d2", RandomForestRegressor(max_depth=2, random_state=31415)),
    ("RandomForest d3", RandomForestRegressor(max_depth=3, random_state=31415)),
    ("RandomForest d4", RandomForestRegressor(max_depth=4, random_state=31415)),
]

for name, model in models:
    model.fit(X_train_repr, y_train_repr)
    train_pred = model.predict(X_train_repr)
    valid_pred = model.predict(X_valid)
    valid_repr_pred = model.predict(X_valid_repr)

    print(">>", name)
    # One report line per split instead of three copy-pasted print calls.
    for split_name, split_true, split_pred in (
        ("Train", y_train_repr, train_pred),
        ("Valid", y_valid, valid_pred),
        ("ValRe", y_valid_repr, valid_repr_pred),
    ):
        rho = spearmanr(split_pred, split_true)[0]
        # scikit-learn metrics expect (y_true, y_pred), in that order.
        rmse = np.sqrt(mean_squared_error(split_true, split_pred))
        print("{} | Spearman {:.4f} RMSE {:.4f}".format(split_name, rho, rmse))

>> Ridge a0.5
Train | Spearman 0.3351 RMSE 1.1216
Valid | Spearman 0.2539 RMSE 1.0259
ValRe | Spearman 0.3251 RMSE 1.1134
>> Ridge a1
Train | Spearman 0.3336 RMSE 1.1233
Valid | Spearman 0.2517 RMSE 1.0251
ValRe | Spearman 0.3245 RMSE 1.1147
>> Ridge a2
Train | Spearman 0.3305 RMSE 1.1277
Valid | Spearman 0.2473 RMSE 1.0256
ValRe | Spearman 0.3222 RMSE 1.1185
>> RandomForest d2
Train | Spearman 0.3284 RMSE 1.1380
Valid | Spearman 0.2893 RMSE 1.0357
ValRe | Spearman 0.3209 RMSE 1.1359
>> RandomForest d3
Train | Spearman 0.3493 RMSE 1.1174
Valid | Spearman 0.3100 RMSE 1.0200
ValRe | Spearman 0.3384 RMSE 1.1189
>> RandomForest d4
Train | Spearman 0.3737 RMSE 1.0949
Valid | Spearman 0.3066 RMSE 1.0064
ValRe | Spearman 0.3419 RMSE 1.1068


In [81]:
# Sweep the SVR regularisation strength C (epsilon fixed at 0.1), training on
# the representative-only set and scoring on train, validation and the
# representative-only validation split.
models = [
    ("SVR C0.1", SVR(C=0.1, epsilon=0.1)),
    ("SVR C1", SVR(C=1.0, epsilon=0.1)),
    ("SVR C10", SVR(C=10.0, epsilon=0.1)),
    ("SVR C100", SVR(C=100.0, epsilon=0.1)),
]

for name, model in models:
    model.fit(X_train_repr, y_train_repr)
    # Progress marker printed as soon as the fit itself has finished.
    print("fit!")
    train_pred = model.predict(X_train_repr)
    valid_pred = model.predict(X_valid)
    valid_repr_pred = model.predict(X_valid_repr)

    print(">>", name)
    # One report line per split instead of three copy-pasted print calls.
    for split_name, split_true, split_pred in (
        ("Train", y_train_repr, train_pred),
        ("Valid", y_valid, valid_pred),
        ("ValRe", y_valid_repr, valid_repr_pred),
    ):
        rho = spearmanr(split_pred, split_true)[0]
        # scikit-learn metrics expect (y_true, y_pred), in that order.
        rmse = np.sqrt(mean_squared_error(split_true, split_pred))
        print("{} | Spearman {:.4f} RMSE {:.4f}".format(split_name, rho, rmse))

fit!
>> SVR C0.1
Train | Spearman 0.4446 RMSE 1.0917
Valid | Spearman 0.4157 RMSE 1.0047
ValRe | Spearman 0.4371 RMSE 1.0914
fit!
>> SVR C1
Train | Spearman 0.5089 RMSE 1.0345
Valid | Spearman 0.4671 RMSE 0.9592
ValRe | Spearman 0.4710 RMSE 1.0570
fit!
>> SVR C10
Train | Spearman 0.5955 RMSE 0.9634
Valid | Spearman 0.4919 RMSE 0.9310
ValRe | Spearman 0.4744 RMSE 1.0409
fit!
>> SVR C100
Train | Spearman 0.7075 RMSE 0.8411
Valid | Spearman 0.4995 RMSE 0.9147
ValRe | Spearman 0.4543 RMSE 1.0545


In [70]:
# Sweep the SVR epsilon-tube width (C fixed at 1.0), training on the
# representative-only set and scoring on train and validation.
models = [
    ("SVR e0.05", SVR(C=1.0, epsilon=0.05)),
    ("SVR e0.1", SVR(C=1.0, epsilon=0.1)),
    ("SVR e0.2", SVR(C=1.0, epsilon=0.2)),
]

for name, model in models:
    model.fit(X_train_repr, y_train_repr)
    # Progress marker printed as soon as the fit itself has finished.
    print("fit!")
    train_pred = model.predict(X_train_repr)
    valid_pred = model.predict(X_valid)

    print(">>", name)
    # One report line per split instead of copy-pasted print calls.
    for split_name, split_true, split_pred in (
        ("Train", y_train_repr, train_pred),
        ("Valid", y_valid, valid_pred),
    ):
        rho = spearmanr(split_pred, split_true)[0]
        # scikit-learn metrics expect (y_true, y_pred), in that order.
        rmse = np.sqrt(mean_squared_error(split_true, split_pred))
        print("{} | Spearman {:.4f} RMSE {:.4f}".format(split_name, rho, rmse))

fit!
>> SVR e0.05
Train | Spearman 0.5095 RMSE 1.0346
Valid | Spearman 0.4687 RMSE 0.9593
fit!
>> SVR e0.1
Train | Spearman 0.5089 RMSE 1.0345
Valid | Spearman 0.4671 RMSE 0.9592
fit!
>> SVR e0.2
Train | Spearman 0.5101 RMSE 1.0349
Valid | Spearman 0.4656 RMSE 0.9592


# Including the pH 7 samples in training doesn't help

In [58]:
# Training set that keeps the rows flagged `is7` (no filtering at all).
X_train_w7, y_train_w7 = get_xy(train_meta, frequencies, only_not7=False)

# Same baselines as in the earlier cells; validation still uses X_valid, so
# the scores are comparable with the runs trained on the filtered set.
# 31415 is the fixed seed that keeps the fits reproducible.
models = [
    ("Ridge a0.5", Ridge(alpha=0.5, random_state=31415)),
    ("Ridge a1", Ridge(alpha=1.0, random_state=31415)),
    ("Ridge a2", Ridge(alpha=2.0, random_state=31415)),
    ("RandomForest d2", RandomForestRegressor(max_depth=2, random_state=31415)),
    ("RandomForest d3", RandomForestRegressor(max_depth=3, random_state=31415)),
    ("RandomForest d4", RandomForestRegressor(max_depth=4, random_state=31415)),
]

for name, model in models:
    model.fit(X_train_w7, y_train_w7)
    train_pred = model.predict(X_train_w7)
    valid_pred = model.predict(X_valid)

    print(">>", name)
    # One report line per split instead of copy-pasted print calls.
    for split_name, split_true, split_pred in (
        ("Train", y_train_w7, train_pred),
        ("Valid", y_valid, valid_pred),
    ):
        rho = spearmanr(split_pred, split_true)[0]
        # scikit-learn metrics expect (y_true, y_pred), in that order.
        rmse = np.sqrt(mean_squared_error(split_true, split_pred))
        print("{} | Spearman {:.4f} RMSE {:.4f}".format(split_name, rho, rmse))

>> Ridge a0.5
Train | Spearman 0.1514 RMSE 0.6234
Valid | Spearman 0.2594 RMSE 1.0400
>> Ridge a1
Train | Spearman 0.1510 RMSE 0.6234
Valid | Spearman 0.2592 RMSE 1.0406
>> Ridge a2
Train | Spearman 0.1504 RMSE 0.6235
Valid | Spearman 0.2586 RMSE 1.0417
>> RandomForest d2
Train | Spearman 0.1096 RMSE 0.6265
Valid | Spearman 0.2748 RMSE 1.0472
>> RandomForest d3
Train | Spearman 0.1194 RMSE 0.6163
Valid | Spearman 0.2952 RMSE 1.0305
>> RandomForest d4
Train | Spearman 0.1300 RMSE 0.6030
Valid | Spearman 0.3132 RMSE 1.0113
