In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, LeaveOneOut, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

# Read in the data

In [2]:
# Relative path to the input CSV; it is resolved against the kernel's current
# working directory, so the notebook must be started next to the file.
datafile = "./rosetta_data_for_regression_avg.csv"
# Load the whole table into memory. Later cells read the label column
# (`variant_label`), the target column (`exp_ddG`) and the feature columns from `df`.
df = pd.read_csv(datafile)

In [3]:
# Per-row labels taken from the `variant_label` column.
# Bracket access is used instead of attribute access so the lookup cannot be
# shadowed by a DataFrame attribute or method of the same name.
mutant_names = df["variant_label"].to_numpy()

# Feature matrix: the first 18 columns of the table, selected by position.
# NOTE(review): this assumes neither `variant_label` nor `exp_ddG` is among the
# first 18 columns -- confirm against the CSV header, otherwise the target (or a
# non-numeric label) ends up inside X.
X = df.iloc[:, :18].to_numpy()

# Regression target taken from the `exp_ddG` column.
y = df["exp_ddG"].to_numpy()

# Models

1. Try fitting with Columns 6-18
2. Try fitting with Columns 1-18
3. Try fitting with Columns 1-5 
4. Try any other automated feature selection you want to test (excluding any number of features to get the best fit). Afterwards we can look at the terms that were included, or at their relative importance values, and try to make sense of them.


## Linear Model

In [4]:
# Stages of the linear pipeline, in execution order: min-max feature scaling
# followed by an ElasticNet regressor with regularisation strength alpha=0.008.
steps = [
    ('scaler', MinMaxScaler()),
    ('reg', ElasticNet(alpha=0.008)),
]

In [5]:
# Chain the stages listed in `steps` into a single estimator, so that the scaler
# is fitted only on the rows passed to `fit` and re-applied on `predict`.
pipe = Pipeline(steps=steps)

In [6]:
# Leave-one-out evaluation of `pipe`: every row is held out once, the pipeline is
# refitted on the remaining rows, and the single held-out prediction is collected.
cv = LeaveOneOut()

# Held-out targets and their predictions, pooled over all splits.
y_true = []
y_pred = []

# `model` is only another name for `pipe`; each fit() call below refits that
# same pipeline object on the current training rows.
model = pipe

for train_idx, test_idx in cv.split(X):
    # Rows used for fitting in this split
    X_train = X[train_idx]
    y_train = y[train_idx]
    # The held-out row(s) of this split
    X_test = X[test_idx]
    y_test = y[test_idx]

    model.fit(X_train, y_train)
    yhat = model.predict(X_test)

    # Each leave-one-out test set holds exactly one sample, hence element 0.
    y_true.append(y_test[0])
    y_pred.append(yhat[0])

# Metrics are computed once over the pooled held-out predictions, not per split.
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print('MSE: %.3f' % mse)
print('R2: %.3f' % r2)

MSE: 7.461
R2: 0.099


## Non-linear Model

In [7]:
from sklearn.neural_network import MLPRegressor

In [8]:
# Stages of the non-linear pipeline: standardise the features, then fit a
# multi-layer perceptron with three hidden layers (13, 7 and 5 units), logistic
# activations and plain SGD, capped at 500 iterations.
# `random_state` is pinned because the MLP's weight initialisation and SGD batch
# sampling are random; without it the printed MSE/R2 change on every
# Restart & Run All.
steps = [
    ('scaler', StandardScaler()),
    ('reg', MLPRegressor(
        activation='logistic',
        hidden_layer_sizes=(13, 7, 5),
        solver='sgd',
        max_iter=500,
        random_state=0,
    )),
]
# Rebinds `pipe`: from here on it refers to the MLP pipeline, not the linear one.
pipe = Pipeline(steps)

In [9]:
# 5-fold evaluation of `pipe`. No `shuffle` argument is passed, so the
# splitter's default row ordering is used.
cv = KFold(n_splits=5)

# Held-out targets and their predictions, pooled over all five folds.
y_true = []
y_pred = []

# `model` is only another name for `pipe`; each fit() call below refits that
# same pipeline object on the current training rows.
model = pipe

for train_idx, test_idx in cv.split(X):
    # Rows used for fitting in this fold
    X_train = X[train_idx]
    y_train = y[train_idx]
    # Rows held out for evaluation in this fold
    X_test = X[test_idx]
    y_test = y[test_idx]

    model.fit(X_train, y_train)
    yhat = model.predict(X_test)

    # A fold contains several rows, so extend (not append) the pooled lists.
    y_true.extend(y_test)
    y_pred.extend(yhat)

# Metrics are computed once over the pooled out-of-fold predictions.
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print('MSE: %.3f' % mse)
print('R2: %.3f' % r2)

MSE: 8.756
R2: -0.057
