In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas
import gensim

import matplotlib.pyplot as plt
import matplotlib

try:
    # scikit-learn >= 0.18 provides the splitter in model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only ship the legacy cross_validation module.
    from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

import numpy as np

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout

matplotlib.style.use('ggplot')

Using Theano backend.


## 1. Read data

In [3]:
# Measurement table; later cells read its 'mhc', 'peptide_length',
# 'sequence' and 'meas' columns.
data_file = "../data/bdata.log.txt"

# A comma is read_csv's default separator, so it is not passed explicitly.
data = pandas.read_csv(data_file)

In [4]:
feats_file = "../data/kidera.txt"

# Tab-separated table, transposed so that every original row becomes a column.
feats = pandas.read_csv(feats_file, sep='\t').transpose()

# Lookup table used by seq2vec: the first entry of each column is the key,
# the remaining entries of that column are the value.
# Only the columns labelled 0..19 are used.
d = {feats[col][0]: feats[col][1:] for col in range(20)}
    
def seq2vec(seq):
    """Encode a sequence as one flat float vector.

    Each element of ``seq`` is looked up in the module-level table ``d``;
    the per-element value lists are stacked in sequence order and flattened
    into a 1-D array.

    Parameters
    ----------
    seq : iterable
        Elements must all be keys of ``d`` (a missing key raises KeyError).

    Returns
    -------
    numpy.ndarray
        1-D float array holding the concatenated values of every element.
    """
    per_element = [list(d[element]) for element in seq]
    return np.array(per_element, dtype=float).flatten()

## 2. Prepare data subset

In [5]:
selected_mhc = 'HLA-A*03:01'

# Build both filters as one aligned boolean mask.  This avoids feeding index
# labels to the positional indexer (.iloc) and avoids filtering a subset with
# a full-length, unaligned boolean Series; both only worked when `data`
# carried a default 0..n-1 index.
subset_mask = (data['mhc'] == selected_mhc) & (data['peptide_length'] == 9)
indexes = data[subset_mask].index

# One encoded row per selected peptide (seq2vec returns a flat float vector).
selected_X = pandas.DataFrame(list(data.loc[indexes, 'sequence'].apply(seq2vec)))

# Targets keep the original row labels of `data`.
selected_y = data.loc[indexes, 'meas']

selected_X.shape

(5231, 90)

## 3. Ridge linear regression

In [6]:
# Optional target normalisation (currently disabled).
# NOTE: if re-enabled as written, the assignment to `d` below would rebind
# the lookup table that seq2vec reads -- rename it before uncommenting.
# m = selected_y.mean()
# d = selected_y.max() - selected_y.min()
# selected_y = (selected_y - m) / d

# Seed NumPy's global RNG; the split itself is pinned by random_state below.
np.random.seed(42)
random_number = 122222

# Hold out 33% of the selected rows as the test set.
split_parts = train_test_split(selected_X, selected_y,
                               test_size=0.33, random_state=random_number)
X_train, X_test, y_train, y_test = split_parts

In [7]:
def ridge_regression(X, y, alpha):
    """Fit a ridge model and score it on the data it was fitted on.

    Parameters
    ----------
    X, y
        Features and targets passed to ``Ridge.fit``.
    alpha
        Regularisation strength passed to ``Ridge``.

    Returns
    -------
    tuple
        ``(rss, intercept, model)``: the residual sum of squares of the
        zero-clipped predictions on ``X``, the fitted intercept, and the
        fitted ``Ridge`` estimator itself.
    """
    model = Ridge(alpha=alpha, normalize=True)
    model.fit(X, y)

    # Predictions are floored at zero before the residuals are computed.
    fitted = model.predict(X).clip(min=0)
    residuals = fitted - y

    return sum(residuals ** 2), model.intercept_, model

In [8]:
# Regularisation strengths to sweep over.
alpha_ridge = [0.001, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]

# One entry per alpha, in the same order as alpha_ridge.  The lists are grown
# from the alpha list itself, so changing alpha_ridge needs no matching edit
# of a hard-coded length.
rss, intercept, predictors = [], [], []
for alpha in alpha_ridge:
    alpha_rss, alpha_intercept, alpha_model = ridge_regression(X_train, y_train, alpha)
    rss.append(alpha_rss)              # training RSS
    intercept.append(alpha_intercept)  # fitted intercept
    predictors.append(alpha_model)     # fitted Ridge estimator

In [9]:
# predict test data

test_rss = [0] * 10
for i in range(10):
    m_pred = predictors[i]
    test_pred = m_pred.predict(X_test).clip(min = 0)
    test_rss[i] = sum((test_pred - y_test)**2)
    
    print '-' * 10
    print "alpha = ", alpha_ridge[i]
    print "train rss = ", rss[i]
    print "test rss = ", test_rss[i]


m_pred = predictors[0]
t_pred = m_pred.predict(X_test)

----------
alpha =  0.001
train rss =  104.759488394
test rss =  54.9711946689
----------
alpha =  0.02
train rss =  105.149613111
test rss =  55.1434977264
----------
alpha =  0.03
train rss =  105.368670695
test rss =  55.2401487582
----------
alpha =  0.04
train rss =  105.593057141
test rss =  55.3377607594
----------
alpha =  0.05
train rss =  105.820310915
test rss =  55.4350581046
----------
alpha =  0.06
train rss =  106.049201559
test rss =  55.533919791
----------
alpha =  0.07
train rss =  106.280048234
test rss =  55.6350379361
----------
alpha =  0.08
train rss =  106.512461634
test rss =  55.734073753
----------
alpha =  0.09
train rss =  106.746590582
test rss =  55.8335095624
----------
alpha =  0.1
train rss =  106.981974536
test rss =  55.9342922106


In [10]:
# Wrap the ridge test predictions in a frame and floor negative values at 0
# before summarising their distribution.
t_pred = pandas.DataFrame(t_pred).clip(lower=0)
t_pred.describe()

Unnamed: 0,0
count,1727.0
mean,0.25277
std,0.176585
min,0.0
25%,0.108666
50%,0.229794
75%,0.392818
max,0.706367


In [11]:
# Convert the pandas objects to plain NumPy arrays for Keras.
# np.asarray replaces the deprecated .as_matrix() accessor and also accepts
# arrays, so re-running this cell no longer fails on the already-converted
# variables.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

# Targets as 1-D arrays.
y_train = np.asarray(y_train).flatten()
y_test = np.asarray(y_test).flatten()

In [12]:
# Network hyper-parameters.
n_features = X_train.shape[1]
hidden_units = 16
dropout_rate = 0.15

# Two hidden blocks (Dense -> ReLU -> Dropout) followed by a single linear
# output unit for the regression target.
model = Sequential()

model.add(Dense(hidden_units, input_dim=n_features))
model.add(Activation('relu'))
model.add(Dropout(dropout_rate))

model.add(Dense(hidden_units))
model.add(Activation('relu'))
model.add(Dropout(dropout_rate))

model.add(Dense(1))

# Mean squared error loss, optimised with RMSprop.
model.compile(loss='mse', optimizer='rmsprop')

In [13]:
# Training options; validation_split holds out 15% of the training rows
# for the per-epoch val_loss.
fit_options = dict(batch_size=32, nb_epoch=500, verbose=2, validation_split=0.15)
model.fit(X_train, y_train, **fit_options)

Train on 2978 samples, validate on 526 samples
Epoch 1/500
0s - loss: 0.1951 - val_loss: 0.0759
Epoch 2/500
0s - loss: 0.0920 - val_loss: 0.0612
Epoch 3/500
0s - loss: 0.0664 - val_loss: 0.0529
Epoch 4/500
0s - loss: 0.0540 - val_loss: 0.0454
Epoch 5/500
0s - loss: 0.0464 - val_loss: 0.0415
Epoch 6/500
0s - loss: 0.0410 - val_loss: 0.0388
Epoch 7/500
0s - loss: 0.0390 - val_loss: 0.0340
Epoch 8/500
0s - loss: 0.0360 - val_loss: 0.0320
Epoch 9/500
0s - loss: 0.0348 - val_loss: 0.0332
Epoch 10/500
0s - loss: 0.0330 - val_loss: 0.0324
Epoch 11/500
0s - loss: 0.0332 - val_loss: 0.0318
Epoch 12/500
0s - loss: 0.0318 - val_loss: 0.0303
Epoch 13/500
0s - loss: 0.0297 - val_loss: 0.0305
Epoch 14/500
0s - loss: 0.0289 - val_loss: 0.0300
Epoch 15/500
0s - loss: 0.0284 - val_loss: 0.0294
Epoch 16/500
0s - loss: 0.0281 - val_loss: 0.0292
Epoch 17/500
0s - loss: 0.0276 - val_loss: 0.0278
Epoch 18/500
0s - loss: 0.0272 - val_loss: 0.0289
Epoch 19/500
0s - loss: 0.0267 - val_loss: 0.0284
Epoch 20/500

<keras.callbacks.History at 0x10e72f810>

In [14]:
# Network predictions for the test split, flattened to 1-D to line up
# with y_test.
raw_preds = model.predict(X_test, verbose=0)
preds = raw_preds.flatten()

In [15]:
# Residual sum of squares of the network on the test split.
# NOTE(review): unlike the ridge evaluation, predictions are not clipped
# at 0 here, so the two RSS figures are not computed identically.
nn_residuals = preds - y_test
sum(nn_residuals ** 2)

59.227897196611437