In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas
import gensim

import matplotlib.pyplot as plt
import matplotlib

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing

import numpy as np

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout

# Use the ggplot look for every figure drawn in this notebook.
plt.style.use('ggplot')

Using Theano backend.


## 1. Read sequences and train the word2vec model

In [3]:
# Corpus used to train the word2vec embeddings; it is read by LineSentence in
# the next cell. Presumably one sequence per line with space-separated
# residues (suggested by the file name) -- TODO confirm.
seq_data_file = "../data/sequences_spaces.txt"

In [4]:
# Stream the training corpus from disk instead of loading it into memory.
sentences = gensim.models.word2vec.LineSentence(seq_data_file)

# Width of each token embedding; seq2vec() concatenates one such vector per
# sequence position.
w2v_dim = 20
model = gensim.models.Word2Vec(
    sentences,
    size=w2v_dim,
    window=4,
    workers=4,
    min_count=10,
)

In [5]:
def seq2vec(seq):
    """Encode a sequence as one flat vector of per-character embeddings.

    The sequence is split into single characters, each character is looked up
    in the global word2vec ``model``, and the resulting rows are concatenated
    into a 1-D array (one embedding per character, in order).

    NOTE: this reads the global name ``model`` at call time. The neural-network
    section of this notebook rebinds ``model`` to a Keras network, so this
    function must be called before that cell runs.
    """
    tokens = list(seq)
    embeddings = model[tokens]
    return embeddings.flatten()

## 2. Read and prepare data

In [6]:
# Comma-separated measurement table. Later cells read its 'mhc',
# 'peptide_length', 'sequence' and 'meas' columns.
data_file = "../data/bdata.log.txt"
data = pandas.read_csv(data_file, sep=',')

In [7]:
selected_mhc = 'HLA-A*03:01'
selected_length = 9

# Build a single boolean mask over the full frame. The previous version passed
# index labels to .iloc (which is positional) and then filtered the subset
# with a full-length boolean Series, which only worked through implicit index
# re-alignment on a default RangeIndex.
is_selected = (data['mhc'] == selected_mhc) & (data['peptide_length'] == selected_length)
indexes = data[is_selected].index

# One embedded row per selected peptide. selected_X gets a fresh 0..n-1 index,
# while selected_y keeps the original row labels of `data`.
selected_X = pandas.DataFrame(list(data.loc[indexes, 'sequence'].apply(seq2vec)))
selected_y = data.loc[indexes, 'meas']

selected_X.shape

(5231, 180)

## 3. Fit ridge regression

In [8]:
# Target normalization (currently disabled):
# m = selected_y.mean()
# d = selected_y.max() - selected_y.min()
# selected_y = (selected_y - m) / d

# Seed NumPy's global RNG; the split itself is pinned separately through
# random_state below.
np.random.seed(42)
random_number = 122222

# Hold out 33% of the selected peptides as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    selected_X,
    selected_y,
    test_size=0.33,
    random_state=random_number,
)

In [9]:
def ridge_regression(X, y, alpha):
    """Fit a ridge model and score it on the data it was fitted to.

    Parameters
    ----------
    X : feature matrix passed straight to ``Ridge.fit`` / ``Ridge.predict``.
    y : targets aligned with the rows of ``X``.
    alpha : regularization strength forwarded to ``Ridge``.

    Returns
    -------
    (rss, intercept, fitted_model)
        ``rss`` is the residual sum of squares between ``y`` and the model's
        predictions on ``X`` after negative predictions are clipped to 0;
        ``intercept`` is the fitted ``intercept_``; ``fitted_model`` is the
        estimator itself so callers can predict on other data.
    """
    fitted = Ridge(alpha=alpha, normalize=True)
    fitted.fit(X, y)

    # Negative predictions are floored at zero before scoring.
    clipped = fitted.predict(X).clip(min=0)
    residuals = clipped - y

    return sum(residuals ** 2), fitted.intercept_, fitted

In [10]:
alpha_ridge = [0.001, 0.02, 0.03, 0.04, 0.05,0.06, 0.07, 0.08, 0.09, 0.1]

# Fit one ridge model per alpha. The result lists are built from alpha_ridge
# itself, so editing the grid above cannot leave them out of sync (the loop
# bounds and list sizes used to be a hard-coded 10).
# rss[i], intercept[i] and predictors[i] all correspond to alpha_ridge[i].
rss = []
intercept = []
predictors = []

for alpha in alpha_ridge:
    alpha_rss, alpha_intercept, alpha_model = ridge_regression(X_train, y_train, alpha)
    rss.append(alpha_rss)
    intercept.append(alpha_intercept)
    predictors.append(alpha_model)

In [11]:
# predict test data
#
# Score every fitted model on the held-out split, clipping negative
# predictions to zero exactly as ridge_regression() does for the train RSS.
# Iterating over the result lists (rather than range(10)) keeps this cell
# correct if the alpha grid changes.
test_rss = []
for alpha, train_rss, m_pred in zip(alpha_ridge, rss, predictors):
    test_pred = m_pred.predict(X_test).clip(min = 0)
    test_rss.append(sum((test_pred - y_test)**2))

    # Single-argument print(...) calls emit the same text on Python 2 and 3;
    # the double space after '=' reproduces the original output format.
    print('-' * 10)
    print("alpha =  %s" % (alpha,))
    print("train rss =  %s" % (train_rss,))
    print("test rss =  %s" % (test_rss[-1],))


# Keep the unclipped test predictions of the first model (alpha_ridge[0]) for
# the inspection cells that follow.
m_pred = predictors[0]
t_pred = m_pred.predict(X_test)

----------
alpha =  0.001
train rss =  84.6723914115
test rss =  45.1025293294
----------
alpha =  0.02
train rss =  87.0779610408
test rss =  46.1144116012
----------
alpha =  0.03
train rss =  87.8928301128
test rss =  46.4858464899
----------
alpha =  0.04
train rss =  88.5999698223
test rss =  46.8123982348
----------
alpha =  0.05
train rss =  89.2346915781
test rss =  47.109615158
----------
alpha =  0.06
train rss =  89.8225503023
test rss =  47.3812404508
----------
alpha =  0.07
train rss =  90.3743820766
test rss =  47.6369403767
----------
alpha =  0.08
train rss =  90.8945031123
test rss =  47.8791256244
----------
alpha =  0.09
train rss =  91.3901283189
test rss =  48.1083877819
----------
alpha =  0.1
train rss =  91.8661487291
test rss =  48.329127813


In [12]:
# Wrap the raw predictions in a one-column frame, floor negative values at
# zero, then summarize their distribution.
t_pred = pandas.DataFrame(t_pred).clip(lower=0)
t_pred.describe()

Unnamed: 0,0
count,1727.0
mean,0.253492
std,0.190096
min,0.0
25%,0.088059
50%,0.227959
75%,0.410966
max,0.758167


In [13]:
# t_pred is already a DataFrame if the previous cell has run, so this only
# re-wraps it; it matters only when this cell is run straight after the
# prediction cell.
t_pred = pandas.DataFrame(t_pred)

and so on...

In [14]:
# Rebinds y_test from the object returned by train_test_split to a DataFrame.
# Later cells (the column assignment below and the matrix conversion in the
# NN section) operate on this rebound value.
y_test = pandas.DataFrame(y_test)

In [15]:
# Put the measured values next to the predictions. np.array() drops y_test's
# index, so rows are paired by position rather than by label. Note that the
# new column is named with the string '1', not the integer 1.
t_pred['1'] = np.array(y_test)

In [16]:
# Correlation matrix between the prediction column (0) and the measured
# values (column '1'); the off-diagonal entry is the figure of interest.
t_pred.corr()

Unnamed: 0,0,1
0,1.0,0.775075
1,0.775075,1.0


## 4. Train a neural network

In [17]:
# Convert the pandas objects to plain NumPy arrays for Keras.
# np.asarray yields the same arrays as .as_matrix() but also accepts inputs
# that are already ndarrays, so this cell can be re-run safely. The previous
# version rebound these names to ndarrays and then failed on the second run
# because ndarrays have no .as_matrix().
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)

# Targets are flattened to 1-D; y_test may be a one-column DataFrame by now.
y_train = np.asarray(y_train).flatten()
y_test = np.asarray(y_test).flatten()

In [26]:
# NOTE: this rebinds the global name `model`, which until now referred to the
# word2vec model that seq2vec() reads. After this cell, seq2vec() would index
# the Keras network instead, so the data-preparation cells cannot be re-run
# without retraining word2vec first.
model = Sequential()
# Hidden block 1: 128 units -> ReLU -> dropout 0.15. The input width is taken
# from the training matrix.
model.add(Dense(128, input_dim=X_train.shape[1]))
model.add(Activation('relu'))
model.add(Dropout(0.15))
# Hidden block 2: same shape as block 1.
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.15))
# Single output unit for the regression target; no activation layer follows.
model.add(Dense(1))

# Mean-squared-error loss with the RMSprop optimizer.
model.compile(loss='mse', optimizer='rmsprop')

In [27]:
# Train for up to 500 epochs in mini-batches of 32. validation_split holds out
# 15% of the training rows for validation (2978 train / 526 validation in the
# recorded run); verbose=2 prints one line per epoch.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    nb_epoch=500,
    validation_split=0.15,
    verbose=2,
)

Train on 2978 samples, validate on 526 samples
Epoch 1/500
0s - loss: 0.0989 - val_loss: 0.0792
Epoch 2/500
0s - loss: 0.0467 - val_loss: 0.0337
Epoch 3/500
0s - loss: 0.0390 - val_loss: 0.0507
Epoch 4/500
0s - loss: 0.0355 - val_loss: 0.0499
Epoch 5/500
0s - loss: 0.0324 - val_loss: 0.0297
Epoch 6/500
0s - loss: 0.0299 - val_loss: 0.0264
Epoch 7/500
0s - loss: 0.0279 - val_loss: 0.0281
Epoch 8/500
0s - loss: 0.0253 - val_loss: 0.0257
Epoch 9/500
0s - loss: 0.0250 - val_loss: 0.0458
Epoch 10/500
0s - loss: 0.0230 - val_loss: 0.0244
Epoch 11/500
0s - loss: 0.0213 - val_loss: 0.0258
Epoch 12/500
0s - loss: 0.0204 - val_loss: 0.0415
Epoch 13/500
0s - loss: 0.0203 - val_loss: 0.0411
Epoch 14/500
0s - loss: 0.0185 - val_loss: 0.0235
Epoch 15/500
0s - loss: 0.0184 - val_loss: 0.0256
Epoch 16/500
0s - loss: 0.0168 - val_loss: 0.0246
Epoch 17/500
0s - loss: 0.0172 - val_loss: 0.0322
Epoch 18/500
0s - loss: 0.0160 - val_loss: 0.0344
Epoch 19/500
0s - loss: 0.0159 - val_loss: 0.0327
Epoch 20/500

<keras.callbacks.History at 0x10dfd0a10>

In [28]:
# Network predictions on the held-out split, flattened to 1-D so they line up
# element-wise with y_test in the next cell.
preds = model.predict(X_test, verbose=0).flatten()

In [29]:
# Test-set residual sum of squares for the network, computed with the same
# formula as the ridge "test rss" values so the two are comparable. Unlike the
# ridge scoring, the predictions are not clipped at zero here.
sum((preds - y_test)**2)

44.869361259006105