In [1]:
# NOTE(review): %pylab populates the interactive namespace from numpy and
# matplotlib (see the message printed below), so bare names such as `sum` used
# in later cells may resolve to the numpy versions rather than the builtins.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas
import gensim

import matplotlib.pyplot as plt
import matplotlib

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

import numpy as np

import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

matplotlib.style.use('ggplot')

## 1. Read and train w2v model

In [3]:
# Path to the text corpus that the next cell feeds to gensim's LineSentence to
# train the word2vec model.
# NOTE(review): presumably one whitespace-separated sequence per line -- confirm
# against the file.
seq_data_file = "../data/sequences_spaces.txt"

In [4]:
# Iterate over the corpus file through gensim's LineSentence reader.
sentences = gensim.models.word2vec.LineSentence(seq_data_file)

# Width of each embedding vector; seq2vec output length is a multiple of this.
w2v_dim = 20

# NOTE(review): no seed is passed and several workers are used, so the learned
# embeddings (and every number downstream) may differ between runs -- confirm
# whether reproducibility matters here.
model = gensim.models.Word2Vec(
    sentences,
    size=w2v_dim,    # embedding dimensionality
    window=4,        # context window passed to Word2Vec
    workers=4,       # number of training worker threads
    min_count=10,    # frequency threshold passed to Word2Vec
)

In [5]:
def seq2vec(seq):
    """Embed a sequence as one flat vector of per-character word2vec vectors.

    Every character of ``seq`` is looked up in the module-level ``model`` and
    the resulting rows are flattened in order, so a 9-character sequence with
    20-dimensional embeddings yields a vector of length 180.

    NOTE(review): presumably raises KeyError when a character is missing from
    the model vocabulary (e.g. filtered out by ``min_count``) -- confirm.
    """
    tokens = list(seq)
    per_token_vectors = model[tokens]
    return per_token_vectors.flatten()

## 2. Read and prepare data

In [6]:
# Comma-separated measurements table. Later cells read its 'mhc',
# 'peptide_length', 'sequence' and 'meas' columns.
data_file = "../data/bdata.log.txt"

# Loaded with pandas' default integer row index (no index column is specified).
data = pandas.read_csv(data_file, sep = ',')

In [7]:
# Restrict the data set to one MHC allele and one peptide length, so that every
# selected peptide is embedded into a feature vector of the same size.
selected_mhc = 'HLA-A*03:01'
selected_length = 9

# Build a single boolean mask over the full frame. The previous version fed a
# label index to `.iloc` (which is positional) and then filtered the subset
# with a mask computed on the full frame; that is only correct while `data`
# carries its default 0..n-1 index.
selected_mask = (data['mhc'] == selected_mhc) & (data['peptide_length'] == selected_length)
indexes = data.index[selected_mask.values]

# One row per selected peptide; each row is the flattened seq2vec embedding.
selected_X = pandas.DataFrame(list(data.loc[indexes, 'sequence'].apply(seq2vec)))

# Regression target, in the same row order as selected_X.
selected_y = data.loc[indexes, 'meas']

selected_X.shape

(5231, 180)

## 3. Fit and evaluate ridge regression models

In [8]:
# Target normalisation is currently disabled; kept here for reference:
#   m = selected_y.mean()
#   d = selected_y.max() - selected_y.min()
#   selected_y = (selected_y - m) / d

# Hold out 33% of the rows for testing; the seed is fixed via random_state.
random_number = 122222
X_train, X_test, y_train, y_test = train_test_split(
    selected_X,
    selected_y,
    test_size=0.33,
    random_state=random_number,
)

In [9]:
def ridge_regression(X, y, alpha):
    """Fit a Ridge model and report its in-sample residual sum of squares.

    Parameters:
        X: feature matrix used for fitting.
        y: target values, one per row of ``X``.
        alpha: regularisation strength forwarded to ``Ridge``.

    Returns:
        A ``(rss, intercept, model)`` tuple: the sum of squared differences
        between the zero-clipped predictions on ``X`` and ``y``, the fitted
        ``intercept_``, and the fitted ``Ridge`` estimator itself.
    """
    ridgereg = Ridge(normalize=True, alpha=alpha)
    ridgereg.fit(X, y)

    # Negative predictions are floored at zero before scoring.
    fitted = ridgereg.predict(X).clip(min=0)
    squared_residuals = (fitted - y) ** 2
    rss = sum(squared_residuals)

    return rss, ridgereg.intercept_, ridgereg

In [10]:
# Regularisation strengths to sweep.
alpha_ridge = [0.001, 0.02, 0.03, 0.04, 0.05,0.06, 0.07, 0.08, 0.09, 0.1]

# Size the result containers from the alpha list itself, so adding or removing
# an alpha cannot leave the loop out of step with the list.
n_alphas = len(alpha_ridge)
rss = [0] * n_alphas          # training RSS per alpha
intercept = [0] * n_alphas    # fitted intercept per alpha
predictors = [0] * n_alphas   # fitted Ridge estimator per alpha

for i in range(n_alphas):
    rss[i], intercept[i], predictors[i] = ridge_regression(X_train, y_train, alpha_ridge[i])

In [11]:
# Predict the test data with every fitted model and report train/test RSS.

# Derive the count from the fitted models rather than hard-coding it.
n_models = len(predictors)
test_rss = [0] * n_models
for i in range(n_models):
    m_pred = predictors[i]
    # Floor predictions at zero, matching how the training RSS was computed.
    test_pred = m_pred.predict(X_test).clip(min = 0)
    test_rss[i] = sum((test_pred - y_test)**2)

    # Single-argument print(...) renders the same text under the Python 2
    # print statement and the Python 3 print function.
    print('-' * 10)
    print("alpha =  %s" % alpha_ridge[i])
    print("train rss =  %s" % rss[i])
    print("test rss =  %s" % test_rss[i])


# Keep the raw (unclipped) test predictions of the first model in the sweep
# for the inspection cells that follow.
m_pred = predictors[0]
t_pred = m_pred.predict(X_test)

----------
alpha =  0.001
train rss =  85.338397909
test rss =  45.1674529172
----------
alpha =  0.02
train rss =  87.9453577513
test rss =  46.4631086812
----------
alpha =  0.03
train rss =  88.6244696465
test rss =  46.8297443455
----------
alpha =  0.04
train rss =  89.2255203539
test rss =  47.1465607452
----------
alpha =  0.05
train rss =  89.7787053698
test rss =  47.4282835546
----------
alpha =  0.06
train rss =  90.2956064045
test rss =  47.6878320983
----------
alpha =  0.07
train rss =  90.7807748679
test rss =  47.9278375591
----------
alpha =  0.08
train rss =  91.2439173339
test rss =  48.1515405094
----------
alpha =  0.09
train rss =  91.6887055316
test rss =  48.3639328321
----------
alpha =  0.1
train rss =  92.1186360765
test rss =  48.5651634263


In [12]:
# Wrap the test predictions in a DataFrame, floor negative values at zero,
# then show summary statistics.
t_pred = pandas.DataFrame(t_pred).clip(lower=0)
t_pred.describe()

Unnamed: 0,0
count,1727.0
mean,0.253355
std,0.189435
min,0.0
25%,0.087506
50%,0.227312
75%,0.408371
max,0.756517


In [13]:
# NOTE(review): t_pred was already converted to a DataFrame in the previous
# cell, so this re-wrap looks redundant -- confirm before removing.
t_pred = pandas.DataFrame(t_pred)

and so on...

In [14]:
# Rebinds y_test (one of the train_test_split outputs) to a DataFrame.
# NOTE(review): earlier cells that compute RSS against y_test were written for
# the pre-conversion object; re-running them after this cell may behave
# differently -- confirm.
y_test = pandas.DataFrame(y_test)

In [15]:
# Attach the observed targets next to the predictions (column 0) as column '1'.
# ravel() hands pandas a flat 1-D vector whether y_test is still a Series or
# has already been wrapped in a single-column DataFrame, instead of relying on
# pandas to squeeze an (n, 1) array.
t_pred['1'] = np.asarray(y_test).ravel()

In [16]:
# Correlation matrix between the predictions (column 0) and the observed
# targets (column '1').
t_pred.corr()

Unnamed: 0,0,1
0,1.0,0.774791
1,0.774791,1.0
