In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression
import pickle

In [2]:
call = pd.read_csv("../data/Train_call.txt", sep='\t')

call

Unnamed: 0,Chromosome,Start,End,Nclone,Array.129,Array.34,Array.67,Array.24,Array.22,Array.36,...,Array.64,Array.89,Array.30,Array.35,Array.93,Array.10,Array.123,Array.100,Array.134,Array.130
0,1,2927,43870,3,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,-1,0
1,1,85022,216735,4,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,-1,0
2,1,370546,372295,4,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,-1,0
3,1,471671,786483,5,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,-1,0
4,1,792533,907406,13,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,23,153062077,153452633,57,1,1,1,0,1,1,...,1,1,1,1,1,1,1,1,1,1
2830,23,153466463,153491568,4,1,1,1,0,1,1,...,2,1,1,1,1,1,1,1,1,1
2831,23,153504394,153933426,55,1,1,1,0,1,1,...,2,1,1,1,1,1,1,1,1,1
2832,23,153938998,153989329,5,1,1,1,0,1,1,...,2,1,1,1,1,1,1,1,1,1


In [3]:
clin = pd.read_csv("../data/Train_clinical.txt", sep='\t')

clin

Unnamed: 0,Sample,Subgroup
0,Array.129,HER2+
1,Array.34,HR+
2,Array.67,HR+
3,Array.24,Triple Neg
4,Array.22,Triple Neg
...,...,...
95,Array.10,HER2+
96,Array.123,HR+
97,Array.100,HR+
98,Array.134,HR+


In [4]:
X=call[clin[clin['Subgroup']!='HER2+']['Sample']].T

X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2824,2825,2826,2827,2828,2829,2830,2831,2832,2833
Array.34,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.67,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.24,0,0,0,0,0,0,0,-1,0,0,...,0,0,0,0,0,0,0,0,0,0
Array.22,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.36,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Array.35,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.93,0,0,1,1,1,1,1,1,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.123,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Array.100,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [5]:
y = clin[clin['Subgroup']!='HER2+'].set_index('Sample')['Subgroup']

y

Sample
Array.34            HR+
Array.67            HR+
Array.24     Triple Neg
Array.22     Triple Neg
Array.36            HR+
                ...    
Array.35            HR+
Array.93     Triple Neg
Array.123           HR+
Array.100           HR+
Array.134           HR+
Name: Subgroup, Length: 68, dtype: object

In [6]:
folds_file = '../data/folds.pickle'
with open(folds_file, 'rb') as fid:
    outer_cross_val = pickle.load(fid)

In [22]:
X_trainval_list, y_trainval_list = [], []
X_test_list, y_test_list = [], []

for split in outer_cross_val:
    inner_cross_val, test_index = split
    n_samples = X.shape[0]
    mask = np.ones(n_samples).astype('bool')
    mask[test_index]=False
    X_trainval, y_trainval = X.iloc[mask,:], y[mask]
    X_test, y_test = X.iloc[test_index,:], y[test_index]

    X_trainval_list.append(X_trainval)
    y_trainval_list.append(y_trainval)

    X_test_list.append(X_test)
    y_test_list.append(y_test)

In [35]:
model = LogisticRegression(penalty='l1',solver='saga', max_iter=10000,random_state=12345)

accuracy_list = []
av_logloss_list = []

for (X_train, y_train, X_val, y_val) in zip(X_trainval_list, y_trainval_list, X_test_list, y_test_list):
    model.fit(X_train, np.float32(y_train=='HR+'))
    y_pred = model.predict_proba(X_val)[:,1]
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_val=='HR+', predictions)
    av_logloss = log_loss(y_true=y_val=='HR+',labels=[True,False], y_pred=y_pred)

    accuracy_list.append(accuracy)
    av_logloss_list.append(av_logloss)

Results for the ridge regression model
Accuracy mean: 0.8119047619047619, std: 0.12797480619406368
Logloss mean: 0.42312253878114775, std: 0.19352284208991855


In [36]:
print('Results for the ridge regression model')
print('Accuracy mean: {}, std: {}'.format(np.median(accuracy_list),np.std(accuracy_list)))
print('Logloss mean: {}, std: {}'.format(np.mean(av_logloss_list),np.std(av_logloss_list)))

Results for the ridge regression model
Accuracy mean: 0.8452380952380952, std: 0.12797480619406368
Logloss mean: 0.42312253878114775, std: 0.19352284208991855
