In [123]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import os
import torch
import random
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
from IPython.display import Image

SEED = 24535


def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)
    # torch.backends.cudnn.deterministic = True


seed_everything()

In [124]:
pd.set_option('display.max_rows', 500)


In [2]:
train = pd.read_csv('data_set_ALL_AML_train.csv')


In [4]:
test = pd.read_csv('data_set_ALL_AML_independent.csv')


In [57]:
train.shape, test.shape


((7129, 78), (7129, 70))

In [71]:
def get_person_columns(df):
    return [c for c in df.columns if c.isdigit()]


In [72]:
target_orig = pd.read_csv('actual.csv')
target_orig.shape


(72, 2)

In [73]:
train_expressions_cols = get_person_columns(
    train)  # list(map(str,range(1,38)))
test_expressions_cols = get_person_columns(
    test)  # list(map(str,range(39, 63)))
# expressions_cols


In [75]:
# train_expressions_cols


In [77]:
# train['Gene Accession Number']


In [78]:
train_exp = pd.DataFrame(train[train_expressions_cols]).set_index(
    train['Gene Accession Number'])
test_exp = pd.DataFrame(test[test_expressions_cols]).set_index(
    test['Gene Accession Number'])


In [79]:
import plotly.graph_objects as go

# fig = go.Figure(data=[go.Histogram(x=exp.values.reshape(-1))])
# fig.show()


In [150]:
X_train = train_exp.rename(
    {c: f"person{c}" for c in train_exp.columns}, axis='columns').T.sample(frac=1)
X_test = test_exp.rename(
    {c: f"person{c}" for c in test_exp.columns}, axis='columns').T
val_size = 16
X_val = X_train[-val_size:]
X_train = X_train[:-val_size]


In [151]:
X_train.shape, X_val.shape, X_test.shape


((22, 7129), (16, 7129), (34, 7129))

In [152]:
target = pd.Series(target_orig.cancer.values, index=[
                   f"person{n}" for n in target_orig.patient]).rename("cancer")
target_one_hot = (target == "ALL").astype(int)
target_one_hot.value_counts()
Y_train = target_one_hot.loc[X_train.index]
Y_val = target_one_hot.loc[X_val.index]
Y_test = target_one_hot.loc[X_test.index]


In [173]:
import phik
from phik import resources, report

In [203]:
phik_cors = pd.concat([X_train, Y_train], axis=1).phik_matrix()['cancer'].iloc[:-1]
#[['M84526_at']]

interval columns not set, guessing: ['M84526_at', 'M96326_rna1_at', 'M62762_at', 'U50136_rna1_at', 'M69043_at', 'X04085_rna1_at', 'M27783_s_at', 'M55150_at', 'M16038_at', 'Y00787_s_at', 'M11147_at', 'L19779_at', 'M23197_at', 'L08246_at', 'X95735_at', 'M98399_s_at', 'D10495_at', 'M20203_s_at', 'M83221_at', 'X70297_at', 'M28209_at', 'M57710_at', 'U46751_at', 'D49950_at', 'M27891_at', 'M22960_at', 'U82759_at', 'M63138_at', 'M81695_s_at', 'Z32765_at', 'M32304_s_at', 'D43682_s_at', 'Y12670_at', 'X17042_at', 'M28130_rna1_s_at', 'M19045_f_at', 'L42379_at', 'L20941_at', 'X58431_rna2_s_at', 'U13666_at', 'X14008_rna1_f_at', 'HG1879-HT1919_at', 'M80899_at', 'L09235_at', 'J04027_at', 'J03801_f_at', 'L20316_at', 'X06985_at', 'U05572_s_at', 'M20681_at', 'D87433_at', 'M75715_s_at', 'U40369_rna1_at', 'D89052_at', 'X80907_at', 'X57579_s_at', 'X62654_rna1_at', 'X75042_at', 'L09717_at', 'D50310_at', 'X07730_at', 'D26579_at', 'M33684_s_at', 'M28713_at', 'D87953_at', 'D87116_at', 'HG2724-HT2820_at', 'L1166

In [204]:
phik_cors.sort_values()

L11669_at           0.519983
S82185_at           0.555827
U33821_at           0.595005
D26579_at           0.597783
M20681_at           0.661554
J03801_f_at         0.672479
J04027_at           0.678940
M63138_at           0.690348
D38128_at           0.691748
X75042_at           0.693756
L09717_at           0.704138
M75715_s_at         0.709094
U13666_at           0.709094
U41767_s_at         0.714770
X06985_at           0.716646
M28130_rna1_s_at    0.719505
D43682_s_at         0.720489
M28209_at           0.720489
M14159_cds2_at      0.727936
X80907_at           0.734069
M16038_at           0.734357
L13278_at           0.745740
M32304_s_at         0.745740
M11147_at           0.757116
U40369_rna1_at      0.777642
X57579_s_at         0.779847
U05572_s_at         0.782389
L42379_at           0.787682
D50918_at           0.791194
D26156_s_at         0.802460
M95178_at           0.811808
HG2689-HT2785_at    0.820732
D49950_at           0.827109
M81695_s_at         0.836503
K01383_at     

In [205]:
phik_imp_features = phik_cors[phik_cors > 0.75].index
phik_imp_features

Index(['M84526_at', 'M96326_rna1_at', 'M62762_at', 'U50136_rna1_at',
       'M69043_at', 'X04085_rna1_at', 'M27783_s_at', 'M55150_at',
       'Y00787_s_at', 'M11147_at', 'L19779_at', 'M23197_at', 'L08246_at',
       'X95735_at', 'M98399_s_at', 'D10495_at', 'M20203_s_at', 'M83221_at',
       'X70297_at', 'M57710_at', 'U46751_at', 'D49950_at', 'M27891_at',
       'M22960_at', 'U82759_at', 'M81695_s_at', 'Z32765_at', 'Y12670_at',
       'X17042_at', 'M19045_f_at', 'L42379_at', 'L20941_at',
       'X58431_rna2_s_at', 'X14008_rna1_f_at', 'HG1879-HT1919_at', 'M80899_at',
       'L09235_at', 'L20316_at', 'U05572_s_at', 'D87433_at', 'U40369_rna1_at',
       'D89052_at', 'X57579_s_at', 'X62654_rna1_at', 'D50310_at', 'X07730_at',
       'M33684_s_at', 'M28713_at', 'D87953_at', 'D87116_at',
       'HG2724-HT2820_at', 'M21551_rna1_at', 'M95178_at', 'K01383_at',
       'AF009426_at', 'L02547_at', 'L47738_at', 'M31523_at', 'D50918_at',
       'U22376_cds2_s_at', 'X52142_at', 'D26156_s_at', 'J04615_a

In [154]:
cors = X_train.apply(lambda x: x.corr(target_one_hot))


In [155]:
# persons.join(target)


In [156]:
cors = cors.sort_values()
cors


Gene Accession Number
M84526_at          -0.919101
M96326_rna1_at     -0.900047
M62762_at          -0.895991
U50136_rna1_at     -0.886066
M69043_at          -0.876101
                      ...   
D26156_s_at         0.759177
J04615_at           0.764558
D38128_at           0.768772
HG2689-HT2785_at    0.778916
U37055_rna1_s_at    0.792280
Length: 7129, dtype: float64

In [157]:
important_features = cors[cors.abs() > 0.7].index


In [158]:
# cors.iloc[:30]


In [159]:
# cors.iloc[-15:]


In [160]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [206]:
X_train = X_train[phik_imp_features]
X_val = X_val[phik_imp_features]


In [207]:
reg = LogisticRegression()
reg.fit(X_train, Y_train)
predicted = reg.predict(X_val)


In [208]:
predicted


array([1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0])

In [209]:
from sklearn.metrics import classification_report


In [210]:
print(classification_report(Y_val, predicted))


              precision    recall  f1-score   support

           0       0.80      0.80      0.80         5
           1       0.91      0.91      0.91        11

    accuracy                           0.88        16
   macro avg       0.85      0.85      0.85        16
weighted avg       0.88      0.88      0.88        16

