In [123]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import os
import torch
import random
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
from IPython.display import Image

SEED = 24535


def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.cuda.manual_seed_all(seed)
    # torch.backends.cudnn.deterministic = True


seed_everything()

In [124]:
pd.set_option('display.max_rows', 500)


In [2]:
train = pd.read_csv('data_set_ALL_AML_train.csv')


In [4]:
test = pd.read_csv('data_set_ALL_AML_independent.csv')


In [57]:
train.shape, test.shape


((7129, 78), (7129, 70))

In [71]:
def get_person_columns(df):
    return [c for c in df.columns if c.isdigit()]


In [72]:
target_orig = pd.read_csv('actual.csv')
target_orig.shape


(72, 2)

In [73]:
train_expressions_cols = get_person_columns(
    train)  # list(map(str,range(1,38)))
test_expressions_cols = get_person_columns(
    test)  # list(map(str,range(39, 63)))
# expressions_cols


In [75]:
# train_expressions_cols


In [77]:
# train['Gene Accession Number']


In [78]:
train_exp = pd.DataFrame(train[train_expressions_cols]).set_index(
    train['Gene Accession Number'])
test_exp = pd.DataFrame(test[test_expressions_cols]).set_index(
    test['Gene Accession Number'])


In [79]:
import plotly.graph_objects as go

# fig = go.Figure(data=[go.Histogram(x=exp.values.reshape(-1))])
# fig.show()


In [150]:
X_train = train_exp.rename(
    {c: f"person{c}" for c in train_exp.columns}, axis='columns').T.sample(frac=1)
X_test = test_exp.rename(
    {c: f"person{c}" for c in test_exp.columns}, axis='columns').T
val_size = 16
X_val = X_train[-val_size:]
X_train = X_train[:-val_size]


In [151]:
X_train.shape, X_val.shape, X_test.shape


((22, 7129), (16, 7129), (34, 7129))

In [152]:
target = pd.Series(target_orig.cancer.values, index=[
                   f"person{n}" for n in target_orig.patient]).rename("cancer")
target_one_hot = (target == "ALL").astype(int)
target_one_hot.value_counts()
Y_train = target_one_hot.loc[X_train.index]
Y_val = target_one_hot.loc[X_val.index]
Y_test = target_one_hot.loc[X_test.index]


In [153]:
Y_val

person35    0
person23    1
person2     1
person7     1
person21    1
person28    0
person26    1
person20    1
person25    1
person33    0
person29    0
person22    1
person4     1
person3     1
person12    1
person32    0
Name: cancer, dtype: int64

In [154]:
cors = X_train.apply(lambda x: x.corr(target_one_hot))


In [155]:
# persons.join(target)


In [156]:
cors = cors.sort_values()
cors


Gene Accession Number
M84526_at          -0.919101
M96326_rna1_at     -0.900047
M62762_at          -0.895991
U50136_rna1_at     -0.886066
M69043_at          -0.876101
                      ...   
D26156_s_at         0.759177
J04615_at           0.764558
D38128_at           0.768772
HG2689-HT2785_at    0.778916
U37055_rna1_s_at    0.792280
Length: 7129, dtype: float64

In [157]:
important_features = cors[cors.abs() > 0.7].index


In [158]:
# cors.iloc[:30]


In [159]:
# cors.iloc[-15:]


In [160]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [161]:
X_train = X_train


In [162]:
reg = LogisticRegression()
reg.fit(X_train, Y_train)
predicted = reg.predict(X_val)


In [163]:
predicted


array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0])

In [164]:
from sklearn.metrics import classification_report


In [165]:
print(classification_report(Y_val, predicted))


              precision    recall  f1-score   support

           0       1.00      0.80      0.89         5
           1       0.92      1.00      0.96        11

    accuracy                           0.94        16
   macro avg       0.96      0.90      0.92        16
weighted avg       0.94      0.94      0.94        16

