In [7]:
import pandas as pd

# Load the feature matrix (X) and the label table (y).
X = pd.read_parquet("data/X.parquet")
y = pd.read_csv("data/y.csv")

# Give the unnamed leading CSV column and the verbose ER-status column short names.
# Reassigning instead of using inplace=True keeps each step explicit.
y = y.rename(columns={'Unnamed: 0': 'Sample',
                      'characteristics_ch1.4.er status': 'Er_status'})
# NOTE(review): this only has an effect if X carries an 'Unnamed: 0' column;
# rename() silently ignores missing keys, so it may be a no-op here - confirm.
X = X.rename(columns={'Unnamed: 0': 'Sample'})

# Index the labels by sample so they can be compared against X's index.
y = y.set_index('Sample')

# Features and labels must describe the same samples in the same order.
assert X.index.equals(y.index), "X and y must share the same sample index, in the same order"

# Select the target by name rather than by position, so a change in the CSV
# column order cannot silently swap in a different column.
y = y['Er_status']

Start building the pipelines for the model. First, divide the data into training and test sets, making sure that both sets have similar Er+ and Er- ratios (stratified split).

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; stratifying on y keeps the
# class proportions similar in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    train_size=0.8,
    stratify=y,
    random_state=1,
)

print("Train:", X_train.shape, "Test:", X_test.shape)

# Report the relative class frequencies of each partition.
for heading, split_labels in (
    ("Class balance (train):", y_train),
    ("Class balance (test):", y_test),
):
    print("")
    print(heading)
    print(split_labels.value_counts(normalize=True))

Train: (505, 60607) Test: (127, 60607)

Class balance (train):
Er_status
1.0    0.760396
0.0    0.239604
Name: proportion, dtype: float64

Class balance (test):
Er_status
1.0    0.76378
0.0    0.23622
Name: proportion, dtype: float64


First, set a baseline for the model using a dummy classifier.
Stratified K-fold makes sure every cross-validation fold keeps the overall class balance when splitting.
We will use ROC-AUC as our model performance metric.

In [9]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Cross-validation splitter: 5 shuffled, stratified folds with a fixed seed
# so the fold assignment is reproducible.
cross_val = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Baseline that always predicts the majority class.
dummy = DummyClassifier(strategy="most_frequent")
dummy_roc_auc = cross_val_score(
    estimator=dummy,
    X=X_train,
    y=y_train,
    scoring="roc_auc",
    cv=cross_val,
)

# Mean ROC-AUC across the folds.
print(dummy_roc_auc.mean())


0.5


The dummy classifier scores 0.5, which is the expected ROC-AUC for a model that predicts the same class for every sample.
A basic logistic regression model should improve on this score.

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit a class-weighted logistic regression.
# Keeping the scaler inside the pipeline means it is re-fitted on the
# training part of every CV fold.
log_reg = Pipeline(
    steps=[
        ("Scaler", StandardScaler()),
        (
            "model",
            LogisticRegression(
                class_weight="balanced",
                max_iter=10000,
                random_state=1,
            ),
        ),
    ]
)

log_roc_auc = cross_val_score(
    estimator=log_reg,
    X=X_train,
    y=y_train,
    scoring="roc_auc",
    cv=cross_val,
)

# Mean ROC-AUC across the folds.
print(log_roc_auc.mean())

0.9405012531328321


Test this model on the test data

In [11]:
from sklearn.metrics import roc_auc_score

# Fit on the full training set, then take the second predict_proba column
# as the score (presumably the Er_status == 1.0 class, since classes are
# ordered as in `classes_` - confirm).
y_prob = log_reg.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Held-out ROC-AUC; left as the last expression so the cell displays it.
roc_auc_score(y_true=y_test, y_score=y_prob)

0.9573883161512028