In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
# import XGBoost
from xgboost import XGBClassifier
import os
import itertools
import tqdm


In [2]:
# Input data location. Defaults to the original author's Dropbox folder; set the
# FINETUNE_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
data_dir = os.environ.get(
    'FINETUNE_DATA_DIR',
    '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_finetune_optimization/April_30_Finetune_Data',
)
# Output location for the hyper-parameter summaries written further down.
# Override with SAVED_MODELS_DIR; '~' is expanded to the current user's home.
local_dir = os.path.expanduser(os.environ.get('SAVED_MODELS_DIR', '~/Desktop/saved_models'))



In [3]:
def _read_indexed_csv(filename):
    """Read `filename` from `data_dir`, using its first column as the row index."""
    return pd.read_csv(os.path.join(data_dir, filename), index_col=0)


# Features (X_*) and label tables (y_*) for the train and validation splits.
X_train = _read_indexed_csv('X_finetune_train.csv')
y_train = _read_indexed_csv('y_finetune_train.csv')

X_val = _read_indexed_csv('X_finetune_val.csv')
y_val = _read_indexed_csv('y_finetune_val.csv')

# Train and validation stacked row-wise; used later to refit a chosen model
# before it is scored on the test split.
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

# Held-out test split.
X_test = _read_indexed_csv('X_finetune_test.csv')
y_test = _read_indexed_csv('y_finetune_test.csv')

In [4]:
# Label column to model. 'IMDC BINARY' is the alternative target in the same
# label tables.
# y_col = 'IMDC BINARY'
y_col = 'IMDC ORDINAL'

# For every split: keep only the samples that have a label for `y_col`, then
# align the feature matrix to the surviving sample index.
# `.dropna()` returns a new Series, so the label frame's column is never
# mutated in place (the previous `dropna(inplace=True)` operated on a column
# selected out of the DataFrame).
# NOTE: this cell rebinds each y_* name from a DataFrame to a Series, so it
# must be preceded by a fresh run of the loading cell; running it twice in a
# row fails because the Series no longer has a `y_col` column.
y_train = y_train[y_col].dropna()
X_train = X_train.loc[y_train.index]

y_val = y_val[y_col].dropna()
X_val = X_val.loc[y_val.index]

y_trainval = y_trainval[y_col].dropna()
X_trainval = X_trainval.loc[y_trainval.index]

y_test = y_test[y_col].dropna()
X_test = X_test.loc[y_test.index]

In [5]:
# Sanity check: number of validation samples that have a label for `y_col`.
y_val.shape

(143,)

In [6]:
# Sanity check: validation feature matrix after alignment to the labelled
# samples; the row count should match `y_val.shape` above.
X_val.shape

(143, 2736)

In [7]:
# Exhaustive grid search for a one-vs-rest logistic regression: every
# combination is fitted on the training split and scored (AUC, accuracy) on
# the validation split. The grid below expands to 40 combinations.
logistic_regression_param_grid = {
            # 'penalty': ['l1', 'l2'],
            # 'solver' : ['liblinear'],
            'penalty': ['elasticnet'],
            'solver' : ['saga'],
            # denser alternative: [0, 0.25, 0.33, 0.5, 0.66, 0.75, 1]
            'l1_ratio': [0, 0.33, 0.66, 1],
            # denser alternative: [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10]
            'C': [0.001, 0.01, 0.1, 1, 10],
            'class_weight': ['balanced'],
            'max_iter': [10000],
            'tol': [1e-3, 1e-4],
            }

param_grid = logistic_regression_param_grid

# Cartesian product of all value lists -> one keyword dict per combination.
param_combs = list(itertools.product(*param_grid.values()))
grid_search = [dict(zip(param_grid.keys(), values)) for values in param_combs]

# Convert once: these arrays do not change between iterations.
X_train_np, y_train_np = X_train.to_numpy(), y_train.to_numpy()
X_val_np, y_val_np = X_val.to_numpy(), y_val.to_numpy()

# hp_summary maps the combination's position in `grid_search` to its
# validation metrics and the parameters that produced them.
hp_summary = {}
# tqdm wraps the list itself (not `enumerate(...)`, which has no length) so
# the progress bar can show the total and an ETA.
# NOTE(review): no random_state is passed; the 'saga' solver shuffles the data,
# so the scores may differ slightly between runs.
for i, gs in enumerate(tqdm.tqdm(grid_search)):

    model = LogisticRegression(multi_class='ovr',**gs)
    model.fit(X_train_np, y_train_np)
    y_pred = model.predict(X_val_np)
    y_pred_proba = model.predict_proba(X_val_np)
    # Full probability matrix + multi_class='ovr' -> one-vs-rest multi-class AUC.
    auc = roc_auc_score(y_val_np, y_pred_proba, multi_class='ovr')
    acc = accuracy_score(y_val_np, y_pred)
    hp_summary[i] = {'auc': auc, 'acc': acc, 'params': gs}


0it [00:00, ?it/s]

40it [20:56, 31.41s/it]


In [8]:
import json

In [9]:
# Show the directory that the result files below are written to.
local_dir

'/Users/jonaheaton/Desktop/saved_models'

In [10]:
# Persist the raw grid-search results as JSON.
# Create the output directory first so a fresh machine does not fail with
# FileNotFoundError. Note that JSON object keys are always strings, so the
# integer keys of `hp_summary` are written as "0", "1", ...
os.makedirs(local_dir, exist_ok=True)
with open(os.path.join(local_dir, 'logistic_regression_hp_summary.json'), 'w') as f:
    json.dump(hp_summary, f)

In [11]:
# Tabulate the grid-search results: each key of `hp_summary` becomes a row
# label and the inner dict keys ('auc', 'acc', 'params') become the columns.
hp_summary_df = pd.DataFrame.from_dict(data=hp_summary, orient='index')

In [12]:
# Display the validation AUC / accuracy table, one row per grid combination.
hp_summary_df

Unnamed: 0,auc,acc,params
0,0.759468,0.594406,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
1,0.763992,0.65035,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
2,0.729592,0.552448,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
3,0.747581,0.622378,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
4,0.717965,0.552448,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
5,0.729569,0.587413,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
6,0.713511,0.538462,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
7,0.711653,0.545455,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
8,0.712524,0.538462,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."
9,0.708137,0.538462,"{'penalty': 'elasticnet', 'solver': 'saga', 'l..."


In [13]:
# Save the tabulated grid-search results as CSV.
# The output directory is created on demand so the write cannot fail just
# because `local_dir` does not exist yet.
os.makedirs(local_dir, exist_ok=True)
hp_summary_df.to_csv(os.path.join(local_dir, 'logistic_regression_hp_summary.csv'))

In [16]:
# Refit the best grid-search configuration (highest validation AUC) on the
# combined train+validation data and score it on the held-out test split.

# Row label of the best combination, then the parameter dict stored for it.
best_hp = hp_summary_df['auc'].idxmax()
print(f'Best HP: {best_hp}')
best_params = hp_summary_df.loc[best_hp, 'params']
# Labelled "Best params" so it is not confused with the index printed above.
print(f'Best params: {best_params}')
model = LogisticRegression(multi_class='ovr',**best_params)

model.fit(X_trainval, y_trainval)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)
y_trainval_pred_proba = model.predict_proba(X_trainval)
auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
# AUC on the data the model was fitted on; comparing it with the test AUC
# shows how much the model over-fits.
auc_trainval = roc_auc_score(y_trainval, y_trainval_pred_proba, multi_class='ovr')
# Test accuracy is computed but not printed by this cell.
acc = accuracy_score(y_test, y_pred)

print(f'Test AUC: {auc}')
print('TrainVal AUC: ', auc_trainval)





Best HP: 1
Best HP: {'penalty': 'elasticnet', 'solver': 'saga', 'l1_ratio': 0, 'C': 0.001, 'class_weight': 'balanced', 'max_iter': 10000, 'tol': 0.0001}
Test AUC: 0.7210410499404477
TrainVal AUC:  0.9871827148346838


In [29]:
# Switch to a binary target and rebuild every split from disk, because the
# earlier cells replaced the label tables with a single-column Series.
y_col = 'MSKCC BINARY'


def _read_finetune_csv(filename):
    """Read one CSV from `data_dir`, using its first column as the row index."""
    return pd.read_csv(os.path.join(data_dir, filename), index_col=0)


def _labelled_subset(X, y, target):
    """Return `(X, y[target])` restricted to the samples whose `target` label is not NaN.

    `dropna()` is used in its non-mutating form so the label table passed in
    is left untouched.
    """
    y_target = y[target].dropna()
    return X.loc[y_target.index], y_target


X_train = _read_finetune_csv('X_finetune_train.csv')
y_train = _read_finetune_csv('y_finetune_train.csv')

X_val = _read_finetune_csv('X_finetune_val.csv')
y_val = _read_finetune_csv('y_finetune_val.csv')

# Stack train and validation before filtering, as the label filter below is
# applied to the combined table separately.
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

X_test = _read_finetune_csv('X_finetune_test.csv')
y_test = _read_finetune_csv('y_finetune_test.csv')

# Keep only labelled samples in each split and align features to them.
X_train, y_train = _labelled_subset(X_train, y_train, y_col)
X_val, y_val = _labelled_subset(X_val, y_val, y_col)
X_trainval, y_trainval = _labelled_subset(X_trainval, y_trainval, y_col)
X_test, y_test = _labelled_subset(X_test, y_test, y_col)

In [30]:
from xgboost import XGBClassifier

# ---- Stage 1: fit on the training split, score on the validation split ----
# Library-default hyper-parameters.
clf = XGBClassifier()
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the score for the second class label; it is
# what the binary AUC is computed from.
y_pred_proba = clf.predict_proba(X_val)[:,1]
y_pred = clf.predict(X_val)
acc = accuracy_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_pred_proba)

# AUC on the training split itself; computed but not printed.
y_train_pred_proba = clf.predict_proba(X_train)[:,1]
train_auc = roc_auc_score(y_train, y_train_pred_proba)

print('XGBoost')
print(f'Val AUC: {auc}')

# ---- Stage 2: fresh model on train+validation, score on the test split ----
clf = XGBClassifier()
clf.fit(X_trainval, y_trainval)

y_pred_proba = clf.predict_proba(X_test)[:,1]
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

# AUC on the train+validation data itself; computed but not printed.
y_trainval_pred_proba = clf.predict_proba(X_trainval)[:,1]
train_auc = roc_auc_score(y_trainval, y_trainval_pred_proba)

print(f'Test AUC: {auc}')


XGBoost
Val AUC: 0.830026455026455
Test AUC: 0.9087035909920876


In [31]:
# Random forest baseline with library-default hyper-parameters.
# NOTE(review): no random_state is set, so the AUCs vary from run to run.

from sklearn.ensemble import RandomForestClassifier

# Instantiate the classifier
clf = RandomForestClassifier()
print('Random Forest')

# Fit once on the training split. A second identical fit call would only
# retrain from scratch and discard this model.
clf.fit(X_train, y_train)

# Score on the validation split; column 1 of predict_proba feeds the binary AUC.
y_pred = clf.predict(X_val)
y_pred_proba = clf.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, y_pred_proba)
acc = accuracy_score(y_val, y_pred)

# AUC on the training split itself; computed but not printed.
y_train_pred_proba = clf.predict_proba(X_train)[:,1]
train_auc = roc_auc_score(y_train, y_train_pred_proba)


print(f'Val AUC: {auc}')

# Refit the same estimator on train+validation before scoring the test split.
clf.fit(X_trainval, y_trainval)

# Score on the held-out test split.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)

# AUC on the train+validation data itself; computed but not printed.
y_trainval_pred_proba = clf.predict_proba(X_trainval)[:,1]
train_auc = roc_auc_score(y_trainval, y_trainval_pred_proba)

print(f'Test AUC: {auc}')

Random Forest
Val AUC: 0.8783068783068783
Test AUC: 0.8706634205721242


In [32]:
# Support-vector classifier baseline with library-default hyper-parameters.
from sklearn.svm import SVC

# probability=True is required for predict_proba, which the AUC needs.
# NOTE(review): no random_state is set; the probability calibration involves
# shuffling, so scores may vary slightly between runs.
clf = SVC(probability=True)
print('SVC')

# Fit once on the training split. A second identical fit call would only
# retrain from scratch and discard this model.
clf.fit(X_train, y_train)

# Score on the validation split; column 1 of predict_proba feeds the binary AUC.
y_pred = clf.predict(X_val)
y_pred_proba = clf.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, y_pred_proba)
acc = accuracy_score(y_val, y_pred)

# AUC on the training split itself; computed but not printed.
y_train_pred_proba = clf.predict_proba(X_train)[:,1]
train_auc = roc_auc_score(y_train, y_train_pred_proba)


print(f'Val AUC: {auc}')

# Refit the same estimator on train+validation before scoring the test split.
clf.fit(X_trainval, y_trainval)

# Score on the held-out test split.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)

# AUC on the train+validation data itself; computed but not printed.
y_trainval_pred_proba = clf.predict_proba(X_trainval)[:,1]
train_auc = roc_auc_score(y_trainval, y_trainval_pred_proba)

print(f'Test AUC: {auc}')

SVC
Val AUC: 0.8756613756613757
Test AUC: 0.9099208764455264


In [33]:
# Logistic-regression baseline with library-default hyper-parameters
# (no grid search, unlike the tuned multi-class model earlier in the notebook).
from sklearn.linear_model import LogisticRegression

# Instantiate the classifier
clf = LogisticRegression()
print('Logistic Regression')

# Fit once on the training split. A second identical fit call would only
# retrain from scratch and discard this model.
clf.fit(X_train, y_train)

# Score on the validation split; column 1 of predict_proba feeds the binary AUC.
y_pred = clf.predict(X_val)
y_pred_proba = clf.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, y_pred_proba)
acc = accuracy_score(y_val, y_pred)

# AUC on the training split itself; computed but not printed.
y_train_pred_proba = clf.predict_proba(X_train)[:,1]
train_auc = roc_auc_score(y_train, y_train_pred_proba)


print(f'Val AUC: {auc}')

# Refit the same estimator on train+validation before scoring the test split.
clf.fit(X_trainval, y_trainval)

# Score on the held-out test split.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)

# AUC on the train+validation data itself; computed but not printed.
y_trainval_pred_proba = clf.predict_proba(X_trainval)[:,1]
train_auc = roc_auc_score(y_trainval, y_trainval_pred_proba)

print(f'Test AUC: {auc}')

Logistic Regression
Val AUC: 0.8630952380952381
Test AUC: 0.9099208764455265


In [34]:
# Decision-tree baseline with library-default hyper-parameters.
# NOTE(review): no random_state is set, so the AUCs may vary from run to run.
from sklearn.tree import DecisionTreeClassifier

# Instantiate the classifier
clf = DecisionTreeClassifier()
print('Decision Tree')

# Fit once on the training split. A second identical fit call would only
# retrain from scratch and discard this model.
clf.fit(X_train, y_train)

# Score on the validation split; column 1 of predict_proba feeds the binary AUC.
y_pred = clf.predict(X_val)
y_pred_proba = clf.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, y_pred_proba)
acc = accuracy_score(y_val, y_pred)

# AUC on the training split itself; computed but not printed.
y_train_pred_proba = clf.predict_proba(X_train)[:,1]
train_auc = roc_auc_score(y_train, y_train_pred_proba)


print(f'Val AUC: {auc}')

# Refit the same estimator on train+validation before scoring the test split.
clf.fit(X_trainval, y_trainval)

# Score on the held-out test split.
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)[:,1]
auc = roc_auc_score(y_test, y_pred_proba)
acc = accuracy_score(y_test, y_pred)

# AUC on the train+validation data itself; computed but not printed.
y_trainval_pred_proba = clf.predict_proba(X_trainval)[:,1]
train_auc = roc_auc_score(y_trainval, y_trainval_pred_proba)

print(f'Test AUC: {auc}')

Decision Tree
Val AUC: 0.7493386243386242
Test AUC: 0.7994522215459525
