In [3]:
import matplotlib
matplotlib.use('Agg')

# Handle table-like data and matrices
import numpy as np
import pandas as pd

# Helpers
import sys
sys.path.insert(0,'../../')
from utils import data_path,results_path,grid_search,estimator_result,cross_validate,evaluate_param
from scipy.sparse import csr_matrix,save_npz,load_npz
from sklearn.model_selection import cross_val_score,LeaveOneOut,StratifiedKFold
from sklearn.metrics import accuracy_score,roc_auc_score,roc_curve
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler,StandardScaler
import pickle

# Feature selection
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFECV,VarianceThreshold

# Algorithm
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Load data

In [11]:
train = pd.read_pickle(data_path + 'feature_selection/' + 'per/'+'df_109_multi.pkl')
test = pd.read_pickle(data_path + 'feature_selection/' + 'per/'+'df_test_109_multi.pkl')
train_y = pd.read_csv(data_path + 'train_Y.csv')['CMV_status']
test_y = pd.read_csv(data_path + 'test_Y.csv')['CMV_status']

In [12]:
# new_train = (train - train.min()) / (train.max() - train.min())
# # train.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))  
# new_test = (test - test.min()) / (test.max() - test.min())
# new_test.fillna(0,inplace=True)
# na_TCRs = new_test.columns[new_test.isna().any()].tolist()

## Define random forest classifier

In [17]:
rf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                max_depth=31, max_features='auto', max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=1, min_samples_split=5,
                min_weight_fraction_leaf=0.0, n_estimators=180, n_jobs=1,
                oob_score=True, random_state=0, verbose=0, warm_start=False)

### Min-max scaling

In [13]:
scaler = MinMaxScaler()
train_transformed = scaler.fit_transform(train)
train_normalized = pd.DataFrame(train_transformed,columns=train.columns.values)

test_transformed = scaler.fit_transform(test)
test_normalized = pd.DataFrame(test_transformed,columns=test.columns.values)

In [18]:
estimator_result(rf,train_normalized.values,train_y,test_normalized.values,test_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=31, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=180, n_jobs=1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

Cross validation:
accuracy score 0.9079987026862029
AUROC 0.9638969035890218
________________________________________________________________________________
Training set:
accuracy score 0.9812792511700468
AUROC 0.9967757156338471
log-loss: 0.133048576357105
________________________________________________________________________________
Testing set;
accuracy score: 0.825
AUROC 0.92128445581131
log-loss: 0.37701898988805144
classification_report
             precision    recall  f1-score   support

          0       0.90      0.78      0.84        69
          1       0.75      0.88      

### Standardization

In [15]:
sscaler = StandardScaler()
train_transformed2 = sscaler.fit_transform(train)
train_normalized2 = pd.DataFrame(train_transformed2,columns=train.columns.values)

test_transformed2 = sscaler.fit_transform(test)
test_normalized2 = pd.DataFrame(test_transformed2,columns=test.columns.values)

In [19]:
estimator_result(rf,train_normalized2.values,train_y,test_normalized2.values,test_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=31, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=180, n_jobs=1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

Cross validation:
accuracy score 0.9079987026862029
AUROC 0.9638969035890218
________________________________________________________________________________
Training set:
accuracy score 0.9812792511700468
AUROC 0.9967757156338471
log-loss: 0.133056763401149
________________________________________________________________________________
Testing set;
accuracy score: 0.85
AUROC 0.9332196646774651
log-loss: 0.355266306106166
classification_report
             precision    recall  f1-score   support

          0       0.89      0.84      0.87        69
          1       0.80      0.86      0

### Dicimal point movement

In [22]:
print(np.where((train*1e3)>1))
print(np.where((test*1e3)>1))

(array([], dtype=int64), array([], dtype=int64))
(array([], dtype=int64), array([], dtype=int64))


In [26]:
estimator_result(rf,(train*1e3).values,train_y,(test*1e3).values,test_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=31, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=180, n_jobs=1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

Cross validation:
accuracy score 0.9079987026862029
AUROC 0.9638969035890218
________________________________________________________________________________
Training set:
accuracy score 0.9812792511700468
AUROC 0.9967855457691097
log-loss: 0.13300830537218758
________________________________________________________________________________
Testing set;
accuracy score: 0.9083333333333333
AUROC 0.9485649332196646
log-loss: 0.32833501100986334
classification_report
             precision    recall  f1-score   support

          0       0.89      0.96      0.92        69
          1       0.9

In [None]:
# lr = LogisticRegression(C=0.5,intercept_scaling=1,random_state=0)
# estimator_result(lr,new_train.values,train_y,new_test.values,test_y)

# lr = LogisticRegression(random_state=0)
# param_grid={
#     'C':[0.001,0.01,0.1,0.5,0.8,1],
#     'penalty':['l1','l2'],
#     'intercept_scaling':[0.5,1,2,3,4,5,10]
# }
# grid_search(lr,new_train.values,train_y,new_test.values,test_y,param_grid)