# Load Dataset

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Read the training spreadsheet, then keep only the rows whose pCR outcome is not 999
# (presumably the dataset's placeholder for an unknown outcome — confirm with the data owner).
dataset = pd.read_excel("Data/trainDataset.xls")
dataset = dataset.loc[dataset["pCR (outcome)"].ne(999)]
# Show the filtered frame in full (pandas truncates the rendering by itself).
dataset.head(dataset.shape[0])

Unnamed: 0,ID,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,TRG002174,1,144.000000,41.0,0,0,0,1,3,3,...,0.517172,0.375126,3.325332,0.002314,3.880772e+06,473.464852,0.000768,0.182615,0.030508,0.000758
1,TRG002178,0,142.000000,39.0,1,1,0,0,3,3,...,0.444391,0.444391,3.032144,0.005612,2.372010e+06,59.459710,0.004383,0.032012,0.001006,0.003685
2,TRG002204,1,135.000000,31.0,0,0,0,1,2,1,...,0.534549,0.534549,2.485848,0.006752,1.540027e+06,33.935384,0.007584,0.024062,0.000529,0.006447
3,TRG002206,0,12.000000,35.0,0,0,0,1,3,3,...,0.506185,0.506185,2.606255,0.003755,6.936741e+06,46.859265,0.005424,0.013707,0.000178,0.004543
4,TRG002210,0,109.000000,61.0,1,0,0,0,2,1,...,0.462282,0.462282,2.809279,0.006521,1.265399e+06,39.621023,0.006585,0.034148,0.001083,0.005626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,TRG002955,1,49.250000,46.1,0,0,0,1,3,3,...,0.439568,0.439568,3.056046,0.001339,1.671271e+07,79.989003,0.003282,0.024716,0.000812,0.003078
396,TRG002958,0,48.500000,53.3,0,0,0,1,2,1,...,0.527779,0.527778,1.500000,0.003728,2.132007e+05,0.996746,0.252582,0.007380,0.000037,0.231059
397,TRG002961,0,47.500000,68.8,1,0,0,0,3,3,...,0.313693,0.313693,3.573557,0.001112,2.008034e+07,204.864200,0.001372,0.054063,0.003697,0.001368
398,TRG002962,0,46.916667,46.0,1,0,0,0,2,1,...,0.670229,0.670229,1.857045,0.006706,5.609262e+05,9.609163,0.026591,0.018682,0.000311,0.022676


# Data Preprocessing

### Removing duplicates

In [2]:
# Exact duplicate rows are very unlikely to be genuine across this many feature columns and add
# nothing to training or testing, so remove them BEFORE the split (otherwise the same row could
# end up in both the train and the test partition).
# Reassign instead of mutating with inplace=True: the cell stays safe to re-run and the
# operation stays chainable.
dataset = dataset.drop_duplicates()

### Split Dataset

In [3]:
# Number of rows per pCR outcome label, to see how imbalanced the classes are.
dataset.loc[:, "pCR (outcome)"].value_counts()

0    299
1     96
Name: pCR (outcome), dtype: int64

In [4]:
# Single seed for the split and for the model cells further down, which read `global_seed`.
# The value 2 is the seed this split has always used, so the train/test partition is unchanged.
global_seed=2

# Target: pCR outcome. Both outcome columns are removed from the predictors so neither can
# leak into X. The 'ID' column is still present here; the pipeline drops it later.
y=dataset['pCR (outcome)']
X=dataset.drop(columns=['pCR (outcome)','RelapseFreeSurvival (outcome)'])

# 80/20 hold-out split, stratified on y so both parts keep the same class ratio.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,stratify=y,random_state=global_seed)

### Drop columns that are not applicable

In [5]:
class columnDropperTransformer(): #https://stackoverflow.com/questions/68402691/adding-dropping-column-instance-into-a-pipeline
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    The step is stateless: ``fit`` learns nothing and ``transform`` returns
    ``X.drop(columns, axis=1)``, i.e. a new frame; the input is not modified.

    ``get_params``/``set_params`` follow the scikit-learn estimator protocol so
    the step can be cloned and addressed as ``<step>__columns`` in a pipeline
    or parameter grid.

    Parameters
    ----------
    columns : label or list of labels
        Column name(s) handed to ``DataFrame.drop``.
    """
    def __init__(self,columns):
        self.columns=columns

    def get_params(self,deep=True):
        """Return the constructor parameters as a dict (``deep`` is accepted for API compatibility)."""
        return {'columns':self.columns}

    def set_params(self,**params):
        """Update constructor parameters; raises ValueError on an unknown name. Returns self."""
        for name,value in params.items():
            if name!='columns':
                raise ValueError("Invalid parameter %r for columnDropperTransformer; valid parameters: ['columns']"%name)
            self.columns=value
        return self

    def fit(self,X,y=None):
        """Nothing to learn; present only to satisfy the transformer interface."""
        return self

    def transform(self,X,y=None):
        """Return a copy of ``X`` without the configured columns."""
        return X.drop(self.columns,axis=1)

    def fit_transform(self,X,y=None):
        """Equivalent to ``fit(X, y).transform(X)``; kept so the drop logic lives in one place."""
        return self.fit(X,y).transform(X)


### missing data

In [6]:
# Per-column count of NaN/None entries.
# NOTE(review): values coded as 999 are not counted here; the pipeline imputes them separately.
dataset.isna().sum()

ID                               0
pCR (outcome)                    0
RelapseFreeSurvival (outcome)    0
Age                              0
ER                               0
                                ..
original_ngtdm_Busyness          0
original_ngtdm_Coarseness        0
original_ngtdm_Complexity        0
original_ngtdm_Contrast          0
original_ngtdm_Strength          0
Length: 120, dtype: int64

In [None]:
# from sklearn.impute import KNNImputer

# names=X_train.columns

# imp=KNNImputer(missing_values=np.nan,n_neighbors=200)
# # X_train[:]=imp.fit_transform(X_train)
# # X_test[:]=imp.transform(X_test)
# X_train=imp.fit_transform(X_train)
# X_test=imp.transform(X_test)

# imp.missing_values=np.float64(999)
# # X_train[:]=imp.fit_transform(X_train)
# # X_test[:]=imp.transform(X_test)
# X_train=imp.fit_transform(X_train)
# X_test=imp.transform(X_test)

### Removing outliers

In [7]:
class outlierHandlingWithDev():
    """Flag per-column outliers by overwriting them with a sentinel value.

    ``fit`` stores the mean and (population, ddof=0) standard deviation of every
    column. ``transform`` returns a COPY of the input in which every entry lying
    more than ``n_std`` standard deviations from its column mean is replaced by
    ``sentinel``; the caller's array is left untouched. A later step (e.g. an
    imputer configured with ``missing_values=sentinel``) is expected to fill
    the flagged cells.

    Parameters
    ----------
    n_std : float, default=3
        Allowed distance from the column mean, in standard deviations.
    sentinel : scalar, default=999
        Value written into outlier cells. It must match the marker the
        downstream imputer looks for.

    Attributes
    ----------
    mean, std : ndarray of shape (n_features,)
        Column statistics learned by ``fit``.
    """
    def __init__(self,n_std=3,sentinel=999):
        self.n_std=n_std
        self.sentinel=sentinel

    def get_params(self,deep=True):
        """Return the constructor parameters as a dict (``deep`` is accepted for API compatibility)."""
        return {'n_std':self.n_std,'sentinel':self.sentinel}

    def set_params(self,**params):
        """Update constructor parameters; raises ValueError on an unknown name. Returns self."""
        valid=self.get_params()
        for name,value in params.items():
            if name not in valid:
                raise ValueError("Invalid parameter %r for outlierHandlingWithDev; valid parameters: %r"%(name,sorted(valid)))
            setattr(self,name,value)
        return self

    def fit(self,X,y=None):
        """Learn the column means and standard deviations of a 2-D array-like ``X``.

        Raises ValueError when ``X`` is not 2-D or has no rows, because the
        statistics would be undefined.
        """
        X=np.asarray(X)
        if X.ndim!=2 or X.shape[0]==0:
            raise ValueError("outlierHandlingWithDev.fit expects a non-empty 2-D array, got shape %r"%(X.shape,))
        self.mean=X.mean(axis=0)
        self.std=X.std(axis=0)
        return self

    def transform(self,X,y=None):
        """Return a copy of ``X`` with outliers (relative to the fitted statistics) set to ``sentinel``."""
        X=np.array(X) # np.array copies, so the caller's data is never modified
        # Strict '>' keeps constant columns (std == 0) intact when values equal the mean.
        outliers=np.abs(X-self.mean)>self.n_std*self.std
        X[outliers]=self.sentinel
        return X

    def fit_transform(self,X,y=None):
        """Fit on ``X`` and return its transformed copy."""
        return self.fit(X,y).transform(X,y)

### Binning

In [None]:
# from sklearn.preprocessing import Binarizer
# datasetCopy=dataset.iloc[:,3].values.reshape(-1,1)
# transformor=Binarizer(threshold=40).fit(datasetCopy)
# transformor
# x1=transformor.transform(datasetCopy)
# dataset.iloc[:,3]=x1
# dataset

In [None]:
# datasetCopy=dataset.iloc[:,3].values.reshape(-1,1)
# transformor=Binarizer(threshold=40).fit(datasetCopy)
# transformor
# x1=transformor.transform(datasetCopy)
# dataset.iloc[:,3]=x1
# dataset

In [None]:
# from sklearn.preprocessing import KBinsDiscretizer
# datasetCopy=dataset.iloc[:,3].values.reshape(-1,1)
# # a 1-D array cannot be passed in, hence the reshape
# est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
# est.fit(datasetCopy)
# Xt = est.transform(datasetCopy)
# set(Xt.ravel())
# dataset.iloc[:,3]=Xt
# dataset

### MinMaxScaler

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# scaler=MinMaxScaler()
# X_train[:]=scaler.fit_transform(X_train)
# X_test[:]=scaler.transform(X_test)
# X_train
# from sklearn.preprocessing import StandardScaler
# scaler=StandardScaler()
# X_train[:]=scaler.fit_transform(X_train)
# X_test[:]=scaler.transform(X_test)


### PCA

In [None]:
# corr=X_train.corr()
# corr=corr.where(abs(corr)>0.9)
# count = 0
# corr_fea = []

# for i in corr.columns:
#     for j in corr.columns:
#         if abs(corr[i][j])>0.9 and i!=j:
#             if i not in corr_fea:
#                 count+=1
#                 corr_fea.append(i)

#             if j not in corr_fea:
#                 count+=1
#                 corr_fea.append(j)

# count

In [None]:
# def is_related(pair1, pair2):
#     if (pair1[0] in pair2 or pair1[1] in pair2) and pair1!=pair2 and not (pair1[0]==pair2[1] and pair1[1]==pair2[0]):
#         return True
#     return False

# def add_relation(corr_fea):
#     for i in range(len(corr_fea)):
#         for j in range(i,len(corr_fea)):
#             if i!=j:
#                 if is_related(corr_fea[i],corr_fea[j]):
#                     fea1=tuple(set(corr_fea[i])-set(corr_fea[j]))
#                     fea2=tuple(set(corr_fea[j])-set(corr_fea[i]))
#                     fea=fea1+fea2
#                     if fea not in corr_fea and rev(fea) not in corr_fea:
#                         corr_fea.append((fea1+fea2))

# def rev(fea):
#     f1=fea[0]
#     f2=fea[1]

#     return (f2,f1)

# corr=X_train.corr()
# corr=corr.where(abs(corr)>0.9)
# count=0
# corr_fea=[]
# for i in corr.columns:
#     for j in corr.columns:
#         if i!=j and abs(corr[i][j])>0.9 and ((i,j) not in corr_fea) and ((j,i) not in corr_fea):
#             count+=1
#             corr_fea.append((i,j))
#             add_relation(corr_fea)


### Feature Selection

### resample

In [None]:
# from imblearn.over_sampling import SMOTE
# from imblearn.combine import SMOTETomek

# smote=SMOTETomek()

# X_train,y_train=smote.fit_resample(X_train,y=y_train)


In [8]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks 
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Seed for every stochastic component of this cell (SMOTE, random forest, CV shuffling) so a
# re-run reproduces the same scores. 2 matches the seed used for the train/test split.
PIPELINE_SEED=2

# Search space. A float `sampling_strategy` is the minority/majority ratio after resampling and
# imbalanced-learn only accepts values in (0, 1]; ratios above 1 make every fit of that
# candidate fail, so only valid ratios are listed.
# NOTE(review): this is 11,664 candidates x 3 folds — expect a long run even with n_jobs=-1.
para={'imputeNull__n_neighbors':[10,100,200],
    'imputeMissing__n_neighbors':[10,100,200],
    'imputeOutlier__n_neighbors':[10,100,200],
    'pca__n_components':[20,40,60,80],
    'resample__smote':[SMOTE(sampling_strategy=ratio,random_state=PIPELINE_SEED)
                       for ratio in (0.6,0.8,1.0)],
    'rf__class_weight':['balanced'],
    'rf__max_depth':[10,20,30],
    'rf__max_features':[2,3,4,5],
    'rf__n_estimators':[100,200,300]
}

# Full preprocessing + model pipeline. The imblearn Pipeline is required because the
# 'resample' step is a sampler (applied during fit only, skipped at predict time).
pipe=Pipeline([('drop',columnDropperTransformer(['ID'])),                                   # identifier, not a feature
    ('imputeNull',KNNImputer(missing_values=np.nan,n_neighbors=200)),                       # real NaNs
    ('imputeMissing',KNNImputer(missing_values=np.float64(999),n_neighbors=200)),           # 999-coded values
    ('outlier',outlierHandlingWithDev()),                                                   # marks outliers with 999 ...
    ('imputeOutlier',KNNImputer(missing_values=np.float64(999),n_neighbors=200)),           # ... which are re-imputed here
    ('scaler',StandardScaler()),
    ('pca',PCA(n_components=60)),
    ('resample',SMOTETomek(smote=SMOTE(sampling_strategy=1.0,random_state=PIPELINE_SEED),tomek=TomekLinks())),
    ('rf',RandomForestClassifier(random_state=PIPELINE_SEED))
])

# Stratified, shuffled folds keep the class ratio of the imbalanced target in every fold.
# NOTE(review): scoring is the default (accuracy); consider e.g. scoring='f1' or
# 'balanced_accuracy' for this imbalanced problem.
grid=GridSearchCV(pipe,param_grid=para,n_jobs=-1,
                  cv=StratifiedKFold(n_splits=3,shuffle=True,random_state=PIPELINE_SEED))

grid.fit(X_train,y_train)
print("Best parameter (CV score=%0.3f):" % grid.best_score_)
print(grid.best_params_)

# Evaluate the TUNED pipeline: GridSearchCV refits the best candidate on all of X_train
# (refit=True by default). `pipe` itself still carries the untuned default parameters.
best_pipe=grid.best_estimator_
y_test_pred=best_pipe.predict(X_test)
print(classification_report(y_test,y_test_pred))
confusion_matrix(y_test,y_test_pred)


In [None]:
# One line per parameter name that the grid search (and its nested pipeline) exposes.
print("\n".join(grid.get_params()))

# Methods

In [None]:
from sklearn.base import clone
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.feature_selection import SequentialFeatureSelector
from imblearn.pipeline import Pipeline

# X_train still contains the string 'ID' column and 999-coded values, which MLPClassifier
# cannot consume. Prepend fresh, unfitted copies of every preprocessing step of `pipe`
# (all steps except its final classifier) so the preparation is re-fitted inside each fold.
# safe=False lets clone() fall back to a deep copy for steps without get_params().
# NOTE(review): assumes the MLP should get the same preprocessing as the random forest,
# including PCA and SMOTETomek — trim the step list if a lighter preparation is intended.
# `global_seed` has to be defined by an earlier cell.
mlp_steps=[(name,clone(step,safe=False)) for name,step in pipe.steps[:-1]]
mlp=MLPClassifier(hidden_layer_sizes=(20,30),activation="relu",alpha=0.04,max_iter=1000,batch_size=20,random_state=global_seed,early_stopping=True,shuffle=True)
clf=Pipeline(mlp_steps+[('mlp',mlp)])

# 5-fold cross-validated accuracy on the training partition.
score=cross_val_score(clf,X_train,y_train,cv=KFold(n_splits=5,shuffle=True,random_state=global_seed))
print(score.mean())

# Hold-out evaluation; predict once and reuse the result for both reports.
clf.fit(X_train,y_train)
y_test_pred=clf.predict(X_test)
print(classification_report(y_test,y_test_pred))
confusion_matrix(y_test,y_test_pred)

In [None]:
# from sklearn.metrics import confusion_matrix
# from sklearn.tree import  DecisionTreeClassifier
# from sklearn.metrics import classification_report
# from sklearn.feature_selection import SequentialFeatureSelector
# clf=DecisionTreeClassifier(class_weight='balanced',random_state=global_seed)
# # clf.fit(X_train,y_train)
# # clf.predict(X_test)
#
# clf.fit(X_train,y_train)
#
# print(classification_report(y_test,clf.predict(X_test)))
# confusion_matrix(y_test,clf.predict(X_test))

In [None]:
from sklearn.base import clone
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from imblearn.pipeline import Pipeline

# X_train still contains the string 'ID' column and 999-coded values, so the MLP is wrapped
# with fresh, unfitted copies of the preprocessing steps of `pipe` (everything except its
# final classifier). safe=False lets clone() deep-copy steps that lack get_params().
# NOTE(review): assumes the same preprocessing as the random-forest pipeline is wanted here.
# NOTE(review): this MLP has no random_state, so results vary between runs.
mlp_steps=[(name,clone(step,safe=False)) for name,step in pipe.steps[:-1]]
clf=Pipeline(mlp_steps+[('mlp',MLPClassifier(hidden_layer_sizes=(23,30),alpha=0.1,activation="relu",max_iter=500,batch_size=20))])

clf.fit(X_train,y_train)
y_test_pred=clf.predict(X_test)
print(classification_report(y_test,y_test_pred))
# Only a cell's last expression is displayed, so print both matrices explicitly
# (rows = true class, columns = predicted class).
print("test confusion matrix:")
print(confusion_matrix(y_test,y_test_pred))
print("train confusion matrix:")
print(confusion_matrix(y_train,clf.predict(X_train)))

In [None]:
from sklearn.base import clone
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline

# X_train still contains the string 'ID' column and 999-coded values, so the SVC is wrapped
# with fresh, unfitted copies of the preprocessing steps of `pipe` (everything except its
# final classifier). safe=False lets clone() deep-copy steps that lack get_params().
# NOTE(review): assumes the same preprocessing as the random-forest pipeline is wanted here.
svc_steps=[(name,clone(step,safe=False)) for name,step in pipe.steps[:-1]]
clf=Pipeline(svc_steps+[('svc',SVC(kernel="poly",random_state=global_seed,degree=2))])

# `degree` only affects the polynomial kernel, so it is searched for 'poly' alone; the other
# kernels get their own sub-grid, which avoids refitting identical models five times.
class_weights=[{0:1.,1:2.},{0:1.,1:20.},None]
para=[{'svc__kernel':["poly"],'svc__degree':[1,2,3,4,5],'svc__class_weight':class_weights},
      {'svc__kernel':["linear","rbf"],'svc__class_weight':class_weights}]
cv=GridSearchCV(clf,param_grid=para,cv=KFold(n_splits=5,random_state=global_seed,shuffle=True))
cv.fit(X_train,y_train)
print(cv.best_params_)

# scikit-learn metrics take (y_true, y_pred); the reversed order transposes the matrix and
# swaps precision with recall.
y_test_pred=cv.predict(X_test)
print(confusion_matrix(y_test,y_test_pred))
print(classification_report(y_test,y_test_pred))

# Training-set confusion matrix, to gauge over-fitting.
confusion_matrix(y_true=y_train,y_pred=cv.predict(X_train))

In [None]:
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline

# X_train still contains the string 'ID' column and 999-coded values, so the classifier is
# wrapped with fresh, unfitted copies of the preprocessing steps of `pipe` (everything except
# its final classifier). safe=False lets clone() deep-copy steps that lack get_params().
# NOTE(review): assumes the same preprocessing as the random-forest pipeline is wanted here.
knn_steps=[(name,clone(step,safe=False)) for name,step in pipe.steps[:-1]]
clf=Pipeline(knn_steps+[('knn',KNeighborsClassifier(weights="distance"))])

para={'knn__n_neighbors':[2,3,4,5,6,7,8,9,10,12,15,19]}
cv=GridSearchCV(clf,param_grid=para,cv=KFold(n_splits=5,random_state=global_seed,shuffle=True))
cv.fit(X_train,y_train)

# scikit-learn metrics take (y_true, y_pred); predict once and reuse it for both reports.
y_test_pred=cv.predict(X_test)
print(confusion_matrix(y_test,y_test_pred))
print(classification_report(y_test,y_test_pred))

In [None]:
# from sklearn.utils import class_weight
# import tensorflow as tf
# from tensorflow import keras
# model=keras.Sequential()
# model.add(keras.layers.Dense(20,input_dim=len(X_train[0]),activation="relu"))
# model.add(keras.layers.Dense(10,activation="relu"))
# model.add(keras.layers.Dense(1,activation="sigmoid"))
# model.summary()
#
# model.compile(loss="binary_crossentropy",optimizer="adam",metrics=[tf.keras.metrics.Precision(),"accuracy"])
# # model.save_weights('model.h5')
# # model.load_weights('model.h5')
# history=model.fit(X_train,np.array(y_train),epochs=20,validation_data=(X_test,np.array(y_test)),batch_size=20)
# y_pred=model.predict(X_test)
# loss,precision,acc=model.evaluate(X_test,np.array(y_test),verbose=2)

