## ML improving model

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import preprocessing

In [2]:
df = pd.read_csv('ml-data.csv', delimiter=',', encoding= 'unicode_escape')

In [3]:
df = df.drop(['Diagnosed','Months','Tumor size','COD2','COD','Malignat number','Sequence number','Id'], axis = 1)

In [4]:
df.loc[(df['Survive']=='Dead in 10') | (df['Survive']=='Dead in 10+'), 'Survive'] ='Dead in 5+'

In [5]:
df.loc[(df['Survive']=='Dead in 5+') ]

Unnamed: 0,Age,Race,Sex,Diagnosed group,Stage,Site,Report,Property,Tumor size group,Surgery,Chemotherapy,Radiotherapy,Survive
10,15-19,White,Male,1975-1999,Localized,Bones and Joints,Hospital inpatient/outpatient or clinic,"$75,000+",< 85,No,Yes,No,Dead in 5+
18,15-19,White,Male,1975-1999,Regional,Bones and Joints,Hospital inpatient/outpatient or clinic,"$75,000+",85-115,Unknown,Yes,Yes,Dead in 5+
20,15-19,White,Male,1975-1999,Regional,Bones and Joints,Hospital inpatient/outpatient or clinic,"< $60,999",85-115,No,Yes,No,Dead in 5+
28,10-14,White,Female,1975-1999,Distant,Bones and Joints,Hospital inpatient/outpatient or clinic,"$75,000+",85-115,Unknown,Yes,No,Dead in 5+
41,10-14,White,Male,1975-1999,Distant,Bones and Joints,Hospital inpatient/outpatient or clinic,"< $60,999",85-115,No,Yes,No,Dead in 5+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1881,05-09,White,Female,2010-2018,Regional,Bones and Joints,Hospital inpatient/outpatient or clinic,"$60,000 - $74,999",< 85,Yes,Yes,Yes,Dead in 5+
1892,05-09,Other,Male,2010-2018,Localized,Bones and Joints,Hospital inpatient/outpatient or clinic,"$60,000 - $74,999",> 115,Yes,Yes,No,Dead in 5+
1898,10-14,White,Female,2010-2018,Regional,Bones and Joints,Hospital inpatient/outpatient or clinic,"$60,000 - $74,999",< 85,Yes,Yes,Yes,Dead in 5+
2043,10-14,White,Male,2010-2018,Localized,Bones and Joints,Hospital inpatient/outpatient or clinic,"$75,000+",< 85,No,Yes,No,Dead in 5+


## Final attributes

In [6]:
df.columns

Index(['Age', 'Race', 'Sex', 'Diagnosed group', 'Stage', 'Site', 'Report',
       'Property', 'Tumor size group', 'Surgery', 'Chemotherapy',
       'Radiotherapy', 'Survive'],
      dtype='object')

### One hot Encoding - columns

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
df_ohe = df
categorical_columns = ['Age', 'Race', 'Sex','Diagnosed group', 'Stage', 'Site', 'Report',
       'Property', 'Tumor size group', 'Surgery', 'Chemotherapy',
       'Radiotherapy']

for col in categorical_columns:
    col_ohe = pd.get_dummies(df[col], prefix=col)
    df_ohe = pd.concat((df_ohe, col_ohe), axis=1).drop(col, axis=1)
    
print(df_ohe.columns)

Index(['Survive', 'Age_00', 'Age_01-04', 'Age_05-09', 'Age_10-14', 'Age_15-19',
       'Race_Black', 'Race_Other', 'Race_White', 'Sex_Female', 'Sex_Male',
       'Diagnosed group_1975-1999', 'Diagnosed group_2000-2009',
       'Diagnosed group_2010-2018', 'Stage_Distant', 'Stage_Localized',
       'Stage_Regional', 'Site_Bones and Joints',
       'Site_Kidney and Renal Pelvis', 'Site_Other',
       'Site_Soft Tissue including Heart', 'Report_Death certificate only',
       'Report_Hospital inpatient/outpatient or clinic',
       'Report_Laboratory only (hospital or private)',
       'Report_Other hospital outpatient unit or surgery center (2006+)',
       'Report_Physicians office/private medical practitioner (LMD)',
       'Report_Radiation treatment or medical oncology center (2006+)',
       'Property_$60,000 - $74,999', 'Property_$75,000+', 'Property_< $60,999',
       'Tumor size group_85-115', 'Tumor size group_< 85',
       'Tumor size group_> 115', 'Surgery_No', 'Surgery_Unknow

## Make survive as category

### Split on training and test data

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
training, test = train_test_split(df_ohe, test_size=0.25, random_state=35)

In [11]:
features = training.drop(["Survive"], axis=1).columns

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    df_ohe[features], df_ohe['Survive'], test_size=0.25, random_state=35)

## Linear SVM

In [13]:
from sklearn.metrics import classification_report

In [14]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear', C=0.5, degree=9)
clf_svm.fit(training[features], training['Survive'])
print(classification_report(test['Survive'], clf_svm.predict(test[features])))

              precision    recall  f1-score   support

   Dead in 5       0.78      0.67      0.72       206
  Dead in 5+       0.00      0.00      0.00        34
     Survive       0.80      0.92      0.86       406

    accuracy                           0.79       646
   macro avg       0.53      0.53      0.53       646
weighted avg       0.75      0.79      0.77       646



  _warn_prf(average, modifier, msg_start, len(result))


## RBF SVM

In [15]:
clf_svm_rbf = svm.SVC(kernel='rbf')
clf_svm_rbf.fit(training[features], training['Survive'])
print(classification_report(test['Survive'], clf_svm_rbf.predict(test[features])))

              precision    recall  f1-score   support

   Dead in 5       0.83      0.69      0.75       206
  Dead in 5+       0.00      0.00      0.00        34
     Survive       0.81      0.94      0.87       406

    accuracy                           0.81       646
   macro avg       0.55      0.55      0.54       646
weighted avg       0.77      0.81      0.79       646



  _warn_prf(average, modifier, msg_start, len(result))


### Select the best hyperparameter for SVM


In [16]:
from sklearn.model_selection import GridSearchCV

In [15]:
param_grid = [{'kernel': ['rbf','linear'], 
               'gamma': [1e-3, 1e-5],
                'C': [1000, 1100, 1300], 
                'degree' : [1,3,6]}]
clf = GridSearchCV(svm.SVC(), param_grid, cv=2)
clf.fit(X_train, y_train)

GridSearchCV(cv=2, estimator=SVC(),
             param_grid=[{'C': [1000, 1100, 1300], 'degree': [1, 3, 6],
                          'gamma': [0.001, 1e-05],
                          'kernel': ['rbf', 'linear']}])

In [16]:
print(clf.best_params_)

{'C': 1300, 'degree': 1, 'gamma': 0.001, 'kernel': 'rbf'}


In [56]:
print(classification_report(test['Survive'], clf.predict(test[features])))

              precision    recall  f1-score   support

   Dead in 5       0.82      0.70      0.76       206
  Dead in 5+       0.00      0.00      0.00        34
     Survive       0.81      0.94      0.87       406

    accuracy                           0.81       646
   macro avg       0.54      0.55      0.54       646
weighted avg       0.77      0.81      0.79       646



  _warn_prf(average, modifier, msg_start, len(result))


# 

## Decision tree optimisation

In [17]:
from sklearn import tree
clf_dtree = tree.DecisionTreeClassifier(max_leaf_nodes = 20, max_depth= 20)

clf_dtree.fit(training[features], training['Survive'])
print(classification_report(test['Survive'], clf_dtree.predict(test[features])))

              precision    recall  f1-score   support

   Dead in 5       0.81      0.69      0.74       206
  Dead in 5+       0.33      0.12      0.17        34
     Survive       0.82      0.93      0.87       406

    accuracy                           0.81       646
   macro avg       0.65      0.58      0.60       646
weighted avg       0.79      0.81      0.79       646



In [34]:
tree_para = [{'max_leaf_nodes' : [20,25,30,35,40,45],
             'max_depth':[20,25,30,35],
              'min_samples_split': [2, 3, 4]
             }]
clf_dt = GridSearchCV(tree.DecisionTreeClassifier(), tree_para, cv=2)
clf_dt.fit(training[features], training['Survive'])

GridSearchCV(cv=2, estimator=DecisionTreeClassifier(),
             param_grid=[{'max_depth': [20, 25, 30, 35],
                          'max_leaf_nodes': [20, 25, 30, 35, 40, 45],
                          'min_samples_split': [2, 3, 4]}])

In [35]:
print(clf_dt.best_params_)

{'max_depth': 25, 'max_leaf_nodes': 40, 'min_samples_split': 3}


In [36]:
print(classification_report(test['Survive'], clf_dt.predict(test[features])))

              precision    recall  f1-score   support

   Dead in 5       0.82      0.71      0.76       206
  Dead in 5+       0.50      0.15      0.23        34
     Survive       0.83      0.93      0.88       406

    accuracy                           0.82       646
   macro avg       0.72      0.60      0.62       646
weighted avg       0.81      0.82      0.81       646



# 

## Random forest optimisation

In [39]:
from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(max_depth=3, random_state=0)

clf_rf.fit(training[features], training['Survive'])
print(classification_report(test['Survive'], clf_rf.predict(test[features])))

              precision    recall  f1-score   support

   Dead in 5       0.87      0.45      0.59       206
  Dead in 5+       0.00      0.00      0.00        34
     Survive       0.73      0.97      0.83       406

    accuracy                           0.75       646
   macro avg       0.53      0.47      0.47       646
weighted avg       0.73      0.75      0.71       646



  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
rf_para = [{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}]

In [None]:
clf_rf = GridSearchCV(RandomForestClassifier(), rf_para, cv=2)
clf_rf.fit(training[features], training['Survive'])

In [None]:
print(clf_rf.best_params_)
print(classification_report(test['Survive'], clf_rf.predict(test[features])))

# 

## KNN optimization

In [None]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=5,weights='distance')

clf_knn.fit(training[features], training['Survive'])
print(classification_report(test['Survive'], clf_knn.predict(test[features])))

In [None]:
knn_para = [{
    'n_neighbors' : [3,5,7,9,11,13,15],
    'weight_options' : ["uniform", "distance"],
    'leaf_size' : [10,30,50,70,90,110]  
}]

In [None]:
clf_knn = GridSearchCV(KNeighborsClassifier(), knn_para, cv=2)
clf_knn.fit(training[features], training['Survive'])