# ML draft

Method and code is adopted from https://github.com/morrismanfung/yukoproject2022/blob/main/02-model/001-knn.ipynb

## General Data Wrangling

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PowerTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
data_full = pd.read_csv( 'data/risk_factors_cervical_cancer.csv')

In [3]:
data_full.head()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [4]:
data_full[ 'target'] = data_full[ 'Hinselmann'] | data_full[ 'Schiller'] | data_full[ 'Citology'] | data_full[ 'Biopsy']
data_full = data_full.replace('?', np.nan)

In [5]:
data_full[ 'target'].value_counts()

0    756
1    102
Name: target, dtype: int64

In [6]:
STDs = ['STDs:condylomatosis', 'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis',
 'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis', 'STDs:pelvic inflammatory disease',
 'STDs:genital herpes', 'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV']

for i in STDs:
    print( i)
    print( data_full[ i].value_counts())
    print()

STDs:condylomatosis
0.0    709
1.0     44
Name: STDs:condylomatosis, dtype: int64

STDs:cervical condylomatosis
0.0    753
Name: STDs:cervical condylomatosis, dtype: int64

STDs:vaginal condylomatosis
0.0    749
1.0      4
Name: STDs:vaginal condylomatosis, dtype: int64

STDs:vulvo-perineal condylomatosis
0.0    710
1.0     43
Name: STDs:vulvo-perineal condylomatosis, dtype: int64

STDs:syphilis
0.0    735
1.0     18
Name: STDs:syphilis, dtype: int64

STDs:pelvic inflammatory disease
0.0    752
1.0      1
Name: STDs:pelvic inflammatory disease, dtype: int64

STDs:genital herpes
0.0    752
1.0      1
Name: STDs:genital herpes, dtype: int64

STDs:molluscum contagiosum
0.0    752
1.0      1
Name: STDs:molluscum contagiosum, dtype: int64

STDs:AIDS
0.0    753
Name: STDs:AIDS, dtype: int64

STDs:HIV
0.0    735
1.0     18
Name: STDs:HIV, dtype: int64



In [7]:
Dxs = ['Dx:Cancer', 'Dx:CIN', 'Dx:HPV', 'Dx']
for i in Dxs:
    print( i)
    print( data_full[ i].value_counts())
    print()

Dx:Cancer
0    840
1     18
Name: Dx:Cancer, dtype: int64

Dx:CIN
0    849
1      9
Name: Dx:CIN, dtype: int64

Dx:HPV
0    840
1     18
Name: Dx:HPV, dtype: int64

Dx
0    834
1     24
Name: Dx, dtype: int64



In [9]:
columns_tbc = [ # to be considered
    'Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)',
    'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs (number)', 'Dx:Cancer', 'Dx:CIN', 'Dx:HPV']

In [10]:
data_full[ columns_tbc].columns

Index(['Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)',
       'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs (number)',
       'Dx:Cancer', 'Dx:CIN', 'Dx:HPV'],
      dtype='object')

In [11]:
numeric_features = [ 'Age', 'Number of sexual partners', 'First sexual intercourse',
       'Num of pregnancies', 'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives (years)', 'IUD (years)', 'STDs (number)']
binary_features = ['Dx:Cancer', 'Dx:CIN', 'Dx:HPV']

column_transformer = make_column_transformer(
    ( make_pipeline( SimpleImputer( strategy = 'median'), PowerTransformer(), StandardScaler()), numeric_features),
    ( SimpleImputer( strategy = 'most_frequent'), binary_features)
)

In [12]:
X = data_full[ columns_tbc]
y = data_full[ 'target']

In [13]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2, stratify = y, random_state = 123)

## KNN

In [14]:
pipe_knn = make_pipeline( column_transformer, KNeighborsClassifier()) # No class weight in KNN
cv_result = cross_validate( pipe_knn, X_train, y_train, cv = 5, return_train_score = True, scoring = 'recall')
pd.DataFrame( cv_result)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.166718,0.035096,0.0,0.046154
1,0.124094,0.030898,0.0,0.060606
2,0.15902,0.045257,0.0625,0.045455
3,0.112319,0.025005,0.0,0.060606
4,0.085793,0.051801,0.0,0.138462


In [15]:
param_grid = {
    "kneighborsclassifier__n_neighbors": list( range( 5, 35, 5))
}

grid_search = GridSearchCV(
    pipe_knn, param_grid, cv = 5, scoring = 'recall', n_jobs=-1, return_train_score = True
)

grid_search.fit( X_train, y_train)

results = pd.DataFrame(grid_search.cv_results_)
results[ ['params', 'mean_test_score']].sort_values( 'mean_test_score', ascending = False)

Unnamed: 0,params,mean_test_score
0,{'kneighborsclassifier__n_neighbors': 5},0.0125
1,{'kneighborsclassifier__n_neighbors': 10},0.0
2,{'kneighborsclassifier__n_neighbors': 15},0.0
3,{'kneighborsclassifier__n_neighbors': 20},0.0
4,{'kneighborsclassifier__n_neighbors': 25},0.0
5,{'kneighborsclassifier__n_neighbors': 30},0.0


In [16]:
y_hat = grid_search.predict( X_test)

In [17]:
print( confusion_matrix( y_test, y_hat, labels = [ 0, 1]))
print( classification_report( y_test, y_hat))

[[150   2]
 [ 20   0]]
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       152
           1       0.00      0.00      0.00        20

    accuracy                           0.87       172
   macro avg       0.44      0.49      0.47       172
weighted avg       0.78      0.87      0.82       172



## SVC

In [18]:
pipe_svc = make_pipeline( column_transformer, SVC( class_weight = 'balanced'))
cv_result = cross_validate( pipe_svc, X_train, y_train, cv = 5, return_train_score = True, scoring = 'recall')
pd.DataFrame( cv_result)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.163721,0.03005,0.176471,0.630769
1,0.093608,0.029338,0.25,0.69697
2,0.164628,0.01885,0.3125,0.666667
3,0.098411,0.030297,0.375,0.757576
4,0.132948,0.049229,0.294118,0.6


In [19]:
param_dist = {
    'svc__C': [ 10**x for x in range( -2, 5)],
    'svc__gamma': [ 10**x for x in range( -2, 5)]
}

random_search = RandomizedSearchCV(
    pipe_svc, param_dist, n_iter = 30, cv = 5, scoring = 'recall', n_jobs=-1, return_train_score = True
)

random_search.fit( X_train, y_train)

In [20]:
results = pd.DataFrame( random_search.cv_results_)
results[ ['params', 'mean_test_score', 'mean_train_score']].sort_values( 'mean_test_score', ascending = False).head()

Unnamed: 0,params,mean_test_score,mean_train_score
17,"{'svc__gamma': 10000, 'svc__C': 0.1}",1.0,1.0
5,"{'svc__gamma': 1000, 'svc__C': 0.1}",1.0,1.0
26,"{'svc__gamma': 10, 'svc__C': 0.1}",0.9875,1.0
23,"{'svc__gamma': 0.1, 'svc__C': 0.1}",0.4875,0.545641
1,"{'svc__gamma': 0.01, 'svc__C': 1}",0.427206,0.472541


In [23]:
y_hat = random_search.predict( X_test)
print( confusion_matrix( y_test, y_hat, labels = [ 0, 1]))
print( classification_report( y_test, y_hat))

[[152   0]
 [ 20   0]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       152
           1       0.00      0.00      0.00        20

    accuracy                           0.88       172
   macro avg       0.44      0.50      0.47       172
weighted avg       0.78      0.88      0.83       172



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Try a less complex model to reduce overfitting.

In [24]:
pipe_svc_opt = make_pipeline( column_transformer, SVC( class_weight = 'balanced', gamma = 10, C = 0.1))
pipe_svc_opt.fit( X_train, y_train)
y_hat = pipe_svc_opt.predict( X_test)
print( confusion_matrix( y_test, y_hat, labels = [ 0, 1]))
print( classification_report( y_test, y_hat))

[[152   0]
 [ 20   0]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.94       152
           1       0.00      0.00      0.00        20

    accuracy                           0.88       172
   macro avg       0.44      0.50      0.47       172
weighted avg       0.78      0.88      0.83       172



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest (Let's give it a try :))

In [25]:
from sklearn.ensemble import RandomForestClassifier
pipe_rfc = make_pipeline( column_transformer, RandomForestClassifier())
cv_result = cross_validate( pipe_rfc, X_train, y_train, cv = 5, return_train_score = True, scoring = 'recall')
pd.DataFrame( cv_result)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.561667,0.059207,0.117647,0.984615
1,0.517966,0.035004,0.0,0.984848
2,0.482249,0.050875,0.0,0.984848
3,0.405943,0.045113,0.0,0.969697
4,0.413298,0.041038,0.176471,0.969231


In [26]:
param_dist = {
    'randomforestclassifier__n_estimators': [ 100*x for x in range( 1, 11)],
    'randomforestclassifier__max_depth': [ 10*x for x in range( 1, 11)],
    'randomforestclassifier__max_features': [ 'sqrt', 'log2'],
    'randomforestclassifier__criterion': [ 'gini', 'entropy', 'log_loss'],
    'randomforestclassifier__bootstrap': [ True, False]
}

random_search = RandomizedSearchCV(
    pipe_rfc, param_dist, n_iter = 30, cv = 5, scoring = 'recall', n_jobs=-1, return_train_score = True, random_state = 918
)

random_search.fit( X_train, y_train)

results = pd.DataFrame(random_search.cv_results_)
results[ ['params', 'mean_test_score']].sort_values( 'mean_test_score', ascending = False).head()

Unnamed: 0,params,mean_test_score
27,"{'randomforestclassifier__n_estimators': 100, ...",0.083824
23,"{'randomforestclassifier__n_estimators': 400, ...",0.083088
17,"{'randomforestclassifier__n_estimators': 700, ...",0.071324
22,"{'randomforestclassifier__n_estimators': 700, ...",0.071324
12,"{'randomforestclassifier__n_estimators': 300, ...",0.071324


In [27]:
print( results[ ['params', 'mean_test_score']].sort_values( 'mean_test_score', ascending = False).iloc[ 0, 0])
y_hat = random_search.predict( X_test)
print( confusion_matrix( y_test, y_hat, labels = [ 0, 1]))
print( classification_report( y_test, y_hat))

{'randomforestclassifier__n_estimators': 100, 'randomforestclassifier__max_features': 'sqrt', 'randomforestclassifier__max_depth': 90, 'randomforestclassifier__criterion': 'entropy', 'randomforestclassifier__bootstrap': False}
[[149   3]
 [ 18   2]]
              precision    recall  f1-score   support

           0       0.89      0.98      0.93       152
           1       0.40      0.10      0.16        20

    accuracy                           0.88       172
   macro avg       0.65      0.54      0.55       172
weighted avg       0.83      0.88      0.84       172



## Naive Bayes

In [28]:
from sklearn.naive_bayes import GaussianNB

pipe_nb = make_pipeline( column_transformer, GaussianNB())
cv_result = cross_validate( pipe_nb, X_train, y_train, cv = 5, return_train_score = True, scoring = 'recall')
pd.DataFrame( cv_result)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.095623,0.037316,0.294118,0.169231
1,0.069891,0.013005,0.0625,0.181818
2,0.070011,0.013422,0.125,0.19697
3,0.066693,0.019947,0.25,0.151515
4,0.065566,0.013081,0.117647,0.2


In [29]:
pipe_nb.fit( X_train, y_train) # As no hyperparameter optimization for Gaussian naive Bayes
y_hat = pipe_nb.predict( X_test)
print( confusion_matrix( y_test, y_hat, labels = [ 0, 1]))
print( classification_report( y_test, y_hat))

[[147   5]
 [ 12   8]]
              precision    recall  f1-score   support

           0       0.92      0.97      0.95       152
           1       0.62      0.40      0.48        20

    accuracy                           0.90       172
   macro avg       0.77      0.68      0.72       172
weighted avg       0.89      0.90      0.89       172



## Logistic Regression

In [33]:
from sklearn.linear_model import LogisticRegression
pipe_logreg = make_pipeline( column_transformer, LogisticRegression( max_iter = 1000, solver = 'saga', class_weight = 'balanced', random_state = 123))
cv_result = cross_validate( pipe_logreg, X_train, y_train, cv = 5, return_train_score = True, scoring = 'recall')
pd.DataFrame( cv_result)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.196229,0.01093,0.529412,0.569231
1,0.103071,0.015999,0.375,0.606061
2,0.099153,0.008077,0.625,0.545455
3,0.153031,0.027822,0.5625,0.575758
4,0.093169,0.01474,0.352941,0.523077


In [34]:
param_dist = {
    'logisticregression__C': [ 10**x for x in range( -2, 5)],
    'logisticregression__penalty': [ 'none', 'l2', 'l1', 'elasticnet'],
    'logisticregression__l1_ratio': [ 0, 0.5, 1]
}

grid_search = GridSearchCV(
    pipe_logreg, param_dist, cv = 5, scoring = 'precision', n_jobs=-1, return_train_score = True
)

grid_search.fit( X_train, y_train)

results = pd.DataFrame( grid_search.cv_results_)
results[ ['params', 'mean_test_score']].sort_values( 'mean_test_score', ascending = False).head()



Unnamed: 0,params,mean_test_score
35,"{'logisticregression__C': 1, 'logisticregressi...",0.203434
34,"{'logisticregression__C': 1, 'logisticregressi...",0.203434
30,"{'logisticregression__C': 1, 'logisticregressi...",0.203434
26,"{'logisticregression__C': 1, 'logisticregressi...",0.203434
19,"{'logisticregression__C': 0.1, 'logisticregres...",0.202316


In [35]:
results[ ['params', 'mean_test_score']].sort_values( 'mean_test_score', ascending = False).iloc[ 0, 0]

{'logisticregression__C': 1,
 'logisticregression__l1_ratio': 1,
 'logisticregression__penalty': 'elasticnet'}

In [37]:
y_hat = grid_search.predict( X_test)
print( confusion_matrix( y_test, y_hat, labels = [ 0, 1]))
print( classification_report( y_test, y_hat))

[[108  44]
 [ 12   8]]
              precision    recall  f1-score   support

           0       0.90      0.71      0.79       152
           1       0.15      0.40      0.22        20

    accuracy                           0.67       172
   macro avg       0.53      0.56      0.51       172
weighted avg       0.81      0.67      0.73       172

