# Machine Learning Applied to Cervical Cancer Data
- Given a dataset with 36 columns
  - 32 feature columns, 4 target columns
- Created a model using KNN to predict presence of cervical cancer for given feature values for a patient.

## Importing data and libraries

In [1]:
from google.colab import files
uploaded = files.upload()

Saving cervical-cancer.csv to cervical-cancer.csv


In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [11]:
df = pd.read_csv("cervical-cancer.csv")

In [12]:
df.describe()

Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,IUD (years),STDs,STDs (number),STDs:condylomatosis,STDs:cervical condylomatosis,STDs:vaginal condylomatosis,STDs:vulvo-perineal condylomatosis,STDs:syphilis,STDs:pelvic inflammatory disease,STDs:genital herpes,STDs:molluscum contagiosum,STDs:AIDS,STDs:HIV,STDs:Hepatitis B,STDs:HPV,STDs: Number of diagnosis,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
count,835.0,810.0,828.0,779.0,822.0,822.0,822.0,732.0,732.0,723.0,723.0,735.0,735.0,735.0,735.0,735.0,735.0,735.0,735.0,735.0,735.0,735.0,735.0,735.0,735.0,835.0,71.0,71.0,835.0,835.0,835.0,835.0,835.0,835.0,835.0,835.0
mean,27.023952,2.551852,17.020531,2.304236,0.149635,1.25385,0.465823,0.651639,2.302916,0.114799,0.527621,0.107483,0.180952,0.059864,0.0,0.005442,0.058503,0.02449,0.001361,0.001361,0.001361,0.0,0.02449,0.001361,0.002721,0.08982,6.140845,5.816901,0.021557,0.010778,0.021557,0.028743,0.041916,0.087425,0.051497,0.064671
std,8.482986,1.676686,2.817,1.455817,0.35693,4.140727,2.256273,0.476777,3.79418,0.319,1.965439,0.309937,0.568153,0.237396,0.0,0.07362,0.234853,0.154669,0.036886,0.036886,0.036886,0.0,0.154669,0.036886,0.052129,0.306335,5.895024,5.755271,0.145319,0.10332,0.145319,0.167182,0.200518,0.282626,0.221142,0.246091
min,13.0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,2.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,2.0,17.0,2.0,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,32.0,3.0,18.0,3.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,7.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,84.0,28.0,32.0,11.0,1.0,37.0,37.0,1.0,30.0,1.0,19.0,1.0,4.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,3.0,22.0,22.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Feature Engineering

- Adding values of different test into a new target and dropping the 4 target columns(tests columns)
- Replacing values in target column as: if value > 0 value = 1 else value = 0
- Using the 25 columns mentioned in the paper
- Replacing Nan values by median of the columns

In [13]:
# Making a single target column
target_columns = ['Hinselmann', 'Schiller', 'Citology', 'Biopsy']
target_column = df['Hinselmann'] + df['Schiller'] + df['Citology'] + df['Biopsy'] 
df['target'] = target_column
df.loc[df['target'] != 0, ['target']] = 1

# feature selection
features_selected = ['Age', 'Number of sexual partners', 'First sexual intercourse', 'Num of pregnancies', 'Smokes', 'Smokes (years)', 'Smokes (packs/year)','Hormonal Contraceptives', 'Hormonal Contraceptives (years)', 
'IUD', 'IUD (years)', 'STDs', 'STDs (number)', 'STDs:condylomatosis', 'STDs:cervical condylomatosis', 'STDs:vaginal condylomatosis', 'STDs:vulvo-perineal condylomatosis', 'STDs:syphilis','STDs:pelvic inflammatory disease', 'STDs:genital herpes',
'STDs:molluscum contagiosum', 'STDs:AIDS', 'STDs:HIV','STDs:Hepatitis B', 'STDs:HPV', 'target']

for i in df.columns:
  if i not in features_selected:
    df = df.drop(i, axis="columns")

# Replacing Nan by median
  
df = df.replace('?', np.NaN)
for i in df.columns:
  df[i] = df[i].fillna((df[i].median()))
  

## Model, predictions and accuracy
- Using K_fold with 5 folds
- using KNN with 5 neighbors
- using confusion matrix for model accuracy evaluation

In [14]:
X = df.drop('target', axis = 1) 
y = df.target
scores=[]


kf = KFold(n_splits=5)

for train_index, test_index in kf.split(df):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  model = KNeighborsClassifier(n_neighbors = 5, metric='euclidean')
  model.fit(X_train, y_train)
  
  y_pred = model.predict(X_test) 

  print("Score:", metrics.accuracy_score(y_test, y_pred))
  cm = confusion_matrix(y_test, y_pred)
  print(cm, "\n")

'''confusion matrix 
  [[TP, FN]
  [FP, TN]]
'''


Score: 0.8622754491017964
[[144   4]
 [ 19   0]] 

Score: 0.8862275449101796
[[148   2]
 [ 17   0]] 

Score: 0.8562874251497006
[[143   1]
 [ 23   0]] 

Score: 0.8502994011976048
[[142   0]
 [ 25   0]] 

Score: 0.8922155688622755
[[149   1]
 [ 17   0]] 



'confusion matrix \n  [[TP, FN]\n  [FP, TN]]\n'

## RDBMS IA 2 -> Implementation of paper

- 1911089 Vedant