In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

import random
from itertools import combinations
import matplotlib.pyplot as plt

from test_model import test_model

## Build and clean the dataset

We also standardize the values of our columns here (zero mean, unit variance) so that features measured on larger scales do not dominate the KNN distance calculation.

In [2]:
# Comma-separated values
df = pd.read_csv("cleveland.csv")

# Rename 'num' column to 'disease' and collapse the values 1,2,3,4 into 1
# (clip is the vectorized equivalent of applying min(x, 1) to every row).
df = df.rename(columns={"num": "disease"})
df["disease"] = df["disease"].clip(upper=1)

# 'ca' and 'thal' contain literal '?' entries that are not interpreted as
# null, so both columns need an explicit cast to float. Drop every row that
# holds a '?' in either column first. The explicit .copy() makes the filtered
# frame independent of the original, so the dtype conversion below does not
# raise SettingWithCopyWarning.
has_question_mark = df["ca"].eq("?") | df["thal"].eq("?")
df = df.loc[~has_question_mark].copy()
df = df.astype({"ca": "float", "thal": "float"})

# Standardize (zero mean, unit sample standard deviation) every column except
# the 'disease' label. The label is excluded by name rather than by position
# so it is never standardized, even if the CSV column order changes.
feature_columns = df.columns.drop("disease")
std_df = df.copy()
std_df[feature_columns] = (
    df[feature_columns] - df[feature_columns].mean()
) / df[feature_columns].std()

display(df.head(10))
display(std_df.head(10))

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,1
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,1
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,disease
0,0.934603,0.68993,-2.236854,0.749116,-0.275978,2.426332,1.008496,0.017465,-0.695246,1.067164,2.26033,-0.72076,0.654772,0
1,1.376605,0.68993,0.872408,1.593577,0.743301,-0.410757,1.008496,-1.813274,1.433497,0.38113,0.642696,2.474249,-0.892713,1
2,1.376605,0.68993,0.872408,-0.65832,-0.352904,-0.410757,1.008496,-0.897904,1.433497,1.324427,0.642696,1.409246,1.170601,1
3,-1.938409,0.68993,-0.164013,-0.095345,0.050961,-0.410757,-1.001728,1.630258,-0.695246,2.096215,2.26033,-0.72076,-0.892713,0
4,-1.496407,-1.444542,-1.200433,-0.095345,-0.833696,-0.410757,1.008496,0.976423,-0.695246,0.295376,-0.974938,-0.72076,-0.892713,0
5,0.1611,0.68993,-1.200433,-0.65832,-0.218283,-0.410757,-1.001728,1.237957,-0.695246,-0.21915,-0.974938,-0.72076,-0.892713,0
6,0.824103,-1.444542,0.872408,0.467629,0.397131,-0.410757,1.008496,0.453355,-0.695246,2.18197,2.26033,1.409246,-0.892713,1
7,0.2716,-1.444542,0.872408,-0.65832,2.051054,-0.410757,-1.001728,0.584122,1.433497,-0.390658,-0.974938,-0.72076,-0.892713,0
8,0.934603,0.68993,0.872408,-0.095345,0.127887,-0.410757,1.008496,-0.113302,-0.695246,0.295376,0.642696,0.344243,1.170601,1
9,-0.170401,0.68993,0.872408,0.467629,-0.852928,2.426332,1.008496,0.23541,1.433497,1.753198,2.26033,-0.72076,1.170601,1


Defines each of our KNN models and validates them.

In [3]:
# Evaluate the model on a hand-picked subset of the standardized features
# (presumably the columns that scored highest in earlier exploration — TODO confirm).
# NOTE(review): the meaning of the positional argument 20 is defined by
# test_model, which is not visible here — confirm what it controls.
# NOTE(review): the recorded output shows recall=1.0 with precision≈0.45 on
# every run, which looks like every sample being predicted positive — verify
# test_model before trusting the reported f-score.
test_model(
    std_df,
    ["age", "fbs", "thalach", "ca"],
    20,
    verbose=True,
)

precision=[0.45112782], recall=[1.], f-score=[0.62176166], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
precision=[0.4494382], recall=[1.], f-score=[0.62015504], support=[120]
 20 | AVG FSCORE:	0.62


0.6203157006868298