## Data clearing


In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
df = pd.read_csv(r'ObesityDataSet_raw_and_data_sinthetic.csv')
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# Dropping uninformative columns

columns_to_drop = ['Gender', 'Age', 'Height', 'Weight', 'SCC']
df.drop(columns_to_drop, axis=1, inplace=True)

In [4]:
#Replacing written data with integers 

boolean_columns = ['family_history_with_overweight', 'FAVC', 'SMOKE']
for column in boolean_columns:   
    df[column] = df[column].map({'no': 0, 'yes': 1})

CAEC_replacement = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
df['CAEC'] = df['CAEC'].map(CAEC_replacement)

CALC_replacement = {'no': 0,'Sometimes': 1, 'Frequently': 2, 'Always': 3}
df['CALC'] = df['CALC'].map(CALC_replacement)

MTRANS_replacement = {'Walking': 1, 'Bike':2, 'Public_Transportation': 3, 'Motorbike': 4, 'Automobile': 5}
df['MTRANS'] = df['MTRANS'].map(MTRANS_replacement)

# There is not enough data to have 7 classes, I'm going to merge some classes together
CLASS_LABEL_replacement = {'Insufficient_Weight': 1, 'Normal_Weight': 2, 'Overweight_Level_I': 3, 'Overweight_Level_II': 3,
       'Obesity_Type_I': 4, 'Obesity_Type_II': 4,
       'Obesity_Type_III': 4}
df['NObeyesdad'] = df['NObeyesdad'].map(CLASS_LABEL_replacement)

df.head()

Unnamed: 0,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,FAF,TUE,CALC,MTRANS,NObeyesdad
0,1,0,2.0,3.0,1,0,2.0,0.0,1.0,0,3,2
1,1,0,3.0,3.0,1,1,3.0,3.0,0.0,1,3,2
2,1,0,2.0,3.0,1,0,2.0,2.0,1.0,2,3,2
3,0,0,3.0,3.0,1,0,2.0,2.0,0.0,2,1,3
4,0,0,2.0,1.0,1,0,2.0,0.0,0.0,1,3,3


## Building the model

In [5]:
# Splitting data

X = df.iloc[:, 0:11]
Y = df.iloc[:, 11]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size = 0.2)

In [6]:
# Scaling data

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [7]:
# Choosing n_neighbours

len(Y)**0.5
#45.9 rounded to an odd = 45

45.94562003064057

In [8]:
# KNN model init

classifier = KNeighborsClassifier(n_neighbors = 45, p=2, metric='minkowski', leaf_size=30, weights='uniform')
print(classifier)

KNeighborsClassifier(n_neighbors=45)


In [9]:
# Fitting the model

classifier.fit(X_train, Y_train)

KNeighborsClassifier(n_neighbors=45)

In [10]:
# Using model to predict

Y_pred = classifier.predict(X_test)

In [11]:
# Model evaluation

cm = confusion_matrix(Y_test, Y_pred)
print(cm)
print(accuracy_score(Y_test, Y_pred))

[[ 53   1   6   5]
 [ 11  10  16  20]
 [ 10   5  44  53]
 [  0   0  11 178]]
0.6737588652482269


## Improving the model

In [12]:
# Uniform or distance weight

#uniform
classifier_uniform = KNeighborsClassifier(n_neighbors = 45, p=2, metric='minkowski', leaf_size=30, weights='uniform')
classifier_uniform.fit(X_train, Y_train)

Y_pred = classifier_uniform.predict(X_test)
print(accuracy_score(Y_test, Y_pred), 'uniform')

# distance
classifier_distance = KNeighborsClassifier(n_neighbors = 45, p=2, metric='minkowski', leaf_size=30, weights='distance')
classifier_distance.fit(X_train, Y_train)

Y_pred = classifier_distance.predict(X_test)
print(accuracy_score(Y_test, Y_pred), 'distance')


0.6737588652482269 uniform
0.7115839243498818 distance


In [13]:
# Choosing the right k value
result = []

for i in range(11, 60):
    classifier = KNeighborsClassifier(n_neighbors = i, p=2, metric='minkowski', leaf_size=30, weights='distance')
    classifier.fit(X_train, Y_train)
    
    Y_pred = classifier.predict(X_test)
    result.append([accuracy_score(Y_test, Y_pred), i])
    
print(max(result))

[0.7328605200945626, 12]


In [14]:
# Choosing the right p value
result = []

for i in range(10, 51, 1):
    #range function takes only intergers, multiplication by 10 fixes the problem
    i = i/10
    
    classifier = KNeighborsClassifier(n_neighbors = 12, p=i, metric='minkowski', leaf_size=30, weights='distance')
    classifier.fit(X_train, Y_train)
    
    Y_pred = classifier.predict(X_test)
    result.append([accuracy_score(Y_test, Y_pred), i])

print(max(result))

[0.75177304964539, 1.0]
