In [71]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import train_test_split

In [51]:
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [52]:
# Load training data
df_train = pd.read_csv('dataset_2/Training.csv')

In [53]:
# Visualize first 5 samples of dataset2
df_train.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis,Unnamed: 133
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Fungal infection,


In [54]:
# Drop the last unused coloum in dataset2 
df_train = df_train.drop(df_train.columns[-1], axis=1)

In [55]:
# Convert label column from string to int
df_train["prognosis"], unique_labels = pd.factorize(df_train["prognosis"])

# Print the mapping of disease names to integer labels
label_mapping = dict(zip(unique_labels, range(len(unique_labels))))
label_mapping = {disease_name: label_index for label_index, disease_name in label_mapping.items()}

print("This is the label mapping: ")
print(label_mapping)

This is the label mapping: 
{0: 'Fungal infection', 1: 'Allergy', 2: 'GERD', 3: 'Chronic cholestasis', 4: 'Drug Reaction', 5: 'Peptic ulcer diseae', 6: 'AIDS', 7: 'Diabetes ', 8: 'Gastroenteritis', 9: 'Bronchial Asthma', 10: 'Hypertension ', 11: 'Migraine', 12: 'Cervical spondylosis', 13: 'Paralysis (brain hemorrhage)', 14: 'Jaundice', 15: 'Malaria', 16: 'Chicken pox', 17: 'Dengue', 18: 'Typhoid', 19: 'hepatitis A', 20: 'Hepatitis B', 21: 'Hepatitis C', 22: 'Hepatitis D', 23: 'Hepatitis E', 24: 'Alcoholic hepatitis', 25: 'Tuberculosis', 26: 'Common Cold', 27: 'Pneumonia', 28: 'Dimorphic hemmorhoids(piles)', 29: 'Heart attack', 30: 'Varicose veins', 31: 'Hypothyroidism', 32: 'Hyperthyroidism', 33: 'Hypoglycemia', 34: 'Osteoarthristis', 35: 'Arthritis', 36: '(vertigo) Paroymsal  Positional Vertigo', 37: 'Acne', 38: 'Urinary tract infection', 39: 'Psoriasis', 40: 'Impetigo'}


In [56]:
# Visualize the dataset after pre-processing
df_train.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Assign samples and labels to X and y
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]

In [58]:
# Split train and test dataset
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [59]:
# Initializing k-means model 
knn = KNeighborsClassifier(n_neighbors=4, weights="uniform")
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4)

In [60]:
# load testing data
df_test = pd.read_csv('dataset_2/Testing.csv')

# Convert label column from string to int
df_test["prognosis"], unique_labels = pd.factorize(df_test["prognosis"])

# Print the mapping of disease names to integer labels
label_mapping_test = dict(zip(unique_labels, range(len(unique_labels))))
label_mapping_test = {disease_name: label_index for label_index, disease_name in label_mapping.items()}

In [61]:
# Assing testing samples to X and y
X_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]

In [62]:
# Get accuracy score for testing data
knn.score(X_test,y_test)

1.0

In [63]:
# Selecting parameters with best accuracy
param_grid = [
    {
        'weights': ['uniform'], 
        'n_neighbors': [i for i in range(1, 7)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 7)], 
        'p': [i for i in range(1, 6)]
    }
]

knn_model_select = KNeighborsClassifier()
grid_search = GridSearchCV(knn_model_select, param_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}])

In [64]:
grid_search.best_estimator_

KNeighborsClassifier(n_neighbors=1)

In [65]:
grid_search.best_score_

1.0

In [66]:
# Testing different values of n_neighbors 
for i in range(1, 10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    score = knn.score(X_test, y_test)
    distances, indices = knn.kneighbors(X_test)
    print(score)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [68]:
decesion_tree = tree.DecisionTreeClassifier()
decesion_tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [70]:
y_predict = decesion_tree.predict(X_test)
score = accuracy_score(y_test, y_predict)
print(score)


0.9761904761904762


In [72]:
param_grid = [
    {
        'criterion': ['gini'], 
        'splitter': ['best'],
        'min_samples_split': [i for i in range(2, 5)]
    },
    {
        'criterion': ['entropy'],
        'splitter': ['best'],
        'min_samples_split': [i for i in range(2, 5)]
    }
]
decesion_tree = tree.DecisionTreeClassifier()
grid_search = GridSearchCV(decesion_tree, param_grid)
grid_search.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid=[{'criterion': ['gini'], 'min_samples_split': [2, 3, 4],
                          'splitter': ['best']},
                         {'criterion': ['entropy'],
                          'min_samples_split': [2, 3, 4],
                          'splitter': ['best']}])

In [73]:
grid_search.best_estimator_

DecisionTreeClassifier()

In [74]:
grid_search.best_score_

1.0