In [None]:
#KNN classifier standard implementation using scikit-learn.
#KNN stands for K-Nearest Neighbors, a simple and effective machine learning algorithm used for classification and regression tasks. 
#KNN searches for K nearest data points in the training set to make predictions for new data points. Since it useds the entire training dataset during prediction, it is considered a lazy learning algorithm.
#There are many ways to calculate the distance between data points, with Euclidean distance being the most commonly used metric


#Data Set Description:
#DataSet consists of features such as Age, BP, cholestrol, Na_to_K and a target variable Drug which indicates the type of drug prescribed.
#The objective is to classify the type of drug based on the given features using KNN classifier.

#Model Implementation:
#1. The dataset consisted of categorical and numerical features. Categorical features were encoded using one-hot encoding.
#2. The data set was not balanced, so SMOTE (Synthetic Minority Over-sampling Technique) was applied to balance the classes.
#3. The dataset so obtainsed was split into training and testing sets using an 80-20 split.
#4. A KNN classifier was instantiated with n_neighbors set to 5.

#Model evaluation:
#1. The predicted model is then evaluated using accuracy score, confusion matrix, and classification report.


In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('C:/Users/sahil/Downloads/archive (2)/drug200.csv')

In [3]:
data.head(n = 4)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX


In [4]:
data.describe()

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


In [60]:
Object_data = data.select_dtypes(include = 'object')
for i in Object_data.columns:
    print(Object_data[i].value_counts())

Sex
M    104
F     96
Name: count, dtype: int64
BP
HIGH      77
LOW       64
NORMAL    59
Name: count, dtype: int64
Cholesterol
HIGH      103
NORMAL     97
Name: count, dtype: int64
Drug
DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: count, dtype: int64


In [124]:
#Prepare for Models for Comparison

#Create x and y variables
x = data.drop('Drug', axis=1).to_numpy()
Y = data['Drug'].to_numpy()

#Load Library for Training
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,Y,test_size = 0.2,stratify=Y,random_state = 100)

In [125]:
x_train_df = pd.DataFrame(x_train)
x_train_df.columns = ['Age', 'Sex', "BP", "Cholestrol", "Na_to_K"]
x_train_df["Na_to_K"] = x_train_df["Na_to_K"].astype(float)
x_train_df["Age"] = x_train_df["Age"].astype(float)

In [133]:
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
float_cols = x_train_df.select_dtypes(exclude='object').columns
x_train_df[float_cols] = scaler.fit_transform(x_train_df[float_cols])
x_train_df[float_cols] = scaler.transform(x_train_df[float_cols])


In [134]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop='first')
data_object = x_train_df.select_dtypes(include = 'object')
ohe.fit(data_object)
codes = ohe.transform(data_object).toarray()
feature_names = ohe.get_feature_names_out(data_object.columns)


data_ohe = pd.concat([x_train_df.select_dtypes(exclude='object'), 
               pd.DataFrame(codes,columns=feature_names).astype(int)], axis=1)

data_ohe.head()

Unnamed: 0,Age,Na_to_K,Sex_M,BP_LOW,BP_NORMAL,Cholestrol_NORMAL
0,-2.596819,-2.349786,1,1,0,0
1,-2.600376,-2.240916,0,0,1,0
2,-2.724864,-2.349767,0,0,0,1
3,-2.589705,-2.27832,1,0,0,0
4,-2.735535,-1.977313,0,0,0,0


In [139]:
x_train = data_ohe.to_numpy()

In [140]:
print('Class Split')


finalData = pd.DataFrame(y_train)
print(finalData[0].value_counts())

Class Split
0
DrugY    73
drugX    43
drugA    18
drugC    13
drugB    13
Name: count, dtype: int64


In [141]:
#Fix the imbalanced Classes
from imblearn.over_sampling import SMOTE
smt=SMOTE(random_state=100)
x_train_mod,y_train_mod = smt.fit_resample(x_train,y_train)

In [142]:
print('Class Split')


finalData = pd.DataFrame(y_train_mod)
print(finalData[0].value_counts())

Class Split
0
drugC    73
DrugY    73
drugA    73
drugB    73
drugX    73
Name: count, dtype: int64


In [158]:
from sklearn.neighbors import KNeighborsClassifier

# Select the Number of Neighbors ('k')
k = 3
# Choose a Distance Metric
distance_metric = 'euclidean'

# Initialize the k-NN Classifier
knn_clf = KNeighborsClassifier(n_neighbors=k, metric=distance_metric)

# "Train" the kNN (although no real training happens)
knn_clf.fit(x_train_mod, y_train_mod)

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'euclidean'
,metric_params,
,n_jobs,


In [159]:
x_test_df = pd.DataFrame(x_test)
x_test_df.columns = ['Age', 'Sex', "BP", "Cholestrol", "Na_to_K"]
x_test_df["Na_to_K"] = x_test_df["Na_to_K"].astype(float)
x_test_df["Age"] = x_test_df["Age"].astype(float)

float_cols = x_test_df.select_dtypes(exclude='object').columns
x_test_df[float_cols] = scaler.transform(x_test_df[float_cols])

data_object = x_test_df.select_dtypes(include = 'object')
codes = ohe.transform(data_object).toarray()
feature_names = ohe.get_feature_names_out(data_object.columns)


data_ohe_test = pd.concat([x_test_df.select_dtypes(exclude='object'), 
               pd.DataFrame(codes,columns=feature_names).astype(int)], axis=1)

data_ohe_test.head()
x_test = data_ohe_test.to_numpy()

ValueError: Length mismatch: Expected axis has 6 elements, new values have 5 elements

In [154]:
y_pred = knn_clf.predict(x_test)

In [155]:
#Base Logistical Regression Model
from sklearn.metrics import classification_report, confusion_matrix  

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  

[[18  0  0  0  0]
 [ 4  0  1  0  0]
 [ 2  0  1  0  0]
 [ 3  0  0  0  0]
 [11  0  0  0  0]]
              precision    recall  f1-score   support

       DrugY       0.47      1.00      0.64        18
       drugA       0.00      0.00      0.00         5
       drugB       0.50      0.33      0.40         3
       drugC       0.00      0.00      0.00         3
       drugX       0.00      0.00      0.00        11

    accuracy                           0.47        40
   macro avg       0.19      0.27      0.21        40
weighted avg       0.25      0.47      0.32        40



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
