In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [46]:
# Load the drug dataset from the working directory and preview the first rows.
DATA_FILE = "drug200.csv"
df = pd.read_csv(DATA_FILE)
df.head(n=5)


Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY


In [47]:
# Show the column labels of the loaded frame.
df.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [48]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [None]:
# Summary statistics of the frame, computed with pandas' default arguments.
df.describe()

In [None]:
# Correlation matrix restricted to the numeric columns; the string-valued
# columns are skipped by numeric_only=True.
corr_metrics = df.corr(numeric_only=True)
# Render the matrix as a heatmap with every cell annotated by its coefficient.
sns.heatmap(data=corr_metrics, annot=True)

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Target: the prescribed drug. Features: every remaining column.
y = df['Drug']
X = df.drop(columns=['Drug'])

Our target column holds categorical data, so before splitting we perform label encoding: the target has only a few categories, and label encoding avoids creating a separate column for each category.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the label -> integer mapping from the target column, then apply it.
# The fitted encoder is kept in `le` so integer codes can be mapped back to
# the original labels with le.inverse_transform.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

Doing it before the split ensures both y_train and y_test share the same encoding map.

Train Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Split the features together with the label-encoded target. The encoding was
# fitted on the full target column before this point, so y_train and y_test
# share one label -> integer mapping. Use le.inverse_transform to turn
# predicted codes back into the original drug labels.
# random_state is fixed so the 80/20 split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)
# Sanity-check the resulting shapes.
X_test.shape, X_train.shape, y_test.shape, y_train.shape

In [None]:
# Re-display the first three rows as a reference for the column names.
df.head(3)

In [None]:
# Numeric features.
num_col = [
    'Age',
    'Na_to_K',
]
# Categorical (string-valued) features.
cat_col = [
    'Sex',
    'BP',
    'Cholesterol',
]


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
# Preprocessing: standardize the numeric features and one-hot encode the
# categorical ones. handle_unknown='ignore' keeps predict from raising when a
# category appears that was not present during fit.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_col', StandardScaler(), num_col),
        ('cat_col', OneHotEncoder(handle_unknown='ignore'), cat_col),
    ]
)

# Chain preprocessing and the classifier so that the scaler and encoder are
# fitted on exactly the rows passed to model.fit.
model = Pipeline(
    steps=[
        ('Preprocessor', preprocessor),
        ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ]
)

# Data-leakage note: fitting on the whole dataset, i.e.
#     model.fit(df[num_col + cat_col], y)
# lets the test rows influence training (accuracy was 0.975; have a look at
# Logistic_Regression). Fit on the training split only to avoid the leakage
# (accuracy was 0.825).
model.fit(X_train, y_train)

Prediction

In [None]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)
# Full class-probability matrix: one row per test sample and one column per
# class, in the order given by model.classes_. The target is multiclass, so
# slicing a single column (as one would for a binary problem) would keep only
# the probability of one arbitrary class.
y_proba = model.predict_proba(X_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, RocCurveDisplay
# Overall fraction of correct predictions on the held-out test set.
print("Accuracy", accuracy_score(y_test,y_pred))

# The target is multiclass, so precision/recall/F1 need an explicit averaging
# strategy: the default average='binary' raises ValueError for such targets.
# Macro averaging takes the unweighted mean of the per-class scores.
print("Precision:", precision_score(y_test, y_pred, average="macro"))
print("Recall   :", recall_score(y_test, y_pred, average="macro"))
print("F1       :", f1_score(y_test, y_pred, average="macro"))
# ROC AUC is not reported here: for a multiclass target roc_auc_score needs
# the full predict_proba matrix together with multi_class="ovr" or "ovo".

# Per-class precision, recall, F1 and support.
print("\nClassification report:\n", classification_report(y_test, y_pred))

Prediction for a single sample

In [None]:
# Take one row of the feature frame; the double brackets keep it a one-row
# DataFrame, which is the 2-D input the pipeline expects. Using X rather than
# df avoids handing the target column to the model.
new_app = X.iloc[[3]]
pred = model.predict(new_app)
# Locate the predicted class among model.classes_ so that the reported
# probability belongs to the class that was actually predicted, instead of a
# fixed column index carried over from a binary task.
class_idx = list(model.classes_).index(pred[0])
pred_prob = model.predict_proba(new_app)[:, class_idx]
print(f"Probability of predicted class: {pred_prob[0]:.3f}")
print("Predicted class:", pred[0])

In [None]:
import pickle
# Serialize the fitted pipeline (preprocessing + classifier) to disk.
# Only unpickle this file from a trusted source: pickle can execute code on load.
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)