# **Drug Classification**

## Importing modules

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [34]:
df = pd.read_csv('/content/drug200.csv')

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [53]:
df['Drug'].value_counts()

Unnamed: 0_level_0,count
Drug,Unnamed: 1_level_1
DrugY,91
drugX,54
drugA,23
drugC,16
drugB,16


In [37]:
from sklearn.preprocessing import LabelEncoder
columns_to_be_encoded = ["Sex", "BP", "Cholesterol", "Drug"]
df_encoded = df.copy()
le = LabelEncoder()

for column in columns_to_be_encoded:
    df_encoded[column] = le.fit_transform(df_encoded[column])

display(df_encoded.sample(5))

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
148,61,0,1,1,7.34,4
121,15,1,0,1,17.206,0
55,26,0,1,0,14.16,3
160,30,0,2,0,10.443,4
184,18,0,0,0,37.188,0


In [94]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = df_encoded.drop('Drug', axis=1)
y = df_encoded['Drug']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45, stratify=y)

##KNN

In [95]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [96]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [97]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [98]:
y_pred = knn.predict(X_test)

#Evaluating KNN
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.6750

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.83      0.86        18
           1       0.67      0.40      0.50         5
           2       0.25      0.67      0.36         3
           3       1.00      0.33      0.50         3
           4       0.64      0.64      0.64        11

    accuracy                           0.68        40
   macro avg       0.69      0.57      0.57        40
weighted avg       0.75      0.68      0.69        40


Confusion Matrix:
 [[15  0  2  0  1]
 [ 1  2  0  0  2]
 [ 0  0  2  0  1]
 [ 0  1  1  1  0]
 [ 1  0  3  0  7]]


### Hence KNN is a bad model for this dataset because there categorical features which by doing label encoding have no significance

##    **Decision** **Tree**

In [111]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()

In [112]:
tree.fit(X_train, y_train)

In [113]:
y_pred = tree.predict(X_test)

In [114]:
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0000

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00        11

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


Confusion Matrix:
 [[18  0  0  0  0]
 [ 0  5  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  3  0]
 [ 0  0  0  0 11]]


## **Random Forest**

In [115]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=20)

In [116]:
forest.fit(X_train, y_train)

In [117]:
y_pred = forest.predict(X_test)

In [118]:
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 1.0000

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00        11

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


Confusion Matrix:
 [[18  0  0  0  0]
 [ 0  5  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  3  0]
 [ 0  0  0  0 11]]
