In [41]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import seaborn as sns
import matplotlib.pyplot as plt

In [42]:
# Load the dataset
df = pd.read_csv('drug200.csv')  
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [43]:
df.columns

Index(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K', 'Drug'], dtype='object')

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [45]:
df.describe()

Unnamed: 0,Age,Na_to_K
count,200.0,200.0
mean,44.315,16.084485
std,16.544315,7.223956
min,15.0,6.269
25%,31.0,10.4455
50%,45.0,13.9365
75%,58.0,19.38
max,74.0,38.247


In [46]:
df.dropna()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,DrugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,DrugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [47]:
print("Missing Values:")
print(df.isnull().sum())

Missing Values:
Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64


In [48]:
for i in ['Sex', 'BP', 'Cholesterol', 'Drug']:
    print(df[i].value_counts(), "\n")
    print("*"*200,"\n")

Sex
M    104
F     96
Name: count, dtype: int64 

******************************************************************************************************************************************************************************************************** 

BP
HIGH      77
LOW       64
NORMAL    59
Name: count, dtype: int64 

******************************************************************************************************************************************************************************************************** 

Cholesterol
HIGH      103
NORMAL     97
Name: count, dtype: int64 

******************************************************************************************************************************************************************************************************** 

Drug
DrugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: count, dtype: int64 

**************************************************************************************************************

In [49]:
# Encode categorical features
sex = LabelEncoder()
bp = LabelEncoder()
cholestrol = LabelEncoder()
drug = LabelEncoder()

df['Sex'] = sex.fit_transform(df['Sex'])
df['BP'] = bp.fit_transform(df['BP'])
df['Cholesterol'] = cholestrol.fit_transform(df['Cholesterol'])
df['Drug'] = drug.fit_transform(df['Drug'])

df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,0
1,47,1,1,0,13.093,3
2,47,1,1,0,10.114,3
3,28,0,2,0,7.798,4
4,61,0,1,0,18.043,0
...,...,...,...,...,...,...
195,56,0,1,0,11.567,3
196,16,1,1,0,12.006,3
197,52,1,2,0,9.894,4
198,23,1,2,1,14.020,4


In [50]:
X = df.drop("Drug", axis=1)
y = df["Drug"]

# Normalize features
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Convert back to DataFrame for readability
X_normalized_df = pd.DataFrame(X_normalized, columns=X.columns)

# Combine with target if needed
normalized_df = pd.concat([X_normalized_df, y.reset_index(drop=True)], axis=1)

print(normalized_df.head())

        Age  Sex   BP  Cholesterol   Na_to_K  Drug
0  0.135593  0.0  0.0          0.0  0.596848     0
1  0.542373  1.0  0.5          0.0  0.213397     3
2  0.542373  1.0  0.5          0.0  0.120239     3
3  0.220339  0.0  1.0          0.0  0.047814     4
4  0.779661  0.0  0.5          0.0  0.368191     0


In [51]:
# # Prepare features and target
X = normalized_df.iloc[:, :-1].values  # All columns except the last
y = normalized_df.iloc[:, -1].values   # Only the last column

In [52]:
X

array([[0.13559322, 0.        , 0.        , 0.        , 0.59684783],
       [0.54237288, 1.        , 0.5       , 0.        , 0.21339671],
       [0.54237288, 1.        , 0.5       , 0.        , 0.12023891],
       [0.22033898, 0.        , 1.        , 0.        , 0.04781412],
       [0.77966102, 0.        , 0.5       , 0.        , 0.36819063],
       [0.11864407, 0.        , 1.        , 0.        , 0.07311277],
       [0.57627119, 0.        , 1.        , 0.        , 0.31290262],
       [0.44067797, 1.        , 0.5       , 0.        , 0.14910251],
       [0.76271186, 1.        , 1.        , 0.        , 0.27837889],
       [0.47457627, 1.        , 0.5       , 1.        , 0.40962537],
       [0.54237288, 0.        , 0.5       , 0.        , 0.1719307 ],
       [0.3220339 , 0.        , 0.        , 1.        , 0.40434048],
       [0.47457627, 1.        , 0.5       , 0.        , 0.28478954],
       [1.        , 0.        , 0.5       , 0.        , 0.45884671],
       [0.59322034, 0.        , 1.

In [53]:
y

array([0, 3, 3, 4, 0, 4, 0, 3, 0, 0, 3, 0, 0, 0, 4, 0, 4, 1, 3, 0, 0, 0,
       0, 0, 0, 0, 0, 4, 0, 0, 4, 2, 4, 0, 4, 4, 1, 4, 4, 4, 0, 2, 0, 4,
       4, 4, 1, 3, 0, 0, 0, 4, 0, 0, 2, 3, 2, 0, 4, 0, 0, 1, 0, 4, 2, 0,
       1, 4, 0, 0, 2, 0, 4, 0, 0, 0, 1, 0, 1, 4, 2, 4, 3, 1, 3, 2, 4, 0,
       0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 1, 1, 3, 4, 0, 4, 4, 0, 2, 0,
       1, 4, 4, 4, 4, 0, 4, 4, 1, 0, 0, 0, 0, 0, 2, 0, 0, 4, 0, 4, 0, 0,
       4, 0, 0, 4, 2, 1, 2, 4, 1, 0, 2, 0, 1, 4, 4, 1, 4, 3, 1, 2, 4, 4,
       0, 3, 1, 0, 3, 4, 4, 2, 4, 0, 0, 0, 0, 4, 0, 1, 4, 4, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 4, 4, 0, 0, 0, 2, 1, 0, 0, 0, 1, 0, 3, 0, 3, 3, 4,
       4, 4])

Classic ML models

In [54]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Training the Decision Tree classifier

In [55]:
model1 = DecisionTreeClassifier(random_state=42)
model1.fit(X_train, y_train)

y_pred_dt = model1.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_classification_report = classification_report(y_test, y_pred_dt)
dt_confusion_matrix = confusion_matrix(y_test, y_pred_dt)

print("Accuracy:", dt_accuracy)
print("\nClassification Report:\n", dt_classification_report)
print("\nConfusion Matrix:\n",dt_confusion_matrix )


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00        11

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


Confusion Matrix:
 [[15  0  0  0  0]
 [ 0  6  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  5  0]
 [ 0  0  0  0 11]]


In [56]:
model2 = KNeighborsClassifier(n_neighbors=5)
model2.fit(X_train, y_train)

y_pred_knn = model2.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_classification_report = classification_report(y_test, y_pred_knn)
knn_confusion_matrix = confusion_matrix(y_test, y_pred_knn)

print("Accuracy:", knn_accuracy)
print("\nClassification Report:\n", knn_classification_report)
print("\nConfusion Matrix:\n",knn_confusion_matrix )



Accuracy: 0.85

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.73      0.79        15
           1       0.75      1.00      0.86         6
           2       1.00      1.00      1.00         3
           3       1.00      0.60      0.75         5
           4       0.85      1.00      0.92        11

    accuracy                           0.85        40
   macro avg       0.89      0.87      0.86        40
weighted avg       0.86      0.85      0.84        40


Confusion Matrix:
 [[11  2  0  0  2]
 [ 0  6  0  0  0]
 [ 0  0  3  0  0]
 [ 2  0  0  3  0]
 [ 0  0  0  0 11]]


In [57]:
model3 = RandomForestClassifier(n_estimators=100, random_state=42)
model3.fit(X_train, y_train)


y_pred_rf = model3.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_classification_report = classification_report(y_test, y_pred_rf)
rf_confusion_matrix = confusion_matrix(y_test, y_pred_rf)

print("Accuracy:", rf_accuracy)
print("\nClassification Report:\n", rf_classification_report)
print("\nConfusion Matrix:\n",rf_confusion_matrix )


Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         3
           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00        11

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40


Confusion Matrix:
 [[15  0  0  0  0]
 [ 0  6  0  0  0]
 [ 0  0  3  0  0]
 [ 0  0  0  5  0]
 [ 0  0  0  0 11]]


In [58]:
model4 = SVC(kernel='linear')
model4.fit(X_train, y_train)


y_pred_svm = model4.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_classification_report = classification_report(y_test, y_pred_svm)
svm_confusion_matrix = confusion_matrix(y_test, y_pred_svm)

print("Accuracy:", svm_accuracy)
print("\nClassification Report:\n", svm_classification_report)
print("\nConfusion Matrix:\n",svm_confusion_matrix )



Accuracy: 0.825

Classification Report:
               precision    recall  f1-score   support

           0       0.68      1.00      0.81        15
           1       1.00      0.67      0.80         6
           2       1.00      0.67      0.80         3
           3       1.00      0.20      0.33         5
           4       1.00      1.00      1.00        11

    accuracy                           0.82        40
   macro avg       0.94      0.71      0.75        40
weighted avg       0.88      0.82      0.80        40


Confusion Matrix:
 [[15  0  0  0  0]
 [ 2  4  0  0  0]
 [ 1  0  2  0  0]
 [ 4  0  0  1  0]
 [ 0  0  0  0 11]]


In [59]:
models = ["Decision Tree", "KNN", "Random Forest", "SVM"]
accuracies = [
    accuracy_score(y_test, y_pred_dt),
    accuracy_score(y_test, y_pred_knn),
    accuracy_score(y_test, y_pred_rf),
    accuracy_score(y_test, y_pred_svm)
]
precisions_weighted = [
    precision_score(y_test, y_pred_dt, average="weighted", zero_division=0),
    precision_score(y_test, y_pred_knn, average="weighted", zero_division=0),
    precision_score(y_test, y_pred_rf, average="weighted", zero_division=0),
    precision_score(y_test, y_pred_svm, average="weighted", zero_division=0)
]
recalls_weighted = [
    recall_score(y_test, y_pred_dt, average="weighted", zero_division=0),
    recall_score(y_test, y_pred_knn, average="weighted", zero_division=0),
    recall_score(y_test, y_pred_rf, average="weighted", zero_division=0),
    recall_score(y_test, y_pred_svm, average="weighted", zero_division=0)
]
f1s_weighted = [
    f1_score(y_test, y_pred_dt, average="weighted", zero_division=0),
    f1_score(y_test, y_pred_knn, average="weighted", zero_division=0),
    f1_score(y_test, y_pred_rf, average="weighted", zero_division=0),
    f1_score(y_test, y_pred_svm, average="weighted", zero_division=0)
]
precisions_macro = [
    precision_score(y_test, y_pred_dt, average="macro", zero_division=0),
    precision_score(y_test, y_pred_knn, average="macro", zero_division=0),
    precision_score(y_test, y_pred_rf, average="macro", zero_division=0),
    precision_score(y_test, y_pred_svm, average="macro", zero_division=0)
]
recalls_macro = [
    recall_score(y_test, y_pred_dt, average="macro", zero_division=0),
    recall_score(y_test, y_pred_knn, average="macro", zero_division=0),
    recall_score(y_test, y_pred_rf, average="macro", zero_division=0),
    recall_score(y_test, y_pred_svm, average="macro", zero_division=0)
]
f1s_macro = [
    f1_score(y_test, y_pred_dt, average="macro", zero_division=0),
    f1_score(y_test, y_pred_knn, average="macro", zero_division=0),
    f1_score(y_test, y_pred_rf, average="macro", zero_division=0),
    f1_score(y_test, y_pred_svm, average="macro", zero_division=0)
]


# Build the summary DataFrame
summary_df = pd.DataFrame({
    "Model": models,
    "Accuracy": accuracies,
    "Precision_weighted": precisions_weighted,
    "Recal_weighted": recalls_weighted,
    "F1 Score_weighted": f1s_weighted,
    "Precision_macro": precisions_weighted,
    "Recal_macro": recalls_weighted,
    "F1 Score_macro": f1s_weighted

})

# Round for better readability
summary_df = summary_df.round(3)
summary_df


Unnamed: 0,Model,Accuracy,Precision_weighted,Recal_weighted,F1 Score_weighted,Precision_macro,Recal_macro,F1 Score_macro
0,Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,KNN,0.85,0.862,0.85,0.844,0.862,0.85,0.844
2,Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,SVM,0.825,0.881,0.825,0.801,0.881,0.825,0.801
