In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [3]:
df = pd.read_csv('drug200.csv')
df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [85]:
df = pd.DataFrame(df)

# Initialize LabelEncoder
le = LabelEncoder()

# Apply LabelEncoder to categorical columns
df['BP'] = le.fit_transform(df['BP'])
df['Cholesterol'] = le.fit_transform(df['Cholesterol'])
df['Drug'] = le.fit_transform(df['Drug'])

# One-hot encode categorical columns
df['Sex'] = df['Sex'].apply(lambda x: 1 if x == 'M' else 0)
print(df)

     Age  Sex  BP  Cholesterol  Na_to_K  Drug
0     23    0   0            0   25.355     4
1     47    1   1            0   13.093     2
2     47    1   1            0   10.114     2
3     28    0   2            0    7.798     3
4     61    0   1            0   18.043     4
..   ...  ...  ..          ...      ...   ...
195   56    0   1            0   11.567     2
196   16    1   1            0   12.006     2
197   52    1   2            0    9.894     3
198   23    1   2            1   14.020     3
199   40    0   1            1   11.349     3

[200 rows x 6 columns]


In [95]:
from sklearn.preprocessing import StandardScaler
# Initialize StandardScaler
scaler = StandardScaler()

# Select the numerical columns to standardize
numerical_columns = ['Age', 'Na_to_K']

# Apply standardization
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

print(df)

          Age  Sex  BP  Cholesterol   Na_to_K  Drug
0   -1.291591    0   0            0  1.286522     4
1    0.162699    1   1            0 -0.415145     2
2    0.162699    1   1            0 -0.828558     2
3   -0.988614    0   2            0 -1.149963     3
4    1.011034    0   1            0  0.271794     4
..        ...  ...  ..          ...       ...   ...
195  0.708057    0   1            0 -0.626917     2
196 -1.715759    1   1            0 -0.565995     2
197  0.465676    1   2            0 -0.859089     3
198 -1.291591    1   2            1 -0.286500     3
199 -0.261469    0   1            1 -0.657170     3

[200 rows x 6 columns]


In [97]:
from sklearn.model_selection import train_test_split

X = df.drop('Drug', axis=1)  # Independent variables
y = df['Drug']               # Target variable

# Split the data (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set:")
print(X_train.shape, y_train.shape)

print("Test set:")
print(X_test.shape, y_test.shape)

Training set:
(160, 5) (160,)
Test set:
(40, 5) (40,)


In [99]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score


knn = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier(random_state=42)
svm = SVC(random_state=42)
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)

# Train and evaluate each model
models = {
    'KNN': knn,
    'Decision Tree': decision_tree,
    'SVM': svm,
    'Logistic Regression': logistic_regression
}

for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Print evaluation metrics
    print(f"Model: {name}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print("\n" + "="*40 + "\n")

Model: KNN
Accuracy: 0.9
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       0.60      1.00      0.75         3
           2       1.00      0.60      0.75         5
           3       0.83      0.91      0.87        11
           4       1.00      0.93      0.97        15

    accuracy                           0.90        40
   macro avg       0.89      0.89      0.87        40
weighted avg       0.92      0.90      0.90        40



Model: Decision Tree
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         5
           3       1.00      1.00      1.00        11
           4       1.00      1.00      1.00        15

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg   