<div style="color:black;
            display:fill;
            border-radius:10px;
            background-color:#ff8c00; /* Dark Orange */
            font-size:140%;
            font-family:Verdana;
            letter-spacing:1px;
            padding: 20px;
            text-align:center;
            font-weight: bold;">
   Drug Classifier - Model Benchmarking
</div>


### **Importing Sufficient Libraries**

In [37]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.utils import resample
from sklearn.linear_model import Ridge
from collections import Counter
import xgboost as xgb
import itertools
import seaborn as sns
import matplotlib.pyplot as plt

### **Exploratory Data Analysis (EDA)**

In [38]:
df = pd.read_csv(r"C:\Users\user\Downloads\drug200.csv")

label_encoder = LabelEncoder()

df['Sex'] = label_encoder.fit_transform(df['Sex'])
df['BP'] = label_encoder.fit_transform(df['BP'])
df['Cholesterol'] = label_encoder.fit_transform(df['Cholesterol'])

df['Drug'] = label_encoder.fit_transform(df['Drug'])

print("Unique values in 'Drug':", df['Drug'].unique())

Unique values in 'Drug': [4 2 3 0 1]


In [39]:
df.head(10)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,4
1,47,1,1,0,13.093,2
2,47,1,1,0,10.114,2
3,28,0,2,0,7.798,3
4,61,0,1,0,18.043,4
5,22,0,2,0,8.607,3
6,49,0,2,0,16.275,4
7,41,1,1,0,11.037,2
8,60,1,2,0,15.171,4
9,43,1,1,1,19.368,4


### **Model Benchmarking**

### Logistic Regression

In [40]:
X = df.drop('Drug', axis=1)

Y = df['Drug']

In [61]:
class Logistic_Regression():
    def __init__(self, learning_rate=0.01, no_of_iterations=1000, C=1.0):
        self.learning_rate = learning_rate
        self.no_of_iterations = no_of_iterations
        self.C = C

    def fit(self, X, Y):
        self.m, self.n = X.shape
        self.X = X
        self.Y = Y
        self.w = np.zeros(self.n)
        self.b = 0
        
        for i in range(self.no_of_iterations):
            self.update_weight()

    def update_weight(self):
        z = np.dot(self.X, self.w) + self.b
        Y_hat = 1 / (1 + np.exp(-z))

        dw = (1 / self.m) * np.dot(self.X.T, (Y_hat - self.Y)) + (self.C / self.m) * self.w
        db = (1 / self.m) * np.sum(Y_hat - self.Y)

        self.w = self.w - self.learning_rate * dw
        self.b = self.b - self.learning_rate * db

    def predict(self, X):
        z = np.dot(X, self.w) + self.b
        Y_prediction = 1 / (1 + np.exp(-z))
        Y_prediction = np.where(Y_prediction > 0.5, 1, 0)

        return Y_prediction

In [62]:
scaler = StandardScaler()

scaler.fit(X)

standardized_data = scaler.transform(X)

In [63]:
X = df[['Sex', 'BP', 'Cholesterol', 'Na_to_K']]
Y = df['Drug']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 123)

In [65]:
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'no_of_iterations': [500, 1000, 2000],
    'C': [0.1, 1.0, 10.0]
}

param_combinations = list(itertools.product(
    param_grid['learning_rate'],
    param_grid['no_of_iterations'],
    param_grid['C']
))

best_accuracy = 0
best_params = None

for learning_rate, no_of_iterations, C in param_combinations:
    model = Logistic_Regression(learning_rate=learning_rate, no_of_iterations=no_of_iterations, C=C)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    LR_accuracy = accuracy_score(Y_test, Y_pred)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {'learning_rate': learning_rate, 'no_of_iterations': no_of_iterations, 'C': C}

print("Best parameters for Logistic Regression:", best_params)
print("Best accuracy score for Logistic Regression:", best_accuracy)

Best parameters for Logistic Regression: {'learning_rate': 0.001, 'no_of_iterations': 500, 'C': 0.1}
Best accuracy score for Logistic Regression: 0.1


### Decision Tree

In [9]:
class DecisionNode():
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if (self.max_depth is not None and depth >= self.max_depth or 
            n_labels == 1 or 
            n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        best_feature, best_threshold = self._best_split(X, y, n_features)

        left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
        left_child = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
        right_child = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)
        return DecisionNode(best_feature, best_threshold, left_child, right_child)

    def _best_split(self, X, y, n_features):
        best_gini = 1.0
        best_feature, best_threshold = None, None

        for feature_index in range(n_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                gini = self._gini_index(X[:, feature_index], y, threshold)
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_index
                    best_threshold = threshold

        return best_feature, best_threshold

    def _gini_index(self, feature_column, y, threshold):
        left_indices = feature_column <= threshold
        right_indices = feature_column > threshold
        left_gini = self._gini(y[left_indices])
        right_gini = self._gini(y[right_indices])
        gini = (len(y[left_indices]) * left_gini + len(y[right_indices]) * right_gini) / len(y)
        return gini

    def _gini(self, y):
        proportions = np.bincount(y) / len(y)
        return 1 - np.sum(proportions ** 2)

    def _split(self, feature_column, threshold):
        left_indices = np.where(feature_column <= threshold)[0]
        right_indices = np.where(feature_column > threshold)[0]
        return left_indices, right_indices

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

In [10]:
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)

print("Best parameters for Decision Tree:", grid_search.best_params_)
print("Best score for Decision Tree:", grid_search.best_score_)

best_dt_model = grid_search.best_estimator_

Y_pred = best_dt_model.predict(X_test)
DT_accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy score of Decision Tree is: {DT_accuracy}')

Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10}
Best score for Decision Tree: 0.9071428571428571
Accuracy score of Decision Tree is: 0.9166666666666666


### Random Forest

In [11]:
class RandomForest:
    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, max_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_features = max_features
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        X = X.to_numpy() 
        y = y.to_numpy() 
        for _ in range(self.n_trees):
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        indices = resample(np.arange(n_samples), replace=True, n_samples=n_samples)
        return X[indices], y[indices]

    def _most_common_label(self, y):
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        X = X.to_numpy()  
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        tree_predictions = np.swapaxes(tree_predictions, 0, 1)
        return np.array([self._most_common_label(tree_preds) for tree_preds in tree_predictions])

    def get_params(self, deep=True):
        return {
            'n_trees': self.n_trees,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'max_features': self.max_features
        }

    def set_params(self, **params):
        for key, value in params.items():
            setattr(self, key, value)
        return self

In [26]:
param_grid = {
    'n_trees': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}


grid_search = GridSearchCV(estimator=RandomForest(), param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, Y_train)

print("Best parameters for Random Forest:", grid_search.best_params_) 
print("Best score for Random Forest:", grid_search.best_score_)
best_rf = grid_search.best_estimator_

Y_pred = best_rf.predict(X_test)

RF_accuracy = accuracy_score(Y_test, Y_pred)

Best parameters for Random Forest: {'max_depth': 10, 'min_samples_split': 5, 'n_trees': 100}
Best score for Random Forest: 0.9


### XGBoost

In [19]:
min_label = np.min(Y_train)
if min_label < 0:
    Y_train = Y_train - min_label
    Y_test = Y_test - min_label
    
num_classes = len(np.unique(Y_train))

dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test, label=Y_test)

param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0]
}

grid_search = GridSearchCV(xgb.XGBClassifier(objective='multi:softmax', num_class=num_classes),
                           param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, Y_train)

print("Best parameters for XGBoost:", grid_search.best_params_)
print("Best score for XGBoost:", grid_search.best_score_)

best_xgb_model = grid_search.best_estimator_

Y_pred = best_xgb_model.predict(X_test)
xgb_accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy score of XGBoost model is: {xgb_accuracy}')

Best parameters for XGBoost: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}
Best score for XGBoost: 0.9071428571428571
Accuracy score of XGBoost model is: 0.9166666666666666


### Model Evaluation Results

In [27]:
results = [
    {'Model': 'Logistic Regression', 'Accuracy': LR_accuracy},
    {'Model': 'Decision Tree', 'Accuracy': DT_accuracy},
    {'Model': 'Random Forest', 'Accuracy': RF_accuracy},
    {'Model': 'XGBoost', 'Accuracy': xgb_accuracy}
]
df_results = pd.DataFrame(results)

print(df_results)

                 Model  Accuracy
0  Logistic Regression  0.100000
1        Decision Tree  0.916667
2        Random Forest  0.883333
3              XGBoost  0.916667
