In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

In [None]:
#Importing all required models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Upload the dataset through the Google Colab file-upload widget.
# NOTE(review): `google.colab` is Colab-specific, so this cell is expected to
# fail in other Jupyter environments — confirm the intended runtime.
# Presumably the uploaded file is 'preprocessed_data.csv', which the next cell
# reads from the working directory — verify.
from google.colab import files
uploaded = files.upload()

In [None]:
# Load the dataset from the working directory into a DataFrame.
df = pd.read_csv('preprocessed_data.csv')
# Show the (rows, columns) tuple as the cell output.
df.shape

In [None]:
# Preview the first rows of the loaded data as the cell output.
df.head()

In [None]:
# Separate predictors from the target: 'Drug' is the label column and every
# other column is treated as a feature.
feature_cols = df.columns[df.columns != 'Drug'].tolist()
X = df.loc[:, feature_cols]
y = df.loc[:, 'Drug']

In [None]:
# ===================================================
# SECTION 3: TRAIN-TEST SPLIT & SCALING
# ===================================================
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardise the features (the scale-sensitive models below — logistic
# regression, RBF SVM, KNN — benefit from this). The scaler is fitted on the
# training partition only so no test-set statistics leak into training; the
# test partition is transformed with the training mean/std.
# The arrays are wrapped back into DataFrames so that column names and row
# indices stay available to later cells.
# Assumes every feature column is numeric, which the estimators below already
# require.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)


In [None]:
# ===================================================
# SECTION 4: CLASSIFICATION MODELS
# ===================================================
# Banner announcing the model-training section (one line per argument).
print(
    "\n" + "=" * 80,
    "SECTION 4: TRAINING CLASSIFICATION MODELS",
    "=" * 80,
    "Target: Predict categorical class labels",
    "-" * 80,
    sep="\n",
)

# Collects one metrics dictionary per model, keyed by the model's display name.
classification_results = dict()

In [None]:
# 1. LOGISTIC REGRESSION
print("\n1. Logistic Regression", "-" * 40, sep="\n")

# Allow the solver up to 1000 iterations, then fit and predict in one chain
# (fit() returns the estimator itself).
lr = LogisticRegression(max_iter=1000)
y_pred_lr = lr.fit(X_train, y_train).predict(X_test)

# Accuracy first, followed by the support-weighted precision / recall / F1.
lr_scores = {'Accuracy': accuracy_score(y_test, y_pred_lr)}
for metric_name, metric_fn in (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    lr_scores[metric_name] = metric_fn(y_test, y_pred_lr, average='weighted')
classification_results['Logistic Regression'] = lr_scores

# Per-class breakdown on the held-out set.
print(classification_report(y_test, y_pred_lr))

In [None]:
# 2. DECISION TREE CLASSIFIER
print("\n2. Decision Tree Classifier")
print("-" * 40)

# Depth-capped tree with a fixed seed.
dt = DecisionTreeClassifier(max_depth=10, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# Accuracy plus the three support-weighted metrics, in display order.
classification_results['Decision Tree'] = {
    'Accuracy': accuracy_score(y_test, y_pred_dt),
    **{
        label: scorer(y_test, y_pred_dt, average='weighted')
        for label, scorer in (
            ('Precision', precision_score),
            ('Recall', recall_score),
            ('F1 Score', f1_score),
        )
    },
}

# Per-class breakdown on the held-out set.
print(classification_report(y_test, y_pred_dt))

In [None]:
# 3. RANDOM FOREST CLASSIFIER
print("\n3. Random Forest Classifier", "-" * 40, sep="\n")

# 100 depth-capped trees, seeded for repeatable results.
rf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Start with accuracy, then append the support-weighted metrics in order.
rf_scores = {'Accuracy': accuracy_score(y_test, y_pred_rf)}
rf_scores.update(
    (name, fn(y_test, y_pred_rf, average='weighted'))
    for name, fn in zip(
        ('Precision', 'Recall', 'F1 Score'),
        (precision_score, recall_score, f1_score),
    )
)
classification_results['Random Forest'] = rf_scores

# Per-class breakdown on the held-out set.
print(classification_report(y_test, y_pred_rf))

In [None]:
# 4. SUPPORT VECTOR MACHINE (SVM)
print("\n4. Support Vector Machine (SVM)")
print("-"*40)
# RBF-kernel SVM. probability=True enables predict_proba via an internal
# cross-validated calibration that shuffles the data, so a fixed random_state
# is supplied to keep the fitted model reproducible (same seed as the tree and
# forest cells).
svm = SVC(kernel='rbf', probability=True, random_state=42)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Accuracy plus support-weighted precision / recall / F1 on the held-out set.
classification_results['SVM'] = {
    'Accuracy': accuracy_score(y_test, y_pred_svm),
    'Precision': precision_score(y_test, y_pred_svm, average='weighted'),
    'Recall': recall_score(y_test, y_pred_svm, average='weighted'),
    'F1 Score': f1_score(y_test, y_pred_svm, average='weighted')
}

# Per-class breakdown on the held-out set.
print(classification_report(y_test, y_pred_svm))

In [None]:
# 5. K-NEAREST NEIGHBORS (KNN)
print("\n5. K-Nearest Neighbors")
print("-" * 40)

# Vote among the 5 nearest training points.
knn = KNeighborsClassifier(n_neighbors=5)
y_pred_knn = knn.fit(X_train, y_train).predict(X_test)

# Compute each metric into a named intermediate, then assemble the record.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_precision = precision_score(y_test, y_pred_knn, average='weighted')
knn_recall = recall_score(y_test, y_pred_knn, average='weighted')
knn_f1 = f1_score(y_test, y_pred_knn, average='weighted')

classification_results['KNN'] = {
    'Accuracy': knn_accuracy,
    'Precision': knn_precision,
    'Recall': knn_recall,
    'F1 Score': knn_f1,
}

# Per-class breakdown on the held-out set.
print(classification_report(y_test, y_pred_knn))

In [None]:
# 6. NAIVE BAYES
print("\n6. Naive Bayes Classifier", "-" * 40, sep="\n")

# fit() returns the estimator, so `nb` is the fitted model.
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Pair the metric names with their values (weighted by class support where
# an average is required).
classification_results['Naive Bayes'] = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
        (
            accuracy_score(y_test, y_pred_nb),
            precision_score(y_test, y_pred_nb, average='weighted'),
            recall_score(y_test, y_pred_nb, average='weighted'),
            f1_score(y_test, y_pred_nb, average='weighted'),
        ),
    )
)

# Per-class breakdown on the held-out set.
print(classification_report(y_test, y_pred_nb))

In [None]:
# ===================================================
# SECTION 6: MODEL COMPARISON
# ===================================================
print("\n" + "="*80)
print("SECTION 6: MODEL COMPARISON & RESULTS")
print("="*80)

print("\nCLASSIFICATION MODELS COMPARISON")
print("-"*80)
# One row per model, one column per metric, rounded to 4 decimals for display.
class_df = pd.DataFrame(classification_results).T
class_df = class_df.round(4)
print(class_df.to_string())

# Heading previously contained a mis-encoded emoji; restored to the trophy glyph.
print("\n🏆 BEST CLASSIFICATION MODEL:")
# idxmax() names the first model with the highest accuracy; the printed table
# lists every model tied at that (rounded) accuracy.
best_accuracy = class_df['Accuracy'].max()
best_clf_model = class_df['Accuracy'].idxmax()
print(class_df[class_df['Accuracy'] == best_accuracy])


In [None]:
# ===================================================
# SECTION 7: VISUALIZATIONS
# ===================================================
print("\n" + "="*80)
print("SECTION 7: GENERATING COMPARISON VISUALIZATIONS")
print("="*80)

# 2x2 grid: one horizontal bar chart per metric, one bar per model.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Classification Models Performance Comparison', fontsize=16, fontweight='bold')

# (metric column in class_df, bar colour), listed in the row-major order in
# which axes.flat walks the grid.
metric_panels = [
    ('Accuracy', 'steelblue'),
    ('Precision', 'coral'),
    ('Recall', 'mediumseagreen'),
    ('F1 Score', 'mediumpurple'),
]

for ax, (metric, bar_color) in zip(axes.flat, metric_panels):
    ax.barh(class_df.index, class_df[metric], color=bar_color)
    ax.set_xlabel(metric)
    ax.set_title(f'{metric} Comparison')
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

# Status message previously contained a mis-encoded check mark; restored.
print("✓ Visualizations generated successfully!")


In [None]:
# ===================================================
# Confusion Matrices for All Classification Models
# ===================================================

from sklearn.metrics import confusion_matrix

# Test-set predictions from the model cells, keyed by the same display names
# used in classification_results (needed for the accuracy lookup below).
classification_predictions = {
    'Logistic Regression': y_pred_lr,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'SVM': y_pred_svm,
    'KNN': y_pred_knn,
    'Naive Bayes': y_pred_nb
}

# One panel per model on a 2x3 grid, flattened so panels can be zipped with models.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

# Label set = every class seen in the ground truth OR in any model's
# predictions. Using y_test alone would silently drop predictions of a class
# that happens to be absent from the test partition.
class_labels = sorted(
    set(y_test.unique()).union(
        *(set(np.unique(pred)) for pred in classification_predictions.values())
    )
)

for ax, (model_name, y_pred) in zip(axes, classification_predictions.items()):
    # Rows = actual class, columns = predicted class.
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Row-normalise to "percentage of the actual class". Rows with no actual
    # samples (possible for prediction-only labels) stay at 0% instead of
    # producing NaN from a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_percent = np.divide(
        cm.astype('float'),
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals > 0,
    ) * 100

    # Cell text: raw count on the first line, row percentage on the second.
    annotations = [
        [f'{count}\n({percent:.1f}%)' for count, percent in zip(count_row, percent_row)]
        for count_row, percent_row in zip(cm, cm_percent)
    ]

    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cm, annot=annotations, fmt='', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels,
                ax=ax, cbar_kws={'label': 'Count'},
                linewidths=2, linecolor='white')

    # Title carries the model's test accuracy computed in its own cell.
    accuracy = classification_results[model_name]['Accuracy']
    ax.set_title(f'{model_name}\nAccuracy: {accuracy:.4f}',
                 fontsize=12, fontweight='bold')
    ax.set_ylabel('Actual', fontsize=10, fontweight='bold')
    ax.set_xlabel('Predicted', fontsize=10, fontweight='bold')

    # Outline the diagonal cells (correct predictions) in green.
    for i in range(len(class_labels)):
        ax.add_patch(plt.Rectangle((i, i), 1, 1, fill=False,
                                   edgecolor='green', lw=3))

# The title previously began with a mis-encoded emoji. It is omitted here
# because matplotlib's default fonts have no glyph for it.
fig.suptitle('CONFUSION MATRICES: All Classification Models',
             fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()

# Status message previously contained a mis-encoded check mark; restored.
print("✅ Confusion Matrices for All Models Generated!")

# Print interpretation guide for the confusion-matrix figure.
# The bullet, arrow and emoji characters were mis-encoded in the original
# strings and are restored here.
print("\n" + "="*80)
print("📊 HOW TO READ CONFUSION MATRICES:")
print("="*80)
print("• Diagonal (green boxes) = CORRECT predictions")
print("• Off-diagonal = INCORRECT predictions")
print("• Darker colors = More predictions")
print("• Format: Count (Percentage of actual class)")
print("\nExample interpretation:")
print("  If cell [Drug A, Drug B] = 5 (10%)")
print("  → 5 instances of actual Drug A were incorrectly predicted as Drug B")
print("  → This represents 10% of all actual Drug A cases")
print("="*80)

