In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Import Dataset

In [None]:
# Every input file sits in the same Kaggle dataset folder, so name it once
# and build the individual paths from it.
DATA_DIR = "/kaggle/input/hit-prediction-processed-data/Hit Prediction"

# Feature tables are read from CSV into DataFrames.
X_train = pd.read_csv(f"{DATA_DIR}/X_train_selected.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test_selected.csv")

# Label arrays are read from NumPy .npy files.
y_train = np.load(f"{DATA_DIR}/y_train.npy")
y_test = np.load(f"{DATA_DIR}/y_test.npy")

## Fitting Best Estimators (Logistic, SVM, DT, RF, KNN, Naive Bayes)

In [None]:
# Fit each classifier on the training data with fixed hyper-parameters.
# Every estimator that has a stochastic component is seeded so that a
# Restart & Run All reproduces the same models and the same ROC curves.
lr = LogisticRegression(C=0.1, max_iter=10000, random_state=42)
lr.fit(X_train, y_train)

knn = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=19, p=1, weights='distance')
knn.fit(X_train, y_train)

nb = GaussianNB(priors=None, var_smoothing=1e-09)
nb.fit(X_train, y_train)

dt = DecisionTreeClassifier(random_state=42, ccp_alpha=0.001)
dt.fit(X_train, y_train)

# random_state pins the bootstrap samples and per-split feature sub-sampling;
# without it the forest (and its AUC) differs between runs.
rf = RandomForestClassifier(
    max_depth=20,
    max_features='log2',
    n_estimators=300,
    oob_score=True,
    ccp_alpha=0.0004,
    random_state=42,
)
rf.fit(X_train, y_train)

In [None]:
# Kept in its own cell, separate from the other model fits.
# probability=True makes SVC fit an additional cross-validated calibration
# step whose data shuffling is random; random_state fixes that shuffling so
# predict_proba (and the ROC curve built from it) is reproducible.
svm = SVC(
    C=1.5,
    class_weight='balanced',
    gamma='scale',
    kernel='rbf',
    probability=True,
    random_state=42,
)
svm.fit(X_train, y_train)

## Predict Class Probabilities and Estimate TPR, FPR and AUC

In [None]:
def _roc_summary(model, X, y_true):
    """Score ``X`` with a fitted classifier and summarise its ROC curve.

    Parameters
    ----------
    model : fitted estimator exposing ``predict_proba``.
    X : feature matrix to score.
    y_true : true binary labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(prob, fpr, tpr, auc)`` where ``prob`` is the column at index 1 of
        ``predict_proba`` (used as the score for ``roc_curve``), ``fpr`` and
        ``tpr`` are the ROC curve coordinates, and ``auc`` is the value
        returned by ``roc_auc_score`` for the same scores.
    """
    prob = model.predict_proba(X)[:, 1]
    # The thresholds returned by roc_curve are not needed for plotting.
    fpr, tpr, _thresholds = roc_curve(y_true, prob)
    return prob, fpr, tpr, roc_auc_score(y_true, prob)


# Predict class probabilities on the held-out test set and compute the
# FPR, TPR and AUC for each fitted model.
lr_prob, lr_fpr, lr_tpr, lr_auc = _roc_summary(lr, X_test, y_test)
knn_prob, knn_fpr, knn_tpr, knn_auc = _roc_summary(knn, X_test, y_test)
nb_prob, nb_fpr, nb_tpr, nb_auc = _roc_summary(nb, X_test, y_test)
svm_prob, svm_fpr, svm_tpr, svm_auc = _roc_summary(svm, X_test, y_test)
dt_prob, dt_fpr, dt_tpr, dt_auc = _roc_summary(dt, X_test, y_test)
rf_prob, rf_fpr, rf_tpr, rf_auc = _roc_summary(rf, X_test, y_test)

## Plot ROC graph and corresponding AUC

In [None]:
# Collect each model's ROC data once, in the order the curves are drawn
# (the order also decides which model wins an exact AUC tie below).
roc_results = {
    'Logistic Regression': (lr_fpr, lr_tpr, lr_auc),
    'KNN': (knn_fpr, knn_tpr, knn_auc),
    'Naive Bayes': (nb_fpr, nb_tpr, nb_auc),
    'SVM': (svm_fpr, svm_tpr, svm_auc),
    'Decision Tree': (dt_fpr, dt_tpr, dt_auc),
    'Random Forest': (rf_fpr, rf_tpr, rf_auc),
}

# One labelled curve per model, with the AUC shown in the legend entry.
for model_name, (fpr, tpr, auc_value) in roc_results.items():
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_value:.2f})')

# Title and axis labels.
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')

# The legend is built before the highlight overlay is drawn; the overlay
# carries no label, so it adds no legend entry.
plt.legend()

# Overlay a dashed green line on the first model whose AUC equals the maximum.
best_auc = max(auc_value for _fpr, _tpr, auc_value in roc_results.values())
for fpr, tpr, auc_value in roc_results.values():
    if auc_value == best_auc:
        plt.plot(fpr, tpr, linewidth=2, linestyle='--', color='green')
        break

# Render the figure.
plt.show()


## Inference 
> ### Support Vector Machine and Random Forest perform equally well (AUC of 0.85 on the test set, rounded to two decimals) and better than the rest of the models. Both are good at discriminating between the positive and negative classes.