# NovaGen Research Labs

This project focuses on building and evaluating multiple machine learning classification models to identify the best-performing algorithm for NovaGen Research Labs' predictive tasks.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score , classification_report , recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# Creating Baseline Model

In [2]:
# Load the NovaGen dataset (path is relative to the working directory).
df = pd.read_csv("novagen_dataset.csv")

# "Target" is the label; every other column is used as a feature.
y = df["Target"]
X = df.drop(columns="Target")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [3]:
# Learn the scaling statistics from the training split only, so nothing
# about the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Display the scaled training features as a quick sanity check of the scaler output.
X_train_scaled

array([[ 0.88894392,  3.26817954,  1.78862442, ..., -0.57115259,
         1.72707689, -0.58626915],
       [ 1.21182545,  0.1721801 ,  0.09431124, ..., -0.57115259,
        -0.57901302, -0.58626915],
       [ 1.57506718, -2.40781944,  0.67109871, ..., -0.57115259,
        -0.57901302,  1.7057012 ],
       ...,
       [-0.24114145,  0.68818001,  0.45480341, ..., -0.57115259,
        -0.57901302,  1.7057012 ],
       [-0.9272647 ,  0.1721801 ,  0.67109871, ...,  1.75084561,
        -0.57901302, -0.58626915],
       [-0.03934049, -0.85981972, -0.69877152, ..., -0.57115259,
        -0.57901302,  1.7057012 ]])

In [5]:
# Display the scaled test features (transformed with the scaler fitted on the training split).
X_test_scaled

array([[ 1.77686814,  0.68818001,  1.50023069, ..., -0.57115259,
         1.72707689, -0.58626915],
       [-1.29050643, -0.85981972,  0.74319714, ...,  1.75084561,
        -0.57901302, -0.58626915],
       [ 0.36426143, -0.85981972,  0.02221281, ..., -0.57115259,
        -0.57901302,  1.7057012 ],
       ...,
       [-0.40258221,  0.68818001,  0.20245889, ..., -0.57115259,
        -0.57901302,  1.7057012 ],
       [-1.16942585,  0.1721801 ,  1.46418147, ...,  1.75084561,
        -0.57901302, -0.58626915],
       [-0.88690451, -0.85981972,  0.95949244, ..., -0.57115259,
        -0.57901302, -0.58626915]])

In [6]:
# Logistic Regression (with L2 regularization)
# Trained on the standardized features.
log_reg = LogisticRegression(penalty="l2", solver="liblinear", max_iter=1000).fit(
    X_train_scaled, y_train
)

# Predict labels for the held-out, scaled test features.
y_pred_lr = log_reg.predict(X_test_scaled)

# Model evaluation: recall matters more than accuracy here,
# because missing a high-risk patient is dangerous.
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression Recall:", recall_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.8141361256544503
Logistic Regression Recall: 0.8283132530120482
              precision    recall  f1-score   support

           0       0.81      0.80      0.80       914
           1       0.82      0.83      0.82       996

    accuracy                           0.81      1910
   macro avg       0.81      0.81      0.81      1910
weighted avg       0.81      0.81      0.81      1910



# Model - 2 KNN

In [11]:
# K-nearest neighbours: 5 neighbours, Euclidean distance.
# Distances are computed on the standardized features, so the scaled
# matrices are used for both fitting and prediction.
knn = KNeighborsClassifier(n_neighbors=5, metric="euclidean").fit(
    X_train_scaled, y_train
)

# Predict labels for the held-out, scaled test features.
y_pred_knn = knn.predict(X_test_scaled)

# Accuracy and recall are reported as percentages in this cell.
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn) * 100, "%")
print("KNN Recall:", recall_score(y_test, y_pred_knn) * 100, "%")
print(classification_report(y_test, y_pred_knn))

KNN Accuracy: 88.32460732984293 %
KNN Recall: 88.35341365461848 %
              precision    recall  f1-score   support

           0       0.87      0.88      0.88       914
           1       0.89      0.88      0.89       996

    accuracy                           0.88      1910
   macro avg       0.88      0.88      0.88      1910
weighted avg       0.88      0.88      0.88      1910



# Random Forest (Ensemble Learning)

In [8]:
# Random forest: 200 trees, no depth limit, fixed seed for reproducibility.
# This model is fitted on the unscaled feature frames.
rf = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42).fit(
    X_train, y_train
)

# Predict labels for the held-out (unscaled) test features.
y_pred_rf = rf.predict(X_test)

# Report accuracy, recall, and the per-class breakdown.
print("Random Forest Accuracy: ", accuracy_score(y_test, y_pred_rf))
print("Random Forest Recall:", recall_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy:  0.9382198952879581
Random Forest Recall: 0.9588353413654619
              precision    recall  f1-score   support

           0       0.95      0.92      0.93       914
           1       0.93      0.96      0.94       996

    accuracy                           0.94      1910
   macro avg       0.94      0.94      0.94      1910
weighted avg       0.94      0.94      0.94      1910



# Gradient Boosting (Ensemble Learning)

In [9]:
# Gradient boosting: 150 boosting stages of depth-3 trees, learning rate 0.1,
# fixed seed for reproducibility. Fitted on the unscaled feature frames.
gb = GradientBoostingClassifier(
    n_estimators=150, learning_rate=0.1, max_depth=3, random_state=42
).fit(X_train, y_train)

# Predict labels for the held-out (unscaled) test features.
y_pred_gb = gb.predict(X_test)

# Report accuracy, recall, and the per-class breakdown.
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Gradient Boosting Recall:", recall_score(y_test, y_pred_gb))
print(classification_report(y_test, y_pred_gb))

Gradient Boosting Accuracy: 0.9303664921465968
Gradient Boosting Recall: 0.9497991967871486
              precision    recall  f1-score   support

           0       0.94      0.91      0.93       914
           1       0.92      0.95      0.93       996

    accuracy                           0.93      1910
   macro avg       0.93      0.93      0.93      1910
weighted avg       0.93      0.93      0.93      1910



# Voting Classifier (Ensemble Learning)

In [10]:
# Soft-voting ensemble of logistic regression, KNN, and random forest.
# With voting="soft" the ensemble combines the members' predicted class
# probabilities rather than their hard label votes.
voting_clf = VotingClassifier(
    voting="soft",
    estimators=[
        ("lr", LogisticRegression(max_iter=1000, solver="liblinear")),
        ("knn", KNeighborsClassifier(n_neighbors=5)),
        ("rf", RandomForestClassifier(n_estimators=200, random_state=42)),
    ],
).fit(X_train_scaled, y_train)

# All members share the same input, so the scaled features are used throughout.
y_pred_vote = voting_clf.predict(X_test_scaled)

# Report accuracy, recall, and the per-class breakdown.
print("Voting Classifier Accuracy:", accuracy_score(y_test, y_pred_vote))
print("Voting Classifier Recall:", recall_score(y_test, y_pred_vote))
print(classification_report(y_test, y_pred_vote))

Voting Classifier Accuracy: 0.9157068062827225
Voting Classifier Recall: 0.929718875502008
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       914
           1       0.91      0.93      0.92       996

    accuracy                           0.92      1910
   macro avg       0.92      0.92      0.92      1910
weighted avg       0.92      0.92      0.92      1910



# Results

| Model               | Recall |
|---------------------|:------:|
| Logistic Regression |  82.8% |
| KNN                 |  88.4% |
| Random Forest       |  95.9% |
| Gradient Boosting   |  95.0% |
| Voting Classifier   |  93.0% |

# Best classifier for NovaGen (based on recall): Random Forest, with a recall of 95.9% and an accuracy of 93.8%