In [8]:
#imports
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


In [2]:
# Load the dataset (point the path at your own CSV if needed) and separate
# the "num" label column from the remaining feature columns.
df = pd.read_csv("selectedfeatures.csv")
y = df["num"]                    # label
X = df.drop(columns=["num"])     # every other column is a feature

In [9]:
# 80/20 split, stratified on the label so both parts keep the class proportions;
# the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Learn the scaling statistics from the training part only, then apply the
# same transformation to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_trainScaled = scaler.transform(X_train)
X_testScaled = scaler.transform(X_test)


In [10]:
# Fit three untuned classifiers and record their weighted test-set scores.
baseline_models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42),
}

baseline_metrics = {}

for name, model in baseline_models.items():
    # Logistic regression and the SVM are trained on standardized features;
    # any other model (here, the random forest) uses the unscaled features.
    if name in ("Logistic Regression", "SVM"):
        fit_features, eval_features = X_trainScaled, X_testScaled
    else:
        fit_features, eval_features = X_train, X_test

    y_pred = model.fit(fit_features, y_train).predict(eval_features)

    # Shared keyword arguments for the per-class-averaged scores.
    averaging = {"average": "weighted", "zero_division": 0}
    baseline_metrics[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, **averaging),
        "Recall": recall_score(y_test, y_pred, **averaging),
        "F1-score": f1_score(y_test, y_pred, **averaging),
    }


In [11]:
# Print one section per baseline model, each score shown to four decimals.
print("=== Baseline Model Metrics ===")
for model_name, scores in baseline_metrics.items():
    report_lines = [f"\n{model_name}:"]
    report_lines.extend(f"{label}: {score:.4f}" for label, score in scores.items())
    print("\n".join(report_lines))

=== Baseline Model Metrics ===

Logistic Regression:
Accuracy: 0.6066
Precision: 0.4949
Recall: 0.6066
F1-score: 0.5370

Random Forest:
Accuracy: 0.4918
Precision: 0.4889
Recall: 0.4918
F1-score: 0.4797

SVM:
Accuracy: 0.5574
Precision: 0.4768
Recall: 0.5574
F1-score: 0.5066


Defining Parameters

In [12]:
# Logistic regression: candidate values for each hyperparameter in the search.
lr_params = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l2'],
    solver=['lbfgs', 'saga'],
    max_iter=[100, 500, 1000],
)

In [13]:
# Random forest: candidate values for each hyperparameter in the search.
rf_params = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [14]:
# SVM: candidate values for each hyperparameter in the search.
svm_params = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'rbf', 'poly'],
    gamma=['scale', 'auto'],
)

Performing search

In [15]:
# Grid search on logistic regression: every combination in `lr_params` is
# scored by accuracy with 5-fold cross-validation on the scaled training data,
# using all available cores.
#
# random_state is pinned because the grid includes the stochastic 'saga'
# solver; without a seed, the chosen parameters and the fitted coefficients
# can differ from one run to the next.
grid_lr = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=lr_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_lr.fit(X_trainScaled, y_train)
# With the default refit=True, best_estimator_ is retrained on the whole
# training set using the winning parameter combination.
best_lr = grid_lr.best_estimator_
print("Best Logistic Regression:", grid_lr.best_params_)

Best Logistic Regression: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}


In [16]:
# Randomized search on the random forest: 20 parameter combinations are drawn
# from `rf_params` and scored by accuracy with 5-fold cross-validation on the
# unscaled training data. Both the forest and the sampler are seeded with 42.
rand_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rf_params,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
rand_rf.fit(X_train, y_train)

best_rf = rand_rf.best_estimator_
print("Best Random Forest:", rand_rf.best_params_)


Best Random Forest: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 5}


In [17]:
# SVM grid search: every combination in `svm_params` is scored by accuracy
# with 5-fold cross-validation on the scaled training data.
#
# random_state=42 mirrors the baseline SVC. With probability=True the
# probability calibration shuffles the data internally, so an unseeded model
# would give slightly different predict_proba outputs on each run.
grid_svm = GridSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_grid=svm_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_svm.fit(X_trainScaled, y_train)
best_svm = grid_svm.best_estimator_
print("Best SVM:", grid_svm.best_params_)


Best SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}


Evaluating optimized models


In [18]:
# Score the tuned estimators returned by the searches on the test split.
optimized_models = {
    "Logistic Regression": best_lr,
    "Random Forest": best_rf,
    "SVM": best_svm,
}
optimized_metrics = {}

for name, model in optimized_models.items():
    # Models that were fitted on standardized features must be evaluated on
    # the standardized test set; the rest use the unscaled test set.
    eval_features = X_testScaled if name in ("Logistic Regression", "SVM") else X_test
    y_pred = model.predict(eval_features)

    scores = {"Accuracy": accuracy_score(y_test, y_pred)}
    for label, scorer in (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    ):
        scores[label] = scorer(y_test, y_pred, average="weighted", zero_division=0)
    optimized_metrics[name] = scores

In [19]:
# Print one section per tuned model, each score shown to four decimals.
print("\n=== Optimized Model Metrics ===")
for model_name, scores in optimized_metrics.items():
    section = [f"\n{model_name}:"]
    section += [f"{label}: {score:.4f}" for label, score in scores.items()]
    print("\n".join(section))


=== Optimized Model Metrics ===

Logistic Regression:
Accuracy: 0.6066
Precision: 0.4949
Recall: 0.6066
F1-score: 0.5370

Random Forest:
Accuracy: 0.5574
Precision: 0.4781
Recall: 0.5574
F1-score: 0.5102

SVM:
Accuracy: 0.5574
Precision: 0.4768
Recall: 0.5574
F1-score: 0.5066
