In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn.metrics as metrics
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
import joblib

In [2]:
data = pd.read_csv("Final_dataset.csv")
X = data.drop(columns=['target']).values
y = data['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [3]:
param_grid = {
    'n_estimators': [50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'random_state': [42]
}

random_forest_model = RandomForestClassifier(random_state=42)

grid_rf = GridSearchCV(random_forest_model, param_grid, cv=stratified_kfold, scoring='f1_macro', verbose=1)
grid_rf.fit(X_train, y_train)

print('Best Parameters:', grid_rf.best_params_)

y_pred = grid_rf.best_estimator_.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
 
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')

model_filename = "Random_Forst_model.joblib"
joblib.dump(grid_rf.best_estimator_, model_filename)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 42}
Accuracy: 0.94
Confusion Matrix:
[[4980  267]
 [ 295 4458]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      5247
           1       0.94      0.94      0.94      4753

    accuracy                           0.94     10000
   macro avg       0.94      0.94      0.94     10000
weighted avg       0.94      0.94      0.94     10000



['Random_Forst_model.joblib']

In [4]:
param_grid = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'fit_intercept': [True, False],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [50, 100, 200],
}

logreg = LogisticRegression()

grid_search = GridSearchCV(logreg, param_grid, cv=stratified_kfold, scoring='f1_macro', verbose=1, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)
print("Best Cross-Validated Accuracy: {:.2f}%".format(grid_search.best_score_ * 100))

y_pred = grid_search.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')
model_filename = "LogisticRegression_model.joblib"
joblib.dump(grid_search, model_filename)

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


1620 fits failed out of a total of 3600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\VASU\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\VASU\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\VASU\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(s

Best Parameters:  {'C': 0.001, 'fit_intercept': True, 'max_iter': 50, 'penalty': 'l1', 'solver': 'saga'}
Best Cross-Validated Accuracy: 89.04%
Accuracy: 0.89
Confusion Matrix:
[[4743  504]
 [ 570 4183]]
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.90      0.90      5247
           1       0.89      0.88      0.89      4753

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



['LogisticRegression_model.joblib']

In [5]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'random_state': [42], 
}

dt_model = tree.DecisionTreeClassifier()

grid_dt = GridSearchCV(dt_model, param_grid, cv=stratified_kfold, scoring='f1_macro', verbose=1)

grid_dt.fit(X_train, y_train)

print('Best Parameters:', grid_dt.best_params_)

y_pred = grid_dt.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Classification Report:\n{classification_rep}')
model_filename = "Decision_tree_model.joblib"
joblib.dump(grid_dt, model_filename)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


540 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\VASU\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\VASU\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "c:\Users\VASU\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\VASU\AppData\Local\Programs\Python\Python311\Lib\

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'random_state': 42, 'splitter': 'best'}
Accuracy: 0.95
Confusion Matrix:
[[5001  246]
 [ 292 4461]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      5247
           1       0.95      0.94      0.94      4753

    accuracy                           0.95     10000
   macro avg       0.95      0.95      0.95     10000
weighted avg       0.95      0.95      0.95     10000



['Decision_tree_model.joblib']

In [6]:
from sklearn.ensemble import AdaBoostClassifier

param_grid = {
    'estimator': [tree.DecisionTreeClassifier(max_depth=1), tree.DecisionTreeClassifier(max_depth=2)],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.1, 1],
    'algorithm': ['SAMME', 'SAMME.R'],
    'random_state': [42], 
}

adaboost = AdaBoostClassifier()

grid_search = GridSearchCV(estimator=adaboost, param_grid=param_grid, cv=stratified_kfold, scoring='f1_macro')

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)

best_adaboost = grid_search.best_estimator_
y_pred = best_adaboost.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
model_filename = "Adaboost_model.joblib"
joblib.dump(best_adaboost, model_filename)

Best Parameters: {'algorithm': 'SAMME.R', 'estimator': DecisionTreeClassifier(max_depth=2), 'learning_rate': 1, 'n_estimators': 200, 'random_state': 42}


['Adaboost_model.joblib']