In [44]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, LabelEncoder



train_features = pd.read_csv("train.csv")
train_labels = pd.read_csv("train_labels.csv")
val_features = pd.read_csv("validation.csv")
val_labels = pd.read_csv("validation_labels.csv")
test_features = pd.read_csv('test.csv')
test_labels = pd.read_csv('test_labels.csv')


le = LabelEncoder()

# Encode categorical columns
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_cols = train_features.select_dtypes(include=['object']).columns
train_features[categorical_cols] = encoder.fit_transform(train_features[categorical_cols])
val_features[categorical_cols] = encoder.transform(val_features[categorical_cols])
test_features[categorical_cols] = encoder.transform(test_features[categorical_cols])

# Encode labels
le = LabelEncoder()
train_labels = le.fit_transform(train_labels.squeeze())
val_labels = le.transform(val_labels.squeeze())
test_labels = le.transform(test_labels.squeeze())


In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, classification_report, confusion_matrix
import numpy as np

# Define StratifiedKFold for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Define hyperparameters to try (including Gini and Entropy for criterion)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define a custom scorer using accuracy
scorer = make_scorer(accuracy_score)

# Define the GridSearchCV for Decision Tree
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=cv, scoring=scorer, n_jobs=-1)

# Train the model using GridSearchCV
grid_search.fit(train_features, train_labels)

# Display best model and scores
best_model = grid_search.best_estimator_
print("Best model:", grid_search.best_params_)
print("Best training score:", grid_search.best_score_)

# Predictions on the training and validation sets
train_preds = best_model.predict(train_features)
val_preds = best_model.predict(val_features)

# Display classification reports
print("Training classification report:")
print(classification_report(train_labels, train_preds))

print("Validation classification report:")
print(classification_report(val_labels, val_preds))

# Confusion matrices
print("Training confusion matrix:")
print(confusion_matrix(train_labels, train_preds))

print("Validation confusion matrix:")
print(confusion_matrix(val_labels, val_preds))


Best model: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Best training score: 0.7776876267748478
Training classification report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      3622
           1       0.78      0.69      0.73      1308

    accuracy                           0.87      4930
   macro avg       0.83      0.81      0.82      4930
weighted avg       0.86      0.87      0.86      4930

Validation classification report:
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       776
           1       0.49      0.47      0.48       280

    accuracy                           0.73      1056
   macro avg       0.65      0.64      0.65      1056
weighted avg       0.72      0.73      0.73      1056

Training confusion matrix:
[[3363  259]
 [ 405  903]]
Validation confusion matrix:
[[637 139]
 [149 131]]


In [46]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define a Pipeline for SVM with standardization
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardization
    ('svc', SVC(class_weight='balanced'))
])

# Define hyperparameters for the SVM
param_grid_svm = {
    'svc__kernel': ['linear', 'rbf', 'poly'],
    'svc__C': [0.1, 1, 10],
    'svc__gamma': ['scale', 'auto', 0.1],
    'svc__degree': [3, 4, 5]
}

# Define GridSearchCV for SVM
grid_search_svm = GridSearchCV(pipeline, param_grid_svm, cv=cv, scoring=scorer, n_jobs=-1)

# Train the SVM model using GridSearchCV
grid_search_svm.fit(train_features, train_labels)

# Display best model and scores for SVM
best_model_svm = grid_search_svm.best_estimator_
print("Best SVM model:", grid_search_svm.best_params_)
print("Best SVM training score:", grid_search_svm.best_score_)

# Predictions on the training and validation sets for SVM
train_preds_svm = best_model_svm.predict(train_features)
val_preds_svm = best_model_svm.predict(val_features)

# Display classification reports for SVM
print("SVM Training classification report:")
print(classification_report(train_labels, train_preds_svm))

print("SVM Validation classification report:")
print(classification_report(val_labels, val_preds_svm))

# Confusion matrices for SVM
print("SVM Training confusion matrix:")
print(confusion_matrix(train_labels, train_preds_svm))

print("SVM Validation confusion matrix:")
print(confusion_matrix(val_labels, val_preds_svm))


Best SVM model: {'svc__C': 0.1, 'svc__degree': 3, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
Best SVM training score: 0.7683569979716024
SVM Training classification report:
              precision    recall  f1-score   support

           0       0.90      0.80      0.85      3622
           1       0.58      0.76      0.66      1308

    accuracy                           0.79      4930
   macro avg       0.74      0.78      0.75      4930
weighted avg       0.82      0.79      0.80      4930

SVM Validation classification report:
              precision    recall  f1-score   support

           0       0.87      0.81      0.84       776
           1       0.56      0.68      0.61       280

    accuracy                           0.77      1056
   macro avg       0.72      0.74      0.73      1056
weighted avg       0.79      0.77      0.78      1056

SVM Training confusion matrix:
[[2904  718]
 [ 311  997]]
SVM Validation confusion matrix:
[[627 149]
 [ 91 189]]


In [47]:
import pickle

# Save the best model (Decision Tree or SVM)
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)


In [48]:
# Load the model from the pickle file
with open('best_model.pkl', 'rb') as model_file:
    final_model = pickle.load(model_file)

# Make predictions on the test data
test_preds = final_model.predict(test_features)

# Display classification report and confusion matrix for the final model
print("Final Model Test classification report:")
print(classification_report(test_labels, test_preds))

print("Final Model Test confusion matrix:")
print(confusion_matrix(test_labels, test_preds))






Final Model Test classification report:
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       776
           1       0.53      0.44      0.48       281

    accuracy                           0.75      1057
   macro avg       0.67      0.65      0.66      1057
weighted avg       0.74      0.75      0.74      1057

Final Model Test confusion matrix:
[[666 110]
 [156 125]]
