### This notebook fits models to the pre-processed data from Notebook01, compares different models to identify the best-performing one, and discusses and analyses the results.

In [14]:
# Silence warnings for the rest of the session.
# NOTE: the first filter targets the base `Warning` category, so it hides
# warnings from every library (not only scikit-learn). ConvergenceWarning is
# already covered by it and is listed separately only for explicitness.
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings(action="ignore", category=Warning)
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

In [1]:
# Importing necessary libraries
import xgboost as xgb
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np

#### Loading the pre-processed data from Notebook01.

In [16]:
# Read the train/test splits from the CSV files in the working directory
# (per the heading above, these were produced by Notebook01).
X_train, y_train = pd.read_csv('X_train.csv'), pd.read_csv('y_train.csv')
X_test, y_test = pd.read_csv('X_test.csv'), pd.read_csv('y_test.csv')

### Fitting a Decision Tree Model

In [17]:
# Baseline Decision Tree with default hyperparameters; the fixed seed makes
# the fitted tree reproducible between runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [18]:
# Predict class labels for the test features with the baseline tree
y_pred_decision = clf.predict(X=X_test)

### Measuring the performance of the Decision Tree model

In [19]:
# Score the baseline tree's test-set predictions three ways:
# overall accuracy, the confusion matrix, and the per-class text report.
accuracy_decision = accuracy_score(y_true=y_test, y_pred=y_pred_decision)
conf_matrix_decision = confusion_matrix(y_true=y_test, y_pred=y_pred_decision)
class_report_decision = classification_report(y_true=y_test, y_pred=y_pred_decision)

# One print call, one metric per line
print(
    f'Decision Tree Accuracy: {accuracy_decision}',
    f'Decision Tree Confusion Matrix: \n{conf_matrix_decision}',
    f'Decision Tree Classification Report: \n{class_report_decision}',
    sep='\n',
)

Decision Tree Accuracy: 0.9023394394173472
Decision Tree Confusion Matrix: 
[[7571  465]
 [ 420  606]]
Decision Tree Classification Report: 
              precision    recall  f1-score   support

          no       0.95      0.94      0.94      8036
         yes       0.57      0.59      0.58      1026

    accuracy                           0.90      9062
   macro avg       0.76      0.77      0.76      9062
weighted avg       0.90      0.90      0.90      9062



#### Applying Random Search for Decision Tree

In [20]:
# Randomized hyperparameter search for the Decision Tree.
# DecisionTreeClassifier, RandomizedSearchCV and accuracy_score are provided
# by the notebook's top imports cell, so they are not re-imported here.

# Base estimator; the fixed seed keeps tree construction reproducible
dt_classifier = DecisionTreeClassifier(random_state=42)

# Candidate values to sample from. Plain Python lists (instead of np.arange)
# keep the sampled values as built-in ints, so the printed best parameters
# read {'max_depth': 7} rather than {'max_depth': np.int64(7)} on NumPy >= 2.
# The value sets are unchanged, and the definition now matches the
# list-based grid used in the Grid Search cell.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': list(range(1, 20)),
    'min_samples_split': list(range(2, 11)),
    'min_samples_leaf': list(range(1, 11)),
}

# Create the Randomized Search CV object
random_search = RandomizedSearchCV(
    dt_classifier,
    param_distributions=param_grid,
    n_iter=100,  # Number of random combinations to try
    scoring='accuracy',  # Candidates are ranked by mean cross-validated accuracy
    cv=5,  # Number of cross-validation folds
    random_state=42,  # Fixes which combinations are sampled
    n_jobs=-1,  # Use all available CPU cores
    verbose=1  # Show progress during the search
)

# Fit the Randomized Search on the training data
random_search.fit(X_train, y_train)

# Get the best hyperparameters and the estimator fitted with them
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Evaluate the best model on the test data
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)

print(f'Best Hyperparameters: {best_params}')
print(f'Best Model Accuracy: {accuracy_best}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'splitter': 'best', 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_depth': 7, 'criterion': 'entropy'}
Best Model Accuracy: 0.9146987419995586


#### Applying Grid Search for Decision Tree

In [21]:
# Exhaustive grid search for the Decision Tree.
# DecisionTreeClassifier, GridSearchCV and accuracy_score are provided by the
# notebook's top imports cell, so they are not re-imported here.

# Base estimator shared by every candidate
dt_classifier = DecisionTreeClassifier(random_state=42)

# Full Cartesian grid: 2 * 2 * 20 * 9 * 10 = 7200 combinations
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, *range(1, 20)],  # None means no depth limit
    'min_samples_split': list(range(2, 11)),
    'min_samples_leaf': list(range(1, 11)),
}

# Every combination is scored by 5-fold cross-validated accuracy
grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,  # run on all available CPU cores
    verbose=1,  # report progress while searching
)
grid_search.fit(X_train, y_train)

# Winning parameter set and the estimator fitted with it
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Accuracy of the selected model on the test data
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)

print(f'Best Hyperparameters: {best_params}')
print(f'Best Model Accuracy: {accuracy_best}')

Fitting 5 folds for each of 7200 candidates, totalling 36000 fits
Best Hyperparameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 10, 'min_samples_split': 2, 'splitter': 'best'}
Best Model Accuracy: 0.9153608474950342
