In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [41]:
with open('X_train_sm.pickle', mode='rb') as file:
    X_train = pickle.load(file)

with open('y_train_sm.pickle', mode='rb') as file:
    y_train = pickle.load(file)

with open('X_test.pickle', mode = 'rb') as file:
    X_test = pickle.load(file)

with open('y_test.pickle', mode = 'rb') as file:
    y_test = pickle.load(file)

In [42]:
from sklearn.tree import DecisionTreeClassifier

In [43]:
dt_clf = DecisionTreeClassifier(random_state=7, max_depth = 4)

In [44]:
dt_clf.fit(X_train, y_train)

In [45]:
print(f'Train accuracy : {round(dt_clf.score(X_train, y_train)*100, 2)}')
print(f'Train accuracy : {round(dt_clf.score(X_test, y_test)*100, 2)}')

Train accuracy : 81.02
Train accuracy : 73.13


# RANDOM FORSEST:-

In [46]:
from sklearn.ensemble import RandomForestClassifier

In [47]:
rf_clf = RandomForestClassifier(random_state=7, max_depth=4, n_estimators=100)
rf_clf.fit(X_train, y_train)
print(f"Train Accuracy: {round(rf_clf.score(X_train, y_train)*100,2)}")
print(f"Test Accuracy: {round(rf_clf.score(X_test, y_test)*100,2)}")

Train Accuracy: 88.21
Test Accuracy: 82.99


### WITH CROSS VALIDATION:-

In [48]:
rf_clf2 = RandomForestClassifier(random_state = 7, max_depth=4, n_estimators=100)

In [49]:
from sklearn.model_selection import KFold, cross_validate

In [50]:
kfold = KFold(n_splits=10)
cv_acc_results = cross_validate(rf_clf2, X_train, y_train, cv = kfold, scoring='accuracy', return_train_score = True)
cv_acc_results

{'fit_time': array([0.19560075, 0.19152761, 0.19943094, 0.19750881, 0.20146322,
        0.20146108, 0.19747186, 0.20049953, 0.18945599, 0.19647288]),
 'score_time': array([0.00598407, 0.00598216, 0.00798035, 0.0069809 , 0.00698137,
        0.00598359, 0.00698161, 0.00598049, 0.0069809 , 0.00698113]),
 'test_score': array([0.78282828, 0.76262626, 0.74242424, 0.81313131, 0.82828283,
        0.73737374, 0.89847716, 0.85786802, 0.85786802, 0.92893401]),
 'train_score': array([0.87064117, 0.87851519, 0.87964004, 0.87739033, 0.88188976,
        0.87007874, 0.88026981, 0.88251827, 0.87858347, 0.88139404])}

In [51]:
print('K-fold Accuracy Mean :-')
print(f'Train : {cv_acc_results['train_score'].mean()*100}')
print(f'Validation : {cv_acc_results['test_score'].mean()*100}')

K-fold Accuracy Mean :-
Train : 87.80920829247103
Validation : 82.09813874788495


## GRID SEARCH:-

In [52]:
from sklearn.model_selection import GridSearchCV

In [53]:
params = {
          'n_estimators' : [100,200,300,400],
          'max_depth' : [3,5,10],
          'criterion' : ['gini', 'entropy'],
          'bootstrap' : [True, False],
          'max_features' : [8,9,10] 
}

In [54]:
grid = GridSearchCV(estimator = RandomForestClassifier(),
                   param_grid = params,
                   cv = 3,
                   n_jobs = -1,
                   verbose = 10
                   )

In [None]:
grid.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


In [None]:
print('Best Params : ', grid.best_params_)
print('Best Score : ', grid.best_score_)

In [None]:
clf2 = RandomForestClassifier(random_state = 7, bootstrap = False, criterion = 'gini', max_depth = 10, max_features = 10, n_estimators = 400)

kfold = KFold(n_splits = 10)
cv_acc_results = cross_validate(clf2, X_train, y_train, cv = kfold, scoring = 'accuracy', return_train_score= True)
cv_acc_results

In [None]:
print('K-Fold Accuracy Mean')
print(f'Train : {cv_acc_results['train_score'].mean()*100}')
print(f'Validation : {cv_acc_results['test_score'].mean()*100}')

In [None]:
clf2.fit(X_train,y_train)

In [None]:
print(f'Train Accuracy : {round(clf2.score(X_train, y_train)*100,2)}')
print(f'Test Accuracy : {round(clf2.score(X_test, y_test)*100,2)}')

## RANDOMIZED SEARCH CV:-

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from random import randint

In [None]:
params = {'ccp_alpha': uniform(loc=0, scale=0.4),
         'n_estimators': [100,150,200,250,300,350,400,450,500,550],
         'bootstrap': [True, False],
         'max_depth': [4,5,6,7,8,9,10]
         }

In [None]:
random = RandomizedSearchCV(estimator = RandomForestClassifier(random_state=7),
                            param_distributions = params,
                            scoring = 'accuracy',
                            cv = 3,
                            n_iter=15,
                            n_jobs=-1,
                            verbose=10
                            )

In [None]:
random.fit(X_train, y_train)

In [None]:
print('Best Params', random.best_params_)
print('Best Score', random.best_score_)

In [None]:
clf3 = RandomForestClassifier(random_state = 7,
                              bootstrap = True,
                              ccp_alpha = 0.0006210960530503851,
                              max_depth = 8,
                              n_estimators = 350
)

kfold = KFold(n_splits=10)
cv_acc_results = cross_validate(clf3, X_train, y_train, cv = kfold, scoring = 'accuracy', return_train_score = True)
cv_acc_results

In [None]:
print("K-Fold Accuracy Mean:-")
print(f'Train : {cv_acc_results['train_score'].mean()*100}')
print(f'Validation : {cv_acc_results['test_score'].mean()*100}')

In [None]:
clf3.fit(X_train, y_train)

In [None]:
print(f"Train Accuracy: {round(clf3.score(X_train, y_train)*100,2)}")
print(f"Test Accuracy: {round(clf3.score(X_test, y_test)*100,2)}")

#### Feature Importance:-

In [None]:
importances = clf3.feature_importances_

# Sort feature importances in descending order
indices = np.argsort(importances)[::-1]

# Rearrange feature names so they match the sorted feature importances
names = [X_train.columns[i] for i in indices]

In [None]:
plt.figure(figsize=(15, 7))
plt.bar(range(X_train.shape[1]), importances[indices])
plt.title("Feature Importance")
plt.xticks(range(X_train.shape[1]), names, rotation=90)
plt.show()