In [None]:
import ast
import numpy as np 
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeClassifier

## Import Data

In [None]:
# Single source of truth for the dataset location: relocating the data
# (e.g. running outside Kaggle) means editing this one line instead of ten.
DATA_DIR = "/kaggle/input/hit-prediction-processed-data/Hit Prediction"

# Feature tables (CSV): the full set, its train/test parts, and the
# corresponding `*_selected` files.
X = pd.read_csv(f"{DATA_DIR}/X.csv")
X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")
X_selected = pd.read_csv(f"{DATA_DIR}/X_selected.csv")
X_train_selected = pd.read_csv(f"{DATA_DIR}/X_train_selected.csv")
X_test_selected = pd.read_csv(f"{DATA_DIR}/X_test_selected.csv")

# Target arrays (NumPy .npy).
# NOTE(review): there is no y_train_selected / y_test_selected file; the rest of
# the notebook pairs X_train_selected with y_train and X_test_selected with
# y_test, so those are presumably row-aligned -- confirm against the
# preprocessing notebook.
y = np.load(f"{DATA_DIR}/y.npy")
y_train = np.load(f"{DATA_DIR}/y_train.npy")
y_test = np.load(f"{DATA_DIR}/y_test.npy")
y_selected = np.load(f"{DATA_DIR}/y_selected.npy")

## Fitting Basic Model with default Parameters (with OOB Score)

In [None]:
# Baseline forest with default hyperparameters. oob_score=True makes the fit
# also compute the out-of-bag accuracy estimate, exposed as `oob_score_`.
# random_state pins the bootstrap and feature sub-sampling so the scores shown
# in this notebook are reproduced on every Restart & Run All.
rf = RandomForestClassifier(oob_score=True, random_state=42)
rf.fit(X_train_selected, y_train)
rf.oob_score_

In [None]:
# Inspect the hyperparameters the fitted forest bound to `rf` was built with.
rf.get_params()

### Overfit Check

In [None]:
# Mean accuracy on the training data; compared with the test score in the
# next cell to gauge overfitting.
rf.score(X_train_selected,y_train)

In [None]:
# Mean accuracy on the test split.
rf.score(X_test_selected,y_test)

> ### This clearly indicates the overfitting of the model

## Tuning The Random Forest

## Step 1: Tune The Hyper Parameters using OOB Score

### Define Parameter Grid

In [None]:
# Hyperparameter search space; the search loop evaluates every combination
# of the values listed here.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    max_features=['sqrt', 'log2'],
)

# Filled in by the search loop: str(params) -> out-of-bag score.
oob_scores = dict()

### Simulate for every combination of Parameters

In [None]:
# Fit one forest per hyperparameter combination and record its out-of-bag
# accuracy. Only the training data is used here; the test split is not touched.
for params in ParameterGrid(param_grid):
    # A fixed random_state gives every candidate the same seed, so the ranking
    # reflects the hyperparameters rather than run-to-run sampling noise and the
    # winner is the same on every re-run.
    rf = RandomForestClassifier(oob_score=True, random_state=42, **params)
    rf.fit(X_train_selected, y_train)
    # Keys are the string repr of the parameter dict; the selection cell
    # parses the winning key back into a dict with ast.literal_eval.
    oob_scores[str(params)] = rf.oob_score_

### Best Model w.r.t OOB Score

In [None]:
# oob_scores maps str(params) -> OOB score, so `best_params` is the *string*
# form of the winning parameter dict; ast.literal_eval turns it back into a
# dict without evaluating arbitrary code.
best_params = max(oob_scores, key=oob_scores.get)

# Refit the winning configuration. random_state makes this refit (and the
# scores reported for it below) reproducible across re-runs.
best_rf = RandomForestClassifier(
    oob_score=True,
    random_state=42,
    **ast.literal_eval(best_params),
)
best_rf.fit(X_train_selected, y_train)

### Best Model Summary

In [None]:
# Out-of-bag accuracy estimate of the refitted best model.
best_rf.oob_score_

In [None]:
# Mean accuracy of the best model on the training data.
best_rf.score(X_train_selected,y_train)

In [None]:
# Mean accuracy of the best model on the test split.
best_rf.score(X_test_selected,y_test)

> ### The model is still subject to overfitting. Hence, we need to perform cost complexity pruning to prune down the trees

## Step 2: Cost Complexity Pruning 

### Define Custom CCP Simulator

In [None]:
def simulate_ccp(ccp, *, n_estimators=300, max_depth=20, max_features='log2',
                 random_state=42):
    """Fit a cost-complexity-pruned random forest and print its scores.

    Trains a ``RandomForestClassifier`` on the notebook-level
    ``X_train_selected`` / ``y_train`` with ``ccp_alpha=ccp`` and prints the
    out-of-bag score, the training accuracy and the test accuracy (the latter
    on ``X_test_selected`` / ``y_test``).

    Parameters
    ----------
    ccp : float
        Value passed to the forest as ``ccp_alpha``.
    n_estimators, max_depth, max_features
        Forest hyperparameters. The defaults are the values this function
        originally hard-coded, so ``simulate_ccp(alpha)`` builds the same
        configuration as before.
    random_state : int or None
        Seed forwarded to the forest so that runs with different ``ccp`` values
        are compared on the same random draws and are reproducible. Pass
        ``None`` for an unseeded fit.

    Returns
    -------
    None
        Results are only printed; nothing is returned, so a cell that ends
        with a call to this function shows no extra output.
    """
    print("Cost Complexity Pruning with alpha=",ccp)
    rf = RandomForestClassifier(
        max_depth=max_depth,
        max_features=max_features,
        n_estimators=n_estimators,
        oob_score=True,
        ccp_alpha=ccp,
        random_state=random_state,
    )
    rf.fit(X_train_selected,y_train)
    print("OOB Score: ",rf.oob_score_)
    print("Train Score: ",rf.score(X_train_selected,y_train))
    print("Test Score: ",rf.score(X_test_selected,y_test), "\n")

### Simulate for ```alpha = 0.001```

In [None]:
# First probe: a single run at alpha = 0.001.
simulate_ccp(0.001)

### Run for Increasing values of ```alpha``` from 0.001 to 0.03

In [None]:
# Sweep alpha upward from 0.001 to 0.03; each call prints the OOB, train and
# test scores for that alpha.
ccp_list = [0.001,0.005,0.01,0.015,0.02,0.025,0.03]
for ccp in ccp_list:
    simulate_ccp(ccp)

> ### As the ```alpha``` value increases from 0.001, all three scores decrease. Hence, let's try decreasing values

### Run for decreasing values of ```alpha``` from 0.0001 to 0.0000001

In [None]:
# Sweep alpha downward one decade at a time, from 1e-4 to 1e-7; each call
# prints the OOB, train and test scores for that alpha.
ccp_list = [1e-4, 1e-5, 1e-6, 1e-7]
for ccp in ccp_list:
    simulate_ccp(ccp)

> ### As the ```alpha``` value decreases, the OOB score and test score remain stable around 0.78, but the model starts overfitting. Hence, some value around 0.0001 will be the best value

## Best Model after OOB and CCP

In [None]:
# Print the scores for the alpha chosen for the final model (0.0004).
simulate_ccp(0.0004)

In [None]:
# Final model: the forest configuration used throughout the pruning
# experiments, with ccp_alpha=0.0004. It is bound to `rf` so the fitted
# estimator stays available after this cell.
# random_state pins the bootstrap and feature sub-sampling so the reported
# final scores are reproducible across re-runs.
rf = RandomForestClassifier(
    max_depth=20,
    max_features='log2',
    n_estimators=300,
    oob_score=True,
    ccp_alpha=0.0004,
    random_state=42,
)
rf.fit(X_train_selected,y_train)
print("OOB Score",rf.oob_score_)
print("Train Score: ",rf.score(X_train_selected,y_train))
print("Test Score: ",rf.score(X_test_selected,y_test), "\n")