In [None]:
import numpy as np 
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

## Import Data

In [None]:
# Directory holding the pre-processed Hit Prediction files. Every path below is
# built from this one constant, so pointing the notebook at another location
# (e.g. a local copy) is a single-line change.
DATA_DIR = "/kaggle/input/hit-prediction-processed-data/Hit Prediction"

# Feature tables (CSV). Judging by the file names these are the full set, its
# train/test splits, and a "selected" variant of each -- presumably a
# feature-selected subset; confirm against the preprocessing notebook.
X = pd.read_csv(DATA_DIR + "/X.csv")
X_train = pd.read_csv(DATA_DIR + "/X_train.csv")
X_test = pd.read_csv(DATA_DIR + "/X_test.csv")
X_selected = pd.read_csv(DATA_DIR + "/X_selected.csv")
X_train_selected = pd.read_csv(DATA_DIR + "/X_train_selected.csv")
X_test_selected = pd.read_csv(DATA_DIR + "/X_test_selected.csv")

# Target arrays (NumPy .npy).
y = np.load(DATA_DIR + "/y.npy")
y_train = np.load(DATA_DIR + "/y_train.npy")
y_test = np.load(DATA_DIR + "/y_test.npy")
y_selected = np.load(DATA_DIR + "/y_selected.npy")

# The cells below pair these features and targets; fail early with a clear
# message if the files are out of sync instead of erroring inside scikit-learn.
assert len(X_train_selected) == len(y_train), "X_train_selected / y_train row mismatch"
assert len(X_test_selected) == len(y_test), "X_test_selected / y_test row mismatch"
assert len(X_selected) == len(y_selected), "X_selected / y_selected row mismatch"

## Fitting Basic Model with default Parameters

In [None]:
# Baseline tree with default hyper-parameters; the fixed random_state keeps
# reruns reproducible.
dt = DecisionTreeClassifier(random_state=42)
# Left as the last expression so the notebook still displays the fitted estimator.
dt.fit(X=X_train_selected, y=y_train)


### Test Score of Base Model

In [None]:
# Score of the baseline tree on the test split.
dt.score(X=X_test_selected, y=y_test)

### Train Score of Base Model

In [None]:
# Score of the baseline tree on the data it was fitted on.
dt.score(X=X_train_selected, y=y_train)

> ### High training accuracy combined with low testing accuracy indicates that the model has low bias and high variance, i.e. it is overfitting.

## Cost Complexity Pruning 

### Extract ccp alpha values

In [None]:
# Ask the baseline tree for its cost-complexity pruning path on the training data.
path = dt.cost_complexity_pruning_path(X=X_train_selected, y=y_train)
# Keep every candidate alpha except the last one. (Per the scikit-learn pruning
# example, the final alpha corresponds to the trivial single-node tree.)
ccp_alphas = path.ccp_alphas[:-1]

In [None]:
# Number of candidate alphas, i.e. how many trees the construction loop below will fit.
len(ccp_alphas)

### Construct Decision Trees for every ccp alpha value

In [None]:
# One fitted tree per candidate alpha, in the same order as ccp_alphas.
# fit() returns the estimator itself, so it can be collected directly.
clf_dts = [
    DecisionTreeClassifier(random_state=42, ccp_alpha=alpha).fit(X_train_selected, y_train)
    for alpha in ccp_alphas
]

### Extract Train-Test Scores

In [None]:
# Score every pruned tree on both splits in a single pass; the lists stay
# index-aligned with ccp_alphas / clf_dts.
# NOTE(review): the test scores collected here are later used to pick alpha
# from the plot, so the test split no longer gives an unbiased estimate for
# the chosen model.
train_scores = []
test_scores = []
for pruned_tree in clf_dts:
    train_scores.append(pruned_tree.score(X_train_selected, y_train))
    test_scores.append(pruned_tree.score(X_test_selected, y_test))


### Accuracy versus Alpha Plot for Train and Test Data

In [None]:
fig, ax = plt.subplots()
# Both curves share the same marker and step style; train is drawn first so
# the colour cycle and legend order are unchanged.
for curve_label, curve_scores in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, curve_scores, marker='o', label=curve_label, drawstyle="steps-post")
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for train and test data",
)
ax.legend()
plt.show()

> ### The best fit is observed around an alpha value of `0.001`. We will confirm its reliability with cross-validation.

## Cross Validation

In [None]:
# Cross-validate a tree pruned with the alpha read off the accuracy-vs-alpha plot.
CCP_ALPHA = 0.001  # chosen visually from the plot above
N_FOLDS = 5        # number of cross-validation folds

clf_dt_cv = DecisionTreeClassifier(random_state=42, ccp_alpha=CCP_ALPHA)
# NOTE(review): X_selected / y_selected look like the full dataset, i.e. they
# presumably include the rows used as the test split above -- confirm.
scores = cross_val_score(clf_dt_cv, X_selected, y_selected, cv=N_FOLDS)

# Index the rows by len(scores) rather than a second hard-coded 5, so the
# frame stays valid if N_FOLDS is changed.
scores_df = pd.DataFrame(data={'tree': range(len(scores)), 'accuracy': scores})

ax = scores_df.plot(x='tree', y='accuracy', marker='o', linestyle='--')
ax.set_xlabel("cross-validation fold")
ax.set_ylabel("accuracy")
ax.set_title("Cross-validation accuracy per fold (ccp_alpha=%s)" % CCP_ALPHA)
plt.show()

In [None]:
# Raw per-fold values returned by cross_val_score in the cross-validation cell.
scores

In [None]:
# Summarise the cross-validation run as mean and standard deviation across folds.
mean_accuracy = scores.mean()
accuracy_std = scores.std()
print("%0.2f accuracy with a standard deviation of %0.5f" % (mean_accuracy, accuracy_std))