In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, RocCurveDisplay
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.model_selection import train_test_split
from DataPreparation import DataPreparation

In [None]:
# Output directory for saved figures and input directory for the CSV datasets,
# both relative to the notebook's working directory.
pics = './images'
data_path = './data'

# The plotting cells write PNGs into `pics`; saving fails if the directory is
# missing (e.g. on a fresh checkout), so create it up front.
os.makedirs(pics, exist_ok=True)

In [None]:
# Read the balanced dataset from the data directory and preview its first rows.
balanced_csv = f'{data_path}/balanced_dataframe.csv'
df = pd.read_csv(balanced_csv, index_col=None)
df.head()

In [None]:
# Separate the target column ('label') from the remaining feature columns.
X, y = df.drop(columns=['label']), df['label']

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# Class counts in the held-out partition.
y_test.value_counts()

In [None]:
# Run the project-local cleaning step on the four splits and rebind the names
# to whatever it returns.
# NOTE(review): DataPreparation is defined outside this notebook; presumably
# clean_dataset() returns the splits in the same order they were passed in - confirm.
data_preparation = DataPreparation(X_train, X_test, y_train, y_test)
cleaned_splits = data_preparation.clean_dataset()
X_train, X_test, y_train, y_test = cleaned_splits

### Decision Tree

In [None]:
# Sweep max_depth from 1 to 20 for a single decision tree and record train/test
# accuracy at each depth (results before feature selection).
values = list(range(1, 21))
dtc_train_scores_b = []
dtc_test_scores_b = []
for i in values:
    # Fixed seed: the tree breaks ties between equally good splits at random,
    # so without it the scores can change from run to run.
    dtc = DecisionTreeClassifier(max_depth=i, random_state=42)
    dtc.fit(X_train, y_train)

    train_yhat = dtc.predict(X_train)
    train_acc = accuracy_score(y_train, train_yhat)
    dtc_train_scores_b.append(train_acc)

    test_yhat = dtc.predict(X_test)
    test_acc = accuracy_score(y_test, test_yhat)
    dtc_test_scores_b.append(test_acc)

    # ROC AUC ranks samples by a continuous score; feeding it hard 0/1
    # predictions collapses the curve to a single operating point.
    # Column 1 of predict_proba is the probability of the greater class label.
    # NOTE(review): assumes 'label' is binary - confirm.
    test_scores = dtc.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, test_scores)

    print('>%d, train: %.3f, test: %.3f auc: %.3f' % (i, train_acc, test_acc, test_auc))

#### Plot: Decision Tree accuracy vs. max depth

In [None]:
# Train vs. test accuracy of the decision tree per max_depth (before feature selection).
# A fresh figure is created instead of reusing figure number 1: if the figure
# is not closed between cells (script export, interactive backends), reusing
# it would stack curves from other plots and toggle the grid off again.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(values, dtc_train_scores_b, '-o', label='Train')
ax.plot(values, dtc_test_scores_b, '-o', label='Test')
ax.legend()
ax.set_xticks(values)  # one tick per swept depth
ax.grid(True)
ax.set_title('DecisionTreeClassifier')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Max depth')
fig.savefig(f'{pics}/dtree_balanced.png')

### Random Forest

In [None]:
# Sweep max_depth from 1 to 20 for a random forest and record train/test
# accuracy at each depth (results before feature selection).
values = list(range(1, 21))
rf_train_scores_b = []
rf_test_scores_b = []
for i in values:
    # Forests are stochastic (bootstrap samples, random feature subsets); pin
    # the seed so the curve is reproducible and comparable across runs.
    rf = RandomForestClassifier(max_depth=i, random_state=42)
    rf.fit(X_train, y_train)

    train_yhat = rf.predict(X_train)
    train_acc = accuracy_score(y_train, train_yhat)
    rf_train_scores_b.append(train_acc)

    test_yhat = rf.predict(X_test)
    test_acc = accuracy_score(y_test, test_yhat)
    rf_test_scores_b.append(test_acc)

    print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))

In [None]:
# Train vs. test accuracy of the random forest per max_depth (before feature selection).
# A fresh figure is used rather than figure number 1, so curves from other
# plots cannot leak into this one when figures are not closed automatically.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(values, rf_train_scores_b, '-o', label='Train')
ax.plot(values, rf_test_scores_b, '-o', label='Test')
ax.legend()
ax.set_xticks(values)  # one tick per swept depth
ax.grid(True)
ax.set_title('RandomForestClassifier')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Max depth')
fig.savefig(f'{pics}/rf_balanced.png')

In [None]:
# TODO: plot the ROC curve of the last fitted forest (needs a matplotlib Axes `ax` created first): rfc_disp = RocCurveDisplay.from_estimator(rf, X_test, y_test, ax=ax, alpha=0.8)

## After feature selection N = 20

In [None]:
# Replace `df` with the feature-selected dataset and display it.
feat_sel_csv = f'{data_path}/feat_sel_data.csv'
df = pd.read_csv(feat_sel_csv)
df

In [None]:
# Separate the target column ('label') from the selected feature columns.
X, y = df.drop(columns=['label']), df['label']

In [None]:
# Same 70/30 stratified split and seed as used for the full feature set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Run the project-local cleaning step on the feature-selected splits and
# rebind the names to whatever it returns.
# NOTE(review): DataPreparation is defined outside this notebook; presumably
# clean_dataset() returns the splits in the same order they were passed in - confirm.
data_preparation = DataPreparation(X_train, X_test, y_train, y_test)
cleaned_splits = data_preparation.clean_dataset()
X_train, X_test, y_train, y_test = cleaned_splits

### Decision Tree

In [None]:
# Sweep max_depth from 1 to 20 for a single decision tree on the
# feature-selected data and record train/test accuracy at each depth.
values = list(range(1, 21))
dtc_train_scores_a = []
dtc_test_scores_a = []
for i in values:
    # Fixed seed: the tree breaks ties between equally good splits at random;
    # pinning it keeps the before/after feature-selection comparison free of
    # run-to-run noise.
    dtc = DecisionTreeClassifier(max_depth=i, random_state=42)
    dtc.fit(X_train, y_train)

    train_yhat = dtc.predict(X_train)
    train_acc = accuracy_score(y_train, train_yhat)
    dtc_train_scores_a.append(train_acc)

    test_yhat = dtc.predict(X_test)
    test_acc = accuracy_score(y_test, test_yhat)
    dtc_test_scores_a.append(test_acc)

    print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))

#### Plot: Decision Tree accuracy vs. max depth (after feature selection)

In [None]:
# Train vs. test accuracy of the decision tree per max_depth (after feature selection).
# A fresh figure is used rather than figure number 1, so curves from other
# plots cannot leak into this one when figures are not closed automatically.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(values, dtc_train_scores_a, '-o', label='Train')
ax.plot(values, dtc_test_scores_a, '-o', label='Test')
ax.legend()
ax.set_xticks(values)  # one tick per swept depth
ax.grid(True)
ax.set_title('DecisionTreeClassifier')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Max depth')
fig.savefig(f'{pics}/dtree_balanced_sel.png')

### Random Forest

In [None]:
# Sweep max_depth from 1 to 20 for a random forest on the feature-selected
# data and record train/test accuracy at each depth.
values = list(range(1, 21))
rf_train_scores_a = []
rf_test_scores_a = []
for i in values:
    # Forests are stochastic (bootstrap samples, random feature subsets); pin
    # the seed so differences against the pre-selection run reflect the
    # features, not the random draw.
    rf = RandomForestClassifier(max_depth=i, random_state=42)
    rf.fit(X_train, y_train)

    train_yhat = rf.predict(X_train)
    train_acc = accuracy_score(y_train, train_yhat)
    rf_train_scores_a.append(train_acc)

    test_yhat = rf.predict(X_test)
    test_acc = accuracy_score(y_test, test_yhat)
    rf_test_scores_a.append(test_acc)

    print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))

In [None]:
# Train vs. test accuracy of the random forest per max_depth (after feature selection).
# A fresh figure is used rather than figure number 1, so curves from other
# plots cannot leak into this one when figures are not closed automatically.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(values, rf_train_scores_a, '-o', label='Train')
ax.plot(values, rf_test_scores_a, '-o', label='Test')
ax.legend()
ax.set_xticks(values)  # one tick per swept depth
ax.grid(True)
ax.set_title('RandomForestClassifier')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Max depth')
fig.savefig(f'{pics}/rf_balanced_sel.png')

### Comparison of accuracies for Decision Tree before and after Feature Selection

In [None]:
plt.figure(1, figsize=(8,8))
plt.plot(values, dtc_test_scores_b, '-o', label='Before FS')
plt.plot(values, dtc_test_scores_a, '-o', label='After FS')
plt.legend()
plt.xticks(range(1, 21))
plt.grid()
plt.title('DecisionTree')
plt.ylabel('Accuracy')
plt.xlabel('Max depth')
# plt.show()
plt.savefig(f'{pics}/rf_balanced_cmp.png')

### Comparison of accuracies for RF before and after Feature Selection

In [None]:
plt.figure(1, figsize=(8,8))
plt.plot(values, rf_test_scores_b, '-o', label='Before FS')
plt.plot(values, rf_test_scores_a, '-o', label='After FS')
plt.legend()
plt.xticks(range(1, 21))
plt.grid()
plt.title('RandomForestClassifier')
plt.ylabel('Accuracy')
plt.xlabel('Max depth')
# plt.show()
plt.savefig(f'{pics}/dtree_balanced_cmp.png')