In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.model_selection import train_test_split
from DataPreparation import DataPreparation

In [None]:
# Directory that exported figures are written to (used by the savefig calls).
pics = "./images"

In [None]:
# Read the balanced dataset; index_col=None keeps every CSV column as data
# and gives the frame a default integer index.
df = pd.read_csv(
    './data/balanced_dataframe.csv',
    index_col=None,
)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Target vector: the 'label' column.
y = df['label']
# Feature matrix: every remaining column.
X = df.drop(columns=['label'])

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the label
# and seeded so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Show the class counts of the held-out set as the cell output.
y_test.value_counts()

In [None]:
# Hand the four split objects to the project's DataPreparation helper.
data_preparation = DataPreparation(X_train, X_test, y_train, y_test)

# Replace the raw splits with the cleaned versions returned by clean_dataset().
# NOTE(review): the original note says the data comes back scaled with
# StandardScaler; that happens inside DataPreparation and is not visible
# here - confirm.
# NOTE: this rebinds X_train/X_test/y_train/y_test, so re-running this cell on
# its own feeds the already-cleaned splits back into DataPreparation; re-run
# from the train/test split cell instead.
X_train, X_test, y_train, y_test = data_preparation.clean_dataset()

### Decision Tree

In [None]:
# Sweep max_depth from 1 to 20 and record train/test accuracy at each depth,
# so over-fitting shows up as a widening gap between the two score lists.
values = list(range(1, 21))
dtc_train_scores = []
dtc_test_scores = []
for i in values:
    # Seed the tree: scikit-learn permutes the candidate features at every
    # split, so without random_state the scores can change between runs.
    # 42 matches the seed used for the train/test split.
    dtc = DecisionTreeClassifier(max_depth=i, random_state=42)
    dtc.fit(X_train, y_train)

    # Accuracy on the data the tree was fitted on.
    train_yhat = dtc.predict(X_train)
    train_acc = accuracy_score(y_train, train_yhat)
    dtc_train_scores.append(train_acc)

    # Accuracy on the held-out data.
    test_yhat = dtc.predict(X_test)
    test_acc = accuracy_score(y_test, test_yhat)
    dtc_test_scores.append(test_acc)

    print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))

#### Plot: Decision Tree accuracy vs. max depth

In [None]:
# Train vs. test accuracy of the decision tree as a function of max depth.
# An explicit figure/axes pair is used so the plot does not depend on
# pyplot's "current figure" state.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(values, dtc_train_scores, '-o', label='Train')
ax.plot(values, dtc_test_scores, '-o', label='Test')

# Ticks come from the swept depths, so they stay in sync if `values` changes.
ax.set_xticks(values)
ax.set_title('DecisionTreeClassifier')
ax.set_xlabel('Max depth')
ax.set_ylabel('Accuracy')
ax.grid()
ax.legend()

# To export the figure, save BEFORE plt.show(): once the figure has been
# shown, a pyplot-level savefig may write an empty image.
# fig.savefig(f'{pics}/dtree_balanced.png')
plt.show()

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Sweep max_depth from 1 to 20 and record train/test accuracy of a random
# forest at each depth.
values = list(range(1, 21))
rf_train_scores = []
rf_test_scores = []
for i in values:
    # Seed the forest: bootstrap sampling and per-split feature sampling are
    # random, so without random_state the scores differ on every run.
    # 42 matches the seed used for the train/test split.
    rf = RandomForestClassifier(max_depth=i, random_state=42)
    rf.fit(X_train, y_train)

    # Accuracy on the data the forest was fitted on.
    train_yhat = rf.predict(X_train)
    train_acc = accuracy_score(y_train, train_yhat)
    rf_train_scores.append(train_acc)

    # Accuracy on the held-out data.
    test_yhat = rf.predict(X_test)
    test_acc = accuracy_score(y_test, test_yhat)
    rf_test_scores.append(test_acc)

    print('>%d, train: %.3f, test: %.3f' % (i, train_acc, test_acc))

In [None]:
# Train vs. test accuracy of the random forest as a function of max depth.
# An explicit figure/axes pair is used so the plot does not depend on
# pyplot's "current figure" state.
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(values, rf_train_scores, '-o', label='Train')
ax.plot(values, rf_test_scores, '-o', label='Test')

# Ticks come from the swept depths, so they stay in sync if `values` changes.
ax.set_xticks(values)
ax.set_title('RandomForestClassifier')
ax.set_xlabel('Max depth')
ax.set_ylabel('Accuracy')
ax.grid()
ax.legend()

# To export the figure, save BEFORE plt.show(): once the figure has been
# shown, a pyplot-level savefig may write an empty image.
# fig.savefig(f'{pics}/rf_balanced.png')
plt.show()