# Overfitting

### Imports and Dataset

In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
df = pd.read_pickle('../training_dataset_task3/task_3_training_e8da4715deef7d56_f8b7378_pandas.pkl').reset_index()

# only low and mid level features in X
X = df.loc[:, 'essentia_dissonance_mean':'mirtoolbox_roughness_pct_90']
y = df['quadrant']

### Cross Validation - hold out on segment 26, used for final testing

In [4]:
from sklearn.model_selection import cross_val_score

# add segment ID to feature space
X['segment_id'] = df['segment_id']

# pick segment 26 for final evaluation
final_test_X = X[X['segment_id']==26]
final_test_y = y[X['segment_id']==26]

# drop segment ID column
final_test_X = final_test_X.drop(['segment_id'], axis=1)



# drop segment 26 from dataset to use for final evaluation, y first
y = y.drop(X[X['segment_id']==26].index, axis=0)
X = X.drop(X[X['segment_id']==26].index, axis=0)

# drop segment ID column
X = X.drop(['segment_id'], axis=1)



# preprocess dataset
X_std = StandardScaler().fit_transform(X)
X = pd.DataFrame(X_std, columns=X.columns)

### Train Test Set Split

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [30]:
def performance(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    # training set
    predictiontrain = best_random.predict(X_train)
    accuracytrain = accuracy_score(y_train, predictiontrain)
    
    crossvaltrain = cross_val_score(best_random, X_train, y_train)
    print(f'Training Set:\nAccuracy: {accuracytrain}\nCrossVal: {crossvaltrain.mean()}')
    
    # test set
    predictiontest = best_random.predict(X_test)
    accuracytest = accuracy_score(y_test, predictiontest)
    
    crossvaltest = cross_val_score(best_random, X_test, y_test)
    print(f'\nTest Set:\nAccuracy: {accuracytest}\nCrossVal: {crossvaltest.mean()}')

In [31]:
# for reference, this is how the best random model performs
best_random = RandomForestClassifier(bootstrap=False, max_depth=90, min_samples_leaf=2, min_samples_split=5, n_estimators=200)
best_random.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, max_depth=90, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=200)

In [32]:
performance(best_random)

Training Set:
Accuracy: 1.0
CrossVal: 0.5513109650639917

Test Set:
Accuracy: 0.5213793103448275
CrossVal: 0.5186206896551725


As expected, The accuracy for the strict train and test split calculated on the training data is 1. But for Crossval it is still around 55 percent. This is coherent with the evaluation on the test data.

## Overfitting

### high number of n_estimators

In [33]:
of_estimators = RandomForestRegressor(n_estimators=1500)
of_estimators.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1500)

In [34]:
performance(of_estimators)

Training Set:
Accuracy: 1.0
CrossVal: 0.5479188285483685

Test Set:
Accuracy: 0.5213793103448275
CrossVal: 0.5062068965517241


### high number of max_depth

In [35]:
of_depth = RandomForestRegressor(max_depth=1000)
of_depth.fit(X_train, y_train)

RandomForestRegressor(max_depth=1000)

In [36]:
performance(of_depth)

Training Set:
Accuracy: 1.0
CrossVal: 0.5492724547446096

Test Set:
Accuracy: 0.5213793103448275
CrossVal: 0.5144827586206897
