In [35]:
# Necessary imports
import xgboost as xgb
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from utils.load_dataset import load_dataset
from utils.split_x_y import split_x_y
from utils.printmd import printmd

In [2]:
# Load datasets

# 1% class balance
train_one_random_under = load_dataset(dataset_type='train', balance=1, technique='ru')
train_one_near_miss_under = load_dataset(dataset_type='train', balance=1, technique='nmu')
train_one_random_over = load_dataset(dataset_type='train', balance=1, technique='ro')

# 3% class balance
train_three_random_under = load_dataset(dataset_type='train', balance=3, technique='ru')
train_three_near_miss_under = load_dataset(dataset_type='train', balance=3, technique='nmu')
train_three_random_over = load_dataset(dataset_type='train', balance=3, technique='ro')

# 5% class balance
train_five_random_under = load_dataset(dataset_type='train', balance=5, technique='ru')
train_five_near_miss_under = load_dataset(dataset_type='train', balance=5, technique='nmu')
train_five_random_over = load_dataset(dataset_type='train', balance=5, technique='ro')

# 25% class balance
train_twenty_five_random_under = load_dataset(dataset_type='train', balance=25, technique='ru')
train_twenty_five_near_miss_under = load_dataset(dataset_type='train', balance=25, technique='nmu')
train_twenty_five_random_over = load_dataset(dataset_type='train', balance=25, technique='ro')

# 50% class balance
train_fifty_random_under = load_dataset(dataset_type='train', balance=50, technique='ru')
train_fifty_near_miss_under = load_dataset(dataset_type='train', balance=50, technique='nmu')
train_fifty_random_over = load_dataset(dataset_type='train', balance=50, technique='ro')

# Test
test = load_dataset(dataset_type='test')

In [3]:
# Split datasets into train and validation

# 1% class balance
train_one_ru, validation_one_ru = train_test_split(train_one_random_under, test_size=0.2)
train_one_nmu, validation_one_nmu = train_test_split(train_one_near_miss_under, test_size=0.2)
train_one_ro, validation_one_ro = train_test_split(train_one_random_over, test_size=0.2)

# 3% class balance
train_three_ru, validation_three_ru = train_test_split(train_three_random_under, test_size=0.2)
train_three_nmu, validation_three_nmu = train_test_split(train_three_near_miss_under, test_size=0.2)
train_three_ro, validation_three_ro = train_test_split(train_three_random_over, test_size=0.2)

# 5% class balance
train_five_ru, validation_five_ru = train_test_split(train_five_random_under, test_size=0.2)
train_five_nmu, validation_five_nmu = train_test_split(train_five_near_miss_under, test_size=0.2)
train_five_ro, validation_five_ro = train_test_split(train_five_random_over, test_size=0.2)

# 25% class balance
train_twenty_five_ru, validation_twenty_five_ru = train_test_split(train_twenty_five_random_under, test_size=0.2)
train_twenty_five_nmu, validation_twenty_five_nmu = train_test_split(train_twenty_five_near_miss_under, test_size=0.2)
train_twenty_five_ro, validation_twenty_five_ro = train_test_split(train_twenty_five_random_over, test_size=0.2)

# 50% class balance
train_fifty_ru, validation_fifty_ru = train_test_split(train_fifty_random_under, test_size=0.2)
train_fifty_nmu, validation_fifty_nmu = train_test_split(train_fifty_near_miss_under, test_size=0.2)
train_fifty_ro, validation_fifty_ro = train_test_split(train_fifty_random_over, test_size=0.2)

In [4]:
# Split datasets into features and target

# 1% class balance
train_one_ru_X, train_one_ru_y = split_x_y(df=train_one_ru, target='returnShipment')
validation_one_ru_X, validation_one_ru_y = split_x_y(df=validation_one_ru, target='returnShipment')

train_one_nmu_X, train_one_nmu_y = split_x_y(df=train_one_nmu, target='returnShipment')
validation_one_nmu_X, validation_one_nmu_y = split_x_y(df=validation_one_nmu, target='returnShipment')

train_one_ro_X, train_one_ro_y = split_x_y(df=train_one_ro, target='returnShipment')
validation_one_ro_X, validation_one_ro_y = split_x_y(df=validation_one_ro, target='returnShipment')

# 3% class balance
train_three_ru_X, train_three_ru_y = split_x_y(df=train_three_ru, target='returnShipment')
validation_three_ru_X, validation_three_ru_y = split_x_y(df=validation_three_ru, target='returnShipment')

train_three_nmu_X, train_three_nmu_y = split_x_y(df=train_three_nmu, target='returnShipment')
validation_three_nmu_X, validation_three_nmu_y = split_x_y(df=validation_three_nmu, target='returnShipment')

train_three_ro_X, train_three_ro_y = split_x_y(df=train_three_ro, target='returnShipment')
validation_three_ro_X, validation_three_ro_y = split_x_y(df=validation_three_ro, target='returnShipment')

# 5% class balance
train_five_ru_X, train_five_ru_y = split_x_y(df=train_five_ru, target='returnShipment')
validation_five_ru_X, validation_five_ru_y = split_x_y(df=validation_five_ru, target='returnShipment')

train_five_nmu_X, train_five_nmu_y = split_x_y(df=train_five_nmu, target='returnShipment')
validation_five_nmu_X, validation_five_nmu_y = split_x_y(df=validation_five_nmu, target='returnShipment')

train_five_ro_X, train_five_ro_y = split_x_y(df=train_five_ro, target='returnShipment')
validation_five_ro_X, validation_five_ro_y = split_x_y(df=validation_five_ro, target='returnShipment')

# 25% class balance
train_twenty_five_ru_X, train_twenty_five_ru_y = split_x_y(df=train_twenty_five_ru, target='returnShipment')
validation_twenty_five_ru_X, validation_twenty_five_ru_y = split_x_y(df=validation_twenty_five_ru, target='returnShipment')

train_twenty_five_nmu_X, train_twenty_five_nmu_y = split_x_y(df=train_twenty_five_nmu, target='returnShipment')
validation_twenty_five_nmu_X, validation_twenty_five_nmu_y = split_x_y(df=validation_five_nmu, target='returnShipment')

train_twenty_five_ro_X, train_twenty_five_ro_y = split_x_y(df=train_twenty_five_ro, target='returnShipment')
validation_twenty_five_ro_X, validation_twenty_five_ro_y = split_x_y(df=validation_twenty_five_ro, target='returnShipment')

# 50% class balance
train_fifty_ru_X, train_fifty_ru_y = split_x_y(df=train_fifty_ru, target='returnShipment')
validation_fifty_ru_X, validation_fifty_ru_y = split_x_y(df=validation_fifty_ru, target='returnShipment')

train_fifty_nmu_X, train_fifty_nmu_y = split_x_y(df=train_fifty_nmu, target='returnShipment')
validation_fifty_nmu_X, validation_fifty_nmu_y = split_x_y(df=validation_fifty_nmu, target='returnShipment')

train_fifty_ro_X, train_fifty_ro_y = split_x_y(df=train_fifty_ro, target='returnShipment')
validation_fifty_ro_X, validation_fifty_ro_y = split_x_y(df=validation_fifty_ro, target='returnShipment')

# Test
test_X, test_y = split_x_y(df=test, target='returnShipment')

In [5]:
# XGBoost classification

In [16]:
xgb_cl_one_ru = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_one_ru.fit(train_one_ru_X, train_one_ru_y)
printmd('**1% class balance random under**')
preds_validation = xgb_cl_one_ru.predict(validation_one_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_one_ru_y, preds_validation), 2)}')
preds_test = xgb_cl_one_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**1% class balance random under**

Accuracy validation: 0.62
Accuracy test: 0.6


In [18]:
xgb_cl_one_nmu = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_one_nmu.fit(train_one_nmu_X, train_one_nmu_y)
printmd('**1% class balance near miss under**')
preds_validation = xgb_cl_one_nmu.predict(validation_one_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_one_nmu_y, preds_validation), 2)}')
preds_test = xgb_cl_one_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**1% class balance near miss under**

Accuracy validation: 0.86
Accuracy test: 0.51


In [19]:
xgb_cl_one_ro = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_one_ro.fit(train_one_ro_X, train_one_ro_y)
printmd('**1% class balance random over**')
preds_validation = xgb_cl_one_ro.predict(validation_one_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_one_ro_y, preds_validation), 2)}')
preds_test = xgb_cl_one_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**1% class balance random over**

Accuracy validation: 0.88
Accuracy test: 0.54


In [20]:
xgb_cl_three_ru = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_three_ru.fit(train_three_ru_X, train_three_ru_y)
printmd('**3% class balance random under**')
preds_validation = xgb_cl_three_ru.predict(validation_three_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_three_ru_y, preds_validation), 2)}')
preds_test = xgb_cl_three_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**3% class balance random under**

Accuracy validation: 0.62
Accuracy test: 0.62


In [21]:
xgb_cl_three_nmu = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_three_nmu.fit(train_three_nmu_X, train_three_nmu_y)
printmd('**3% class balance near miss under**')
preds_validation = xgb_cl_three_nmu.predict(validation_three_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_three_nmu_y, preds_validation), 2)}')
preds_test = xgb_cl_three_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**3% class balance near miss under**

Accuracy validation: 0.82
Accuracy test: 0.53


In [22]:
xgb_cl_three_ro = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_three_ro.fit(train_three_ro_X, train_three_ro_y)
printmd('**3% class balance random over**')
preds_validation = xgb_cl_three_ro.predict(validation_three_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_three_ro_y, preds_validation), 2)}')
preds_test = xgb_cl_three_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**3% class balance random over**

Accuracy validation: 0.78
Accuracy test: 0.57


In [23]:
xgb_cl_five_ru = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_five_ru.fit(train_five_ru_X, train_five_ru_y)
printmd('**5% class balance random under**')
preds_validation = xgb_cl_five_ru.predict(validation_five_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_five_ru_y, preds_validation), 2)}')
preds_test = xgb_cl_five_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**5% class balance random under**

Accuracy validation: 0.63
Accuracy test: 0.63


In [24]:
xgb_cl_five_nmu = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_five_nmu.fit(train_five_nmu_X, train_five_nmu_y)
printmd('**5% class balance near miss under**')
preds_validation = xgb_cl_five_nmu.predict(validation_five_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_five_nmu_y, preds_validation), 2)}')
preds_test = xgb_cl_five_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**5% class balance near miss under**

Accuracy validation: 0.81
Accuracy test: 0.53


In [26]:
xgb_cl_five_ro = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_five_ro.fit(train_five_ro_X, train_five_ro_y)
printmd('**5% class balance random over**')
preds_validation = xgb_cl_five_ro.predict(validation_five_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_five_ro_y, preds_validation), 2)}')
preds_test = xgb_cl_five_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**5% class balance random over**

Accuracy validation: 0.74
Accuracy test: 0.6


In [28]:
xgb_cl_twenty_five_ru = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_twenty_five_ru.fit(train_twenty_five_ru_X, train_twenty_five_ru_y)
printmd('**25% class balance random under**')
preds_validation = xgb_cl_twenty_five_ru.predict(validation_twenty_five_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_twenty_five_ru_y, preds_validation), 2)}')
preds_test = xgb_cl_twenty_five_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**25% class balance random under**

Accuracy validation: 0.65
Accuracy test: 0.64


In [48]:
xgb_cl_twenty_five_nmu = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_twenty_five_nmu.fit(train_twenty_five_nmu_X, train_twenty_five_nmu_y)
printmd('**25% class balance near miss under**')
preds_validation = xgb_cl_twenty_five_nmu.predict(validation_twenty_five_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_twenty_five_nmu_y, preds_validation), 2)}')
preds_test = xgb_cl_twenty_five_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**25% class balance near miss under**

Accuracy validation: 0.8
Accuracy test: 0.58


In [31]:
xgb_cl_twenty_five_ro = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_twenty_five_ro.fit(train_twenty_five_ro_X, train_twenty_five_ro_y)
printmd('**25% class balance random over**')
preds_validation = xgb_cl_twenty_five_ro.predict(validation_twenty_five_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_twenty_five_ro_y, preds_validation), 2)}')
preds_test = xgb_cl_twenty_five_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**25% class balance random over**

Accuracy validation: 0.67
Accuracy test: 0.64


In [32]:
xgb_cl_fifty_ru = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_fifty_ru.fit(train_fifty_ru_X, train_fifty_ru_y)
printmd('**50% class balance random under**')
preds_validation = xgb_cl_fifty_ru.predict(validation_fifty_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_fifty_ru_y, preds_validation), 2)}')
preds_test = xgb_cl_fifty_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**50% class balance random under**

Accuracy validation: 0.65
Accuracy test: 0.65


In [33]:
xgb_cl_fifty_nmu = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_fifty_nmu.fit(train_fifty_nmu_X, train_fifty_nmu_y)
printmd('**50% class balance near miss under**')
preds_validation = xgb_cl_fifty_nmu.predict(validation_fifty_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_fifty_nmu_y, preds_validation), 2)}')
preds_test = xgb_cl_fifty_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**50% class balance near miss under**

Accuracy validation: 0.65
Accuracy test: 0.64


In [34]:
xgb_cl_fifty_ro = xgb.XGBClassifier(eval_metric='error', use_label_encoder=False)
xgb_cl_fifty_ro.fit(train_fifty_ro_X, train_fifty_ro_y)
printmd('**50% class balance random over**')
preds_validation = xgb_cl_fifty_ro.predict(validation_fifty_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_fifty_ro_y, preds_validation), 2)}')
preds_test = xgb_cl_fifty_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**50% class balance random over**

Accuracy validation: 0.65
Accuracy test: 0.64


In [None]:
# Random forest classification

In [51]:
rf_cl_one_ru = RandomForestClassifier()
rf_cl_one_ru.fit(train_one_ru_X, train_one_ru_y.to_numpy().flatten())
printmd('**1% class balance random under**')
preds_validation = rf_cl_one_ru.predict(validation_one_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_one_ru_y, preds_validation), 2)}')
preds_test = rf_cl_one_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**1% class balance random under**

Accuracy validation: 0.62
Accuracy test: 0.62


In [49]:
rf_cl_one_nmu = RandomForestClassifier()
rf_cl_one_nmu.fit(train_one_nmu_X, train_one_nmu_y.to_numpy().flatten())
printmd('**1% class balance near miss under**')
preds_validation = rf_cl_one_nmu.predict(validation_one_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_one_nmu_y, preds_validation), 2)}')
preds_test = rf_cl_one_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**1% class balance near miss under**

Accuracy validation: 0.86
Accuracy test: 0.5


In [50]:
rf_cl_one_ro = RandomForestClassifier()
rf_cl_one_ro.fit(train_one_ro_X, train_one_ro_y.to_numpy().flatten())
printmd('**1% class balance random over**')
preds_validation = rf_cl_one_ro.predict(validation_one_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_one_ro_y, preds_validation), 2)}')
preds_test = rf_cl_one_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**1% class balance random over**

Accuracy validation: 1.0
Accuracy test: 0.5


In [52]:
rf_cl_three_ru = RandomForestClassifier()
rf_cl_three_ru.fit(train_three_ru_X, train_three_ru_y.to_numpy().flatten())
printmd('**3% class balance random under**')
preds_validation = rf_cl_three_ru.predict(validation_three_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_three_ru_y, preds_validation), 2)}')
preds_test = rf_cl_three_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**3% class balance random under**

Accuracy validation: 0.63
Accuracy test: 0.63


In [53]:
rf_cl_three_nmu = RandomForestClassifier()
rf_cl_three_nmu.fit(train_three_nmu_X, train_three_nmu_y.to_numpy().flatten())
printmd('**3% class balance near miss under**')
preds_validation = rf_cl_three_nmu.predict(validation_three_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_three_nmu_y, preds_validation), 2)}')
preds_test = rf_cl_three_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**3% class balance near miss under**

Accuracy validation: 0.81
Accuracy test: 0.51


In [54]:
rf_cl_three_ro = RandomForestClassifier()
rf_cl_three_ro.fit(train_three_ro_X, train_three_ro_y.to_numpy().flatten())
printmd('**3% class balance random over**')
preds_validation = rf_cl_three_ro.predict(validation_three_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_three_ro_y, preds_validation), 2)}')
preds_test = rf_cl_three_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**3% class balance random over**

Accuracy validation: 1.0
Accuracy test: 0.5


In [56]:
rf_cl_five_ru = RandomForestClassifier()
rf_cl_five_ru.fit(train_five_ru_X, train_five_ru_y.to_numpy().flatten())
printmd('**5% class balance random under**')
preds_validation = rf_cl_five_ru.predict(validation_five_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_five_ru_y, preds_validation), 2)}')
preds_test = rf_cl_five_ru.predict(test_X)
print(f'Accurcay test: {round(accuracy_score(test_y, preds_test), 2)}')

**5% class balance random under**

Accuracy validation: 0.62
Accurcay test: 0.62


In [57]:
rf_cl_five_nmu = RandomForestClassifier()
rf_cl_five_nmu.fit(train_five_nmu_X, train_five_nmu_y.to_numpy().flatten())
printmd('**5% class balance near miss under**')
preds_validation = rf_cl_five_nmu.predict(validation_five_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_five_nmu_y, preds_validation), 2)}')
preds_test = rf_cl_five_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**5% class balance near miss under**

Accuracy validation: 0.8
Accuracy test: 0.51


In [58]:
rf_cl_five_ro = RandomForestClassifier()
rf_cl_five_ro.fit(train_five_ro_X, train_five_ro_y.to_numpy().flatten())
printmd('**5% class balance random over**')
preds_validation = rf_cl_five_ro.predict(validation_five_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_five_ro_y, preds_validation), 2)}')
preds_test = rf_cl_five_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**5% class balance random over**

Accuracy validation: 1.0
Accuracy test: 0.5


In [59]:
rf_cl_twenty_five_ru = RandomForestClassifier()
rf_cl_twenty_five_ru.fit(train_twenty_five_ru_X, train_twenty_five_ru_y.to_numpy().flatten())
printmd('**25% class balance random under**')
preds_validation = rf_cl_twenty_five_ru.predict(validation_twenty_five_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_twenty_five_ru_y, preds_validation), 2)}')
preds_test = rf_cl_twenty_five_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**25% class balance random under**

Accuracy validation: 0.65
Accuracy test: 0.63


In [61]:
rf_cl_twenty_five_nmu = RandomForestClassifier()
rf_cl_twenty_five_nmu.fit(train_twenty_five_nmu_X, train_twenty_five_nmu_y.to_numpy().flatten())
printmd('**25% class balance near miss under**')
preds_validation = rf_cl_twenty_five_nmu.predict(validation_twenty_five_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_twenty_five_nmu_y, preds_validation), 2)}')
preds_test = rf_cl_twenty_five_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**25% class balance near miss under**

Accuracy validation: 0.86
Accuracy test: 0.59


In [62]:
rf_cl_twenty_five_ro = RandomForestClassifier()
rf_cl_twenty_five_ro.fit(train_twenty_five_ro_X, train_twenty_five_ro_y.to_numpy().flatten())
printmd('**25% class balance random over**')
preds_validation = rf_cl_twenty_five_ro.predict(validation_twenty_five_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_twenty_five_ro_y, preds_validation), 2)}')
preds_test = rf_cl_twenty_five_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**25% class balance random over**

Accuracy validation: 0.89
Accuracy test: 0.57


In [63]:
rf_cl_fifty_ru = RandomForestClassifier()
rf_cl_fifty_ru.fit(train_fifty_ru_X, train_fifty_ru_y.to_numpy().flatten())
printmd('**50% class balance random under**')
preds_validation = rf_cl_fifty_ru.predict(validation_fifty_ru_X)
print(f'Accuracy validation: {round(accuracy_score(validation_fifty_ru_y, preds_validation), 2)}')
preds_test = rf_cl_fifty_ru.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**50% class balance random under**

Accuracy validation: 0.66
Accuracy test: 0.64


In [65]:
rf_cl_fifty_nmu = RandomForestClassifier()
rf_cl_fifty_nmu.fit(train_fifty_nmu_X, train_fifty_nmu_y.to_numpy().flatten())
printmd('**50% class balance near miss under**')
preds_validation = rf_cl_fifty_nmu.predict(validation_fifty_nmu_X)
print(f'Accuracy validation: {round(accuracy_score(validation_fifty_nmu_y, preds_validation), 2)}')
preds_test = rf_cl_fifty_nmu.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**50% class balance near miss under**

Accuracy validation: 0.66
Accuracy test: 0.64


In [66]:
rf_cl_fifty_ro = RandomForestClassifier()
rf_cl_fifty_ro.fit(train_fifty_ro_X, train_fifty_ro_y.to_numpy().flatten())
printmd('**50% class balance random over**')
preds_validation = rf_cl_fifty_ro.predict(validation_fifty_ro_X)
print(f'Accuracy validation: {round(accuracy_score(validation_fifty_ro_y, preds_validation), 2)}')
preds_test = rf_cl_fifty_ro.predict(test_X)
print(f'Accuracy test: {round(accuracy_score(test_y, preds_test), 2)}')

**50% class balance random over**

Accuracy validation: 0.65
Accuracy test: 0.64
