How do data balancing methods affect model quality?

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import (KFold, RandomizedSearchCV,
                                     cross_val_score, train_test_split)
from sklearn.tree import DecisionTreeClassifier

# Root directory of each dataset split.
train_dir = Path('./data/train')
test_dir = Path('./data/test')
val_dir = Path('./data/validation')
aug_dir = Path('./data/augmented/')

# Recursively collect the image files below each root. Only the augmented
# directory is additionally searched for the ".jpeg" extension.
train_filepaths = [*train_dir.rglob('*.jpg')]
test_filepaths = [*test_dir.rglob('*.jpg')]
val_filepaths = [*val_dir.rglob('*.jpg')]
aug_filepaths = [*aug_dir.rglob('*.jpg'), *aug_dir.rglob('*.jpeg')]

# Pool every split into one flat list of paths (train, test, validation, augmented).
data = [*train_filepaths, *test_filepaths, *val_filepaths, *aug_filepaths]

In [None]:
def paths_to_dataframe(path, random_state=42):
    """Build a shuffled DataFrame of file paths and their class labels.

    The label of a file is the name of the directory that directly contains
    it, e.g. ``data/train/apple/img.jpg`` is labelled ``apple``.

    Parameters
    ----------
    path : sequence of str or pathlib.Path
        File paths to index.
    random_state : int or None, default 42
        Seed for the row shuffle, so that repeated runs (and any seeded
        sampling done on the result) are reproducible. Pass ``None`` for an
        unseeded shuffle.

    Returns
    -------
    pandas.DataFrame
        Columns ``Path`` and ``Label`` (both strings), rows shuffled and
        the index reset to ``0..n-1``.
    """
    path_strings = [str(p) for p in path]
    # Parent directory name == class label; os.path handles every separator
    # the platform accepts and does not fail on a bare file name.
    labels = [os.path.basename(os.path.dirname(p)) for p in path_strings]

    df = pd.DataFrame({'Path': path_strings, 'Label': labels})

    return df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    
# Index every collected image path together with its class label.
data_df = paths_to_dataframe(path=data)

In [None]:
# Work on a 60% random subset of the indexed images (fixed sampling seed).
# The subset is drawn from a freshly built frame rather than from `data_df`
# itself, so re-running this cell does not shrink the data again
# (60% of 60% of ...).
data_df = paths_to_dataframe(data).sample(frac=0.6, random_state=42)
data_df.shape

In [None]:
# Cross-validation configuration: k shuffled folds with a fixed seed.
# (A single hold-out split via train_test_split is not used.)
k = 5
kf = KFold(shuffle=True, random_state=42, n_splits=k)

# Image paths and class labels as NumPy arrays; the labels are later selected
# with the index arrays produced by `kf.split`.
X, y = (np.array(data_df[column]) for column in ('Path', 'Label'))

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def metrics(y_test, y_pred):
    """Print and return overall and per-class classification metrics.

    Precision, recall and F1 are computed with ``average='weighted'``.
    The overall accuracy is also written to ``./wynik.txt``; the file is
    opened in write mode, so every call overwrites the previous value.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned position by position with ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, class_accuracy)`` where
        ``class_accuracy`` maps each class present in ``y_test`` to the
        accuracy computed on the samples of that class only.
    """
    # Convert once so boolean masks and positional indexing work for lists
    # and pandas Series as well as for NumPy arrays.
    y_true = np.asarray(y_test)
    y_hat = np.asarray(y_pred)

    accuracy = accuracy_score(y_true, y_hat)
    precision = precision_score(y_true, y_hat, average='weighted')
    recall = recall_score(y_true, y_hat, average='weighted')
    f1 = f1_score(y_true, y_hat, average='weighted')

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1: ", f1)

    classification_rep = classification_report(y_true, y_hat)
    print("Classification Report:")
    print(classification_rep)

    class_accuracy = {}
    for cls in np.unique(y_true):
        # Restrict both vectors to the samples whose true label is `cls`.
        mask = y_true == cls
        class_accuracy[cls] = accuracy_score(y_true[mask], y_hat[mask])

    file_path = './wynik.txt'
    with open(file_path, 'w') as file:
        file.write(str(accuracy))

    return accuracy, precision, recall, f1, class_accuracy

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from tensorflow.keras.applications import MobileNetV2

def nested_dichotomy(X_train, y_train, X_test, balancer=None,
                     default_label='tomato', random_state=42):
    """Classify ``X_test`` with one binary decision tree per class.

    For every distinct label in ``y_train`` a ``DecisionTreeClassifier`` is
    trained to separate that class (target 1) from all other classes
    (target 0), optionally after resampling the binary problem with
    ``balancer``. A test sample receives the label of the last tree (in
    sorted label order) that predicts 1 for it, or ``default_label`` when no
    tree does.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    X_test : array-like
        Feature matrix to classify.
    balancer : object or None, default None
        Resampler exposing ``fit_resample(X, y)``; ``None`` trains on the
        data as given.
    default_label : default 'tomato'
        Label assigned when no binary tree claims a sample.
    random_state : int or None, default 42
        Seed passed to every ``DecisionTreeClassifier``.

    Returns
    -------
    results : list
        One predicted label per row of ``X_test``.
    predictions : list of numpy.ndarray
        Per tree, an object array holding the tree's positive label where it
        predicted 1 and ``None`` elsewhere.
    classifiers : list of tuple
        ``(fitted_tree, positive_label)`` pairs in training order.
    """
    y_train = np.asarray(y_train)
    unique_labels = np.unique(y_train)
    classifiers = []

    for label in unique_labels:
        # 1 for samples of the current class, 0 for every other class.
        y_binary = np.where(y_train == label, 1, 0)

        if balancer is not None:
            X_fit, y_fit = balancer.fit_resample(X_train, y_binary)
        else:
            # The tree must be fit on the binary target here as well: fitting
            # it on the multi-class labels would make the `== 1` test on its
            # predictions below never match.
            X_fit, y_fit = X_train, y_binary

        tree = DecisionTreeClassifier(random_state=random_state)
        tree.fit(X_fit, y_fit)
        classifiers.append((tree, label))
        print(label)  # progress: class whose tree has just been trained

    predictions = []
    results = np.full(len(X_test), default_label, dtype=object)
    for tree, positive_label in classifiers:
        is_positive = np.asarray(tree.predict(X_test)) == 1
        predictions.append(np.where(is_positive, positive_label, None))
        # Later trees overwrite earlier ones, so the last positive vote wins.
        results[is_positive] = positive_label

    return results.tolist(), predictions, classifiers

In [None]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

def extract_features(img_path, model, preprocess=None):
    """Return the flattened output of ``model`` for a single image.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, preprocessed and passed through ``model.predict``.

    Parameters
    ----------
    img_path : str or path-like
        Image file to load.
    model : object
        Model exposing ``predict`` (this notebook passes a MobileNetV2
        instance).
    preprocess : callable or None, default None
        Function applied to the image batch before prediction. ``None``
        selects MobileNetV2's ``preprocess_input``, matching the model used in
        this notebook; pass the matching function when another architecture
        is used (e.g. ``vgg16.preprocess_input`` for VGG16).

    Returns
    -------
    numpy.ndarray
        1-D feature vector.
    """
    if preprocess is None:
        # The module-level `preprocess_input` comes from VGG16, which
        # normalises pixels differently from what MobileNetV2 expects.
        from tensorflow.keras.applications.mobilenet_v2 import (
            preprocess_input as mobilenet_preprocess,
        )
        preprocess = mobilenet_preprocess

    img = image.load_img(img_path, target_size=(224, 224))
    batch = np.expand_dims(image.img_to_array(img), axis=0)
    # verbose=0: avoid one progress bar per image when called in a loop.
    features = model.predict(preprocess(batch), verbose=0)
    return features.flatten()

In [None]:
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks, RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from tensorflow.keras.applications import MobileNetV2


def experiment_loop():
    """Cross-validate the per-class tree classifier under each balancer.

    Features for every image path in the global ``X`` are extracted once with
    an ImageNet-pretrained MobileNetV2 (no top, average pooling). For each
    fold of the global ``kf`` and each balancing strategy,
    ``nested_dichotomy`` is trained on the fold's training part and its
    predictions for the validation part are scored with ``metrics``.

    Returns
    -------
    list of list of tuple
        ``scores[fold]`` holds one
        ``(accuracy, precision, recall, f1, class_accuracy)`` tuple per
        balancer, in the order None, RandomOverSampler, SMOTE,
        RandomUnderSampler, TomekLinks, SMOTETomek.
    """
    seed = 42
    samplers = {"None": None}
    samplers["RandomOverSampler"] = RandomOverSampler(random_state=seed)
    samplers["SMOTE"] = SMOTE(random_state=seed)
    samplers["RandomUnderSampler"] = RandomUnderSampler(random_state=seed)
    samplers["TomekLinks"] = TomekLinks()
    samplers["SMOTETomek"] = SMOTETomek(random_state=seed)

    # One (initially empty) result list per fold.
    per_fold_scores = [[] for _ in range(k)]

    # Extract the image features once, up front, and reuse them in every fold.
    backbone = MobileNetV2(weights='imagenet', include_top=False, pooling='avg')
    feature_rows = []
    for img_path in X:
        feature_rows.append(extract_features(img_path, backbone))
    features = np.array(feature_rows)

    for fold_no, (train_idx, val_idx) in enumerate(kf.split(X)):
        train_features = features[train_idx]
        val_features = features[val_idx]
        train_labels = y[train_idx]
        val_labels = y[val_idx]

        for sampler_name, sampler in samplers.items():
            print(sampler_name)
            predicted_labels, _, _ = nested_dichotomy(
                train_features, train_labels, val_features, sampler
            )
            fold_metrics = metrics(val_labels, predicted_labels)
            per_fold_scores[fold_no].append(fold_metrics)

    return per_fold_scores

# Run the cross-validated comparison; scores[fold] holds one
# (accuracy, precision, recall, f1, class_accuracy) tuple per balancer.
scores = experiment_loop()


In [None]:
# from imblearn.combine import SMOTETomek
# from imblearn.under_sampling import TomekLinks, RandomUnderSampler
# from imblearn.over_sampling import RandomOverSampler, SMOTE
# from tensorflow.keras.applications import MobileNetV2

# balancers = (RandomOverSampler(random_state=42), SMOTE(random_state=42), 
#             RandomUnderSampler(random_state=42), TomekLinks(), SMOTETomek(random_state=42))

# mobilenet_model = MobileNetV2(weights='imagenet', include_top=False, pooling='avg')
# X_train = np.array([extract_features(img_path, mobilenet_model) for img_path in train_df['Path']])
# X_test = np.array([extract_features(img_path, mobilenet_model) for img_path in test_df['Path']])

# models_dict = {}

# for balancer in balancers:
#     print(str(balancer))
#     results, pred, models = nested_dichotomy(X_train, y_train, X_test, balancer)
#     accuracy, precision, recall, f1, class_accuracy = metrics(y_test, results)

#     models_dict[str(balancer)] = (results, pred, models, accuracy, precision, recall, f1, class_accuracy)

In [None]:
# `experiment_loop()` returns, for every fold, one
# (accuracy, precision, recall, f1, class_accuracy) tuple per balancer, in the
# order the balancers are evaluated there. `models_dict` is rebuilt here from
# those cross-validation scores (averaged over the folds) so that this cell
# runs on a fresh kernel instead of relying on variables that only exist
# inside `experiment_loop`.
balancer_keys = [
    'None',
    'RandomOverSampler(random_state=42)',
    'SMOTE(random_state=42)',
    'RandomUnderSampler(random_state=42)',
    'TomekLinks()',
    'SMOTETomek(random_state=42)',
]

models_dict = {}
for position, key in enumerate(balancer_keys):
    fold_entries = [fold_scores[position] for fold_scores in scores]
    accuracy, precision, recall, f1 = (
        float(np.mean([entry[m] for entry in fold_entries])) for m in range(4)
    )

    # A class may be missing from some validation folds, so average each
    # class over the folds in which it actually occurs.
    per_class = {}
    for entry in fold_entries:
        for cls, value in entry[4].items():
            per_class.setdefault(cls, []).append(value)
    class_accuracy = {cls: float(np.mean(per_class[cls])) for cls in sorted(per_class)}

    # Tuple layout: (results, pred, models, accuracy, precision, recall, f1,
    # class_accuracy). The first three are not returned by experiment_loop,
    # hence None.
    models_dict[key] = (None, None, None, accuracy, precision, recall, f1, class_accuracy)

# (panel title suffix, models_dict key) for each balancing method to plot.
panels = [
    ('RandomOverSampler', 'RandomOverSampler(random_state=42)'),
    ('SMOTE', 'SMOTE(random_state=42)'),
    ('RandomUnderSampler', 'RandomUnderSampler(random_state=42)'),
    ('TomekLinks', 'TomekLinks()'),
    ('SMOTETomek', 'SMOTETomek(random_state=42)'),
]

plt.figure(figsize=(18, 6))

for slot, (panel_title, key) in enumerate(panels, start=1):
    class_accuracy = models_dict[key][7]  # index 7 == class_accuracy
    plt.subplot(2, 3, slot)
    plt.bar(list(class_accuracy.keys()), list(class_accuracy.values()), color='skyblue')
    plt.title('Accuracy for Each Class ({})'.format(panel_title))
    plt.xlabel('Classes')
    plt.ylabel('Accuracy')
    plt.xticks(rotation=90, ha='right')
    plt.ylim(0, 1)

plt.tight_layout()
plt.show()

In [None]:
methods = ['RandomOverSampler', 'SMOTE', 'RandomUnderSampler', 'TomekLinks()', 'SMOTETomek']
# Named `metric_names` (not `metrics`) so that the `metrics()` scoring function
# defined earlier in the notebook is not overwritten by a list.
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['skyblue', 'lightgreen', 'salmon', 'orange', 'orchid']

rand_over_res = models_dict['RandomOverSampler(random_state=42)']
smote_res = models_dict['SMOTE(random_state=42)']
rand_under_res = models_dict['RandomUnderSampler(random_state=42)']
tomek_res = models_dict['TomekLinks()']
smotetomek_res = models_dict['SMOTETomek(random_state=42)']

# Result tuples in the same order as `methods`.
method_results = [rand_over_res, smote_res, rand_under_res, tomek_res, smotetomek_res]

# In each result tuple, positions 3..6 hold accuracy, precision, recall and
# F1, i.e. the entries of `metric_names` in order.
metrics_data = {
    name: [result[position] for result in method_results]
    for position, name in enumerate(metric_names, start=3)
}

plt.figure(figsize=(14, 10))

for i, metric in enumerate(metric_names):
    ax = plt.subplot(2, 2, i+1)
    bars = plt.bar(methods, metrics_data[metric], color=colors)
    plt.title('Average ' + metric)
    plt.xlabel('Balancing method')
    plt.ylabel(metric)
    plt.ylim(0, 1)
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Write the rounded value just above each bar.
    for bar in bars:
        height = bar.get_height()
        ax.annotate('{}'.format(round(height, 2)),
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3),
                    textcoords="offset points",
                    ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Tabulate the plotted values: one row per balancing method, one column per metric.
df = pd.DataFrame(data=metrics_data, index=methods)
print("Metrics Data:", df, sep="\n")