How parameter selection affects model quality?

#### Preproccesing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path

train_dir = Path('./data/train')
train_filepaths = list(train_dir.glob(r'**/*.jpg'))

test_dir = Path('./data/test')
test_filepaths = list(test_dir.glob(r'**/*.jpg'))

val_dir = Path('./data/validation')
val_filepaths = list(val_dir.glob(r'**/*.jpg'))

In [2]:
def paths_to_dataframe(path):
    labels = []
    for i in range(len(path)):
        labels.append(str(path[i]).split(os.sep)[-2])

    labels = pd.Series(labels, name='Label')
    path = pd.Series(path, name='Path').astype(str)

    df = pd.concat([path, labels], axis=1)

    df = df.sample(frac=1).reset_index(drop = True)

    return df
    
train_df = paths_to_dataframe(train_filepaths)
test_df = paths_to_dataframe(test_filepaths)
val_df = paths_to_dataframe(val_filepaths)

test_df.head()
print()




In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

def metrics(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("Accuracy: ", accuracy)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1: ", f1)

    classification_rep = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(classification_rep)

    classes = np.unique(y_test)
    class_accuracy = {}
    for cls in classes:
        indices = np.where(y_test == cls)[0]
        class_accuracy[cls] = accuracy_score(y_test[indices], np.array(y_pred)[indices])

    return accuracy, precision, recall, f1, class_accuracy

In [4]:
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

def extract_features(img_path, model):
    img = image.load_img(img_path, target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    features = model.predict(x)
    return features.flatten()

y_test = test_df['Label']
y_train = train_df['Label']

#### Nested Dichotomy

In [13]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV

def nested_dichotomy(X_train, y_train, X_test, param_grid, method):
    unique_labels = np.unique(y_train)
    classifiers = []

    for i in unique_labels:
        y_binary = np.where(y_train == i, 1, 0)

        balancer = RandomOverSampler(random_state=42)
        X_train_balanced, y_train_balanced = balancer.fit_resample(X_train, y_binary)

        if method == 'GridSearchCV':
            search_method = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1)
        elif method == 'RandomizedSearchCV': 
            search_method = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_grid, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)
        elif method == 'BayesSearchCV':
            search_method = BayesSearchCV(DecisionTreeClassifier(), param_grid, n_iter=10, cv=5, random_state=42, n_jobs=-1)

        search_method.fit(X_train_balanced, y_train_balanced)
        classifiers.append((search_method, i))
        print(i)

    predictions = []
    for tree, positive_label in classifiers:
        binary_prediction = tree.best_estimator_.predict(X_test)
        predictions.append(np.where(binary_prediction == 1, positive_label, None))

    results = ['tomato' for _ in range(len(X_test))]
    for i in range(len(predictions)):
        for j in range(len(X_test)):
            if predictions[i][j] is not None:
                results[j] = unique_labels[i]
        

    return results, predictions, classifiers

In [6]:
from tensorflow.keras.applications import MobileNetV2

mobilenet_model = MobileNetV2(weights='imagenet', include_top=False, pooling='avg')
X_train = np.array([extract_features(img_path, mobilenet_model) for img_path in train_df['Path']])
X_test = np.array([extract_features(img_path, mobilenet_model) for img_path in test_df['Path']])

  mobilenet_model = MobileNetV2(weights='imagenet', include_top=False, pooling='avg')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 400ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

In [15]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10],
}

param_methods = (GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy', n_jobs=-1),
                 RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_grid, n_iter=10, cv=5, scoring='accuracy', n_jobs=-1, random_state=42),
                 BayesSearchCV(DecisionTreeClassifier(), param_grid, n_iter=10, cv=5, random_state=42, n_jobs=-1),)

results_dict = {}

for method in param_methods:
    print( method.__class__.__name__)
    results, pred, models = nested_dichotomy(X_train, y_train, X_test, param_grid, method.__class__.__name__)
    accuracy, precision, recall, f1, class_accuracy = metrics(y_test, results)

    results_dict[method.__class__.__name__] = {
    'results': results,
    'pred': pred,
    'models': models,
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1': f1,
    'class_accuracy': class_accuracy,
}

GridSearchCV
apple
banana
beetroot
bell pepper
cabbage
capsicum
carrot
cauliflower
chilli pepper
corn
cucumber
eggplant
garlic
ginger
grapes
jalepeno
kiwi
lemon
lettuce
mango
onion
orange
paprika
pear
peas
pineapple
pomegranate
potato
raddish
soy beans
spinach
sweetcorn
sweetpotato
tomato
turnip
watermelon
Accuracy:  0.9491017964071856
Precision:  0.9617210384425952
Recall:  0.9491017964071856
F1:  0.9473758113026017
Classification Report:
               precision    recall  f1-score   support

        apple       1.00      0.56      0.71         9
       banana       1.00      0.78      0.88         9
     beetroot       0.91      1.00      0.95        10
  bell pepper       1.00      1.00      1.00         9
      cabbage       0.91      1.00      0.95        10
     capsicum       1.00      1.00      1.00         9
       carrot       1.00      1.00      1.00         7
  cauliflower       1.00      0.89      0.94         9
chilli pepper       1.00      1.00      1.00         7
     

In [16]:
print(results_dict['GridSearchCV']['accuracy'])
print(results_dict['RandomizedSearchCV']['accuracy'])
print(results_dict['BayesSearchCV']['accuracy'])

0.9491017964071856
0.9461077844311377
0.9491017964071856
