<a href="https://colab.research.google.com/github/TheKerbecs/DeepLearning25/blob/main/week_1/CIFAR10-ShallowLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab 1: CIFAR10 Challenge

**CIFAR10** (http://www.cs.toronto.edu/~kriz/cifar.html) is one of the most famous ML data sets.

## Data
* 32x32 color images
* in 10 classes
* 50k training images
* 10k test images



<img src="https://production-media.paperswithcode.com/datasets/CIFAR-10-0000000431-b71f61c0_U5n3Glr.jpg" width=700>

In [1]:
#get data
from keras.datasets import cifar10
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step


In [2]:
#traindata: 50k 32X32 rgb images
X_train.shape

(50000, 32, 32, 3)

In [3]:
#labels
y_train

array([[6],
       [9],
       [9],
       ...,
       [9],
       [1],
       [1]], dtype=uint8)

## Task: build the best classifier (with feature extration) using the methods you know from ML1+2
* work in small teams (2-4)
* use NumPy pre-processing, feature extraction and hyer-parameter tuning in Scikit-Learn
* no Neural Networks!
* best test F1-Score winns!

In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import time
from sklearn.decomposition import PCA
import cv2
from skimage.feature import hog
from tqdm import tqdm

In [5]:
def extract_features(images):
    features = []

    features = images.reshape(images.shape[0], -1)
    print(f"Feature shape: {features.shape}")
    return features

In [6]:
X_train_features = extract_features(X_train)
X_test_features = extract_features(X_test)

Feature shape: (50000, 3072)
Feature shape: (10000, 3072)


In [7]:
def extractcolor_features(images):
    features = []

    features = images.reshape(images.shape[0],-1, 3)
    print(f"Feature shape: {features.shape}")
    return features

In [24]:
import cv2
def extract_hog_features(images):
    features = []
    for image in tqdm(images):
        # Convert the image to grayscale
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Calculate HOG features
        hog_features = hog(gray_image, orientations=9, pixels_per_cell=(8, 8),
                    cells_per_block=(2, 2))

        features.append(hog_features)

    features = np.array(features)
    print(f"Feature shape: {features.shape}")
    return features


In [25]:
X_train_hog_features = extract_hog_features(X_train)
X_test_hog_features = extract_hog_features(X_test)

100%|██████████| 50000/50000 [00:24<00:00, 2022.95it/s]


Feature shape: (50000, 324)


100%|██████████| 10000/10000 [00:04<00:00, 2264.36it/s]


Feature shape: (10000, 324)


In [8]:
def dataframewithpixel(images):
    # DataFrame erstellen
    df = pd.DataFrame(images.reshape(images.shape[0], -1))

    # Spaltennamen setzen (optional, aber empfehlenswert)
    spaltennamen = [f'Pixel_{i}_{farbe}' for i in range(32*32) for farbe in ['R', 'G', 'B']]
    df.columns = spaltennamen
    return df

In [9]:
X_train_colorfeatures = extractcolor_features(X_train)
X_test_colorfeatures = extractcolor_features(X_test)

Feature shape: (50000, 1024, 3)
Feature shape: (10000, 1024, 3)


In [10]:
X_train_df = dataframewithpixel(X_train_colorfeatures)
X_test_df = dataframewithpixel(X_test_colorfeatures)


In [11]:
def berechne_helligkeit(rgb_array):
    """Berechnet die Helligkeit für ein Array von RGB-Werten."""
    return np.mean(rgb_array, axis=1)  # Mittelwert über die RGB-Kanäle

def berechne_saettigung(rgb_array):
    """Berechnet die Sättigung für ein Array von RGB-Werten."""
    min_rgb = np.min(rgb_array, axis=1)
    max_rgb = np.max(rgb_array, axis=1)
    saettigung = 1 - (min_rgb / max_rgb)
    saettigung[max_rgb == 0] = 0  # Vermeide Division durch 0
    return saettigung

def berechne_farbdifferenz(rgb_array1, rgb_array2):
    """Berechnet die Farbdifferenz zwischen zwei Arrays von RGB-Werten."""
    return np.sqrt(np.sum((rgb_array1 - rgb_array2)**2, axis=1))

def addnewfetures(df):
  # Alle Pixelindizes ermitteln
  pixel_indizes = set([int(spalte.split('_')[1]) for spalte in df.columns if spalte.startswith('Pixel_')])

  # Neue Features für Helligkeit, Sättigung und Farbdifferenz erstellen
  neue_features = []

  for pixel_index in pixel_indizes:
      # RGB-Werte für den aktuellen Pixel extrahieren
      rgb_werte = df[[f'Pixel_{pixel_index}_R', f'Pixel_{pixel_index}_G', f'Pixel_{pixel_index}_B']].values

      # Helligkeit, Sättigung und Farbdifferenz berechnen
      helligkeit = berechne_helligkeit(rgb_werte)
      saettigung = berechne_saettigung(rgb_werte)

      # Farbdifferenz zum nächsten Pixel (Beispiel)
      if pixel_index < max(pixel_indizes):
          next_pixel_index = pixel_index + 1
          next_rgb_werte = df[[f'Pixel_{next_pixel_index}_R', f'Pixel_{next_pixel_index}_G', f'Pixel_{next_pixel_index}_B']].values
          farbdifferenz = berechne_farbdifferenz(rgb_werte, next_rgb_werte)
      else:
          farbdifferenz = np.zeros(len(df))  # Letzter Pixel, keine Farbdifferenz

      # Neue Features als Series erstellen und in der Liste speichern
      neue_features.extend([
          pd.Series(helligkeit, name=f'Pixel_{pixel_index}_Helligkeit'),
          pd.Series(saettigung, name=f'Pixel_{pixel_index}_Saettigung'),
          pd.Series(farbdifferenz, name=f'Pixel_{pixel_index}_Farbdifferenz')
      ])

  # Alle neuen Features mit dem ursprünglichen DataFrame verketten
  df = pd.concat([df, pd.DataFrame(neue_features).T], axis=1)
  return df

In [12]:
X_train_df_newfeatures = addnewfetures(X_train_df)
X_test_df_newfeatures = addnewfetures(X_test_df)

  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_rgb / max_rgb)
  saettigung = 1 - (min_r

In [16]:
from google.colab import drive
drive.mount('/content/drive')

pfad = '/content/drive/My Drive/X_test_df_hex.csv'
X_test_df_hex.to_csv(pfad, index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from google.colab import drive
drive.mount('/content/drive')

pfad = '/content/drive/My Drive/mein_dataframe.csv'
df.pd.read_csv(pfad)

In [28]:
from sklearn.model_selection import GridSearchCV

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [100,250],         # Number of trees
    'max_depth': [None,50 ],                 # Maximum depth of each tree
    'max_features': ['sqrt', 'log2'],           # Number of features to consider when looking for the best split
    'criterion': ["gini","entropy","log_loss"]
}

# Set up the GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, scoring='f1', n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train_hog_features, y_train)

# Best parameters and model
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Evaluate the best model on the test set
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test_features)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Fitting 5 folds for each of 24 candidates, totalling 120 fits




KeyboardInterrupt: 

In [27]:


rf_classifier = RandomForestClassifier(
    n_estimators= 200,
    n_jobs=-1,
    random_state=42
)

rf_classifier.fit(X_train_hog_features, y_train)

# Evaluate the model
print("Evaluating model...")
y_pred = rf_classifier.predict(X_test_hog_features)

# Calculate metrics
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1:.4f}")

# Print detailed classification report
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

  return fit_method(estimator, *args, **kwargs)


Evaluating model...
F1 Score: 0.5216

Classification Report:
              precision    recall  f1-score   support

    airplane       0.62      0.62      0.62      1000
  automobile       0.59      0.67      0.63      1000
        bird       0.50      0.37      0.43      1000
         cat       0.39      0.28      0.33      1000
        deer       0.44      0.46      0.45      1000
         dog       0.41      0.46      0.43      1000
        frog       0.50      0.65      0.57      1000
       horse       0.60      0.53      0.57      1000
        ship       0.61      0.60      0.60      1000
       truck       0.57      0.60      0.59      1000

    accuracy                           0.53     10000
   macro avg       0.52      0.53      0.52     10000
weighted avg       0.52      0.53      0.52     10000



In [None]:
from sklearn.feature_selection import RFE

X_train_all = np.concatenate((X_train_df_newfeatures, X_train_hog_features), axis=1)
X_test_all = np.concatenate((X_test_df_newfeatures, X_test_hog_features), axis=1)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rfe = RFE(estimator=rf_classifier, n_features_to_select=200)

rfe.fit(X_train_all, y_train)

selected_features = df.columns[rfe.support_]
print(selected_features)

df_selected = X_train_all[selected_features]
df_testselected = X_test_all[selected_features]

rf_classifier.fit(df_selected, y_train)

print("Evaluating model...")
y_pred = rf_classifier.predict(df_testselected)

# Calculate metrics
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score: {f1:.4f}")

# Print detailed classification report
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=class_names))

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **