# Image classification

### Feature extraction using VGG16 and classification using gradient-boosted decision trees (LightGBM)

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import cv2
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
from keras.utils import plot_model

#### Preparation of the training datasets

In [None]:
# Define the data augmentation functions
def random_rotation(image):
    """Rotate ``image`` around its centre by a random angle.

    The angle is drawn uniformly from [-20, 20) and the scale factor is 1.
    The result has the same width and height as the input; areas uncovered
    by the rotation are filled by reflecting the image at its border.
    """
    angle = np.random.uniform(-20, 20)
    rows, cols = image.shape[:2]
    centre = (cols / 2, rows / 2)
    matrix = cv2.getRotationMatrix2D(centre, angle, 1)
    return cv2.warpAffine(image, matrix, (cols, rows), borderMode=cv2.BORDER_REFLECT)

def random_shift(image):
    """Translate ``image`` by a random offset of at most 5% of its size.

    The horizontal offset is drawn first, then the vertical one, each as a
    uniform fraction in [-0.05, 0.05) of the image width / height. The output
    keeps the input's width and height. No border mode is passed, so
    ``cv2.warpAffine`` fills the uncovered area with its default.
    """
    rows, cols = image.shape[0], image.shape[1]
    dx = np.random.uniform(-0.05, 0.05) * cols
    dy = np.random.uniform(-0.05, 0.05) * rows
    matrix = np.float32([[1, 0, dx], [0, 1, dy]])
    return cv2.warpAffine(image, matrix, (cols, rows))

def random_shear(image):
    """Apply a small random horizontal shear to ``image``.

    The shear factor is drawn uniformly from [-0.05, 0.05) and placed in the
    x-from-y slot of the affine matrix. The output keeps the input's width
    and height; uncovered areas are filled by reflecting the image border.
    """
    factor = np.random.uniform(-0.05, 0.05)
    rows, cols = image.shape[:2]
    matrix = np.float32([
        [1, factor, 0],
        [0, 1, 0],
    ])
    return cv2.warpAffine(image, matrix, (cols, rows), borderMode=cv2.BORDER_REFLECT)

def horizontal_flip(image):
    """Return ``image`` flipped with OpenCV flip code 1."""
    mirrored = cv2.flip(image, 1)
    return mirrored

In [None]:
# Path toward the training data (keep exactly one line uncommented)
#data_path = '/kaggle/input/img-dataset/img_spamassassin/train'

#data_path = '/kaggle/input/img-dataset/img_fusion/train'
#data_path = '/kaggle/input/img-dataset/img_fusion2/train'

#data_path = '/kaggle/input/img-dataset/img_duo/train'
#data_path = '/kaggle/input/img-dataset/img_duo2/train'
data_path = '/kaggle/input/img-dataset/img_duo3/train'

# Initialization of the lists for the images (X) and the labels (y)
X = []
y = []

# Every sub-folder of data_path is one label. Stray files at the top level are
# ignored, and folders/files are visited in sorted order because os.listdir
# returns entries in arbitrary order, which would make the sample order (and
# therefore the seeded train/validation split) differ between runs.
mail_categories = sorted(
    entry for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
)
label_encoder = LabelEncoder()

# Files OpenCV could not decode; reported once after the loop
skipped_files = []

for category in tqdm(mail_categories):
    # Get the list of the images in this label's folder
    category_dir = os.path.join(data_path, category)
    files = sorted(os.listdir(category_dir))

    for file in files:
        file_path = os.path.join(category_dir, file)

        # cv2.imread returns None (it does not raise) when the file is missing,
        # unreadable or not an image; cv2.resize would then fail on None.
        image = cv2.imread(file_path)
        if image is None:
            skipped_files.append(file_path)
            continue
        image = cv2.resize(image, (224, 224))

        # Each transformation is applied independently with probability 0.5
        if np.random.rand() < 0.5:
            image = random_rotation(image)
        if np.random.rand() < 0.5:
            image = random_shift(image)
        if np.random.rand() < 0.5:
            image = random_shear(image)
        if np.random.rand() < 0.5:
            image = horizontal_flip(image)

        # Add the image to the list X and the label to the list y
        X.append(image)
        y.append(category)

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable file(s), e.g. {skipped_files[0]}")
if not X:
    raise ValueError(f"No readable images found under {data_path}")

# Convert the lists X and y to numpy arrays
X = np.array(X)
y = np.array(y)

In [None]:
# Fit the encoder on the category names, then one-hot encode the integer ids
y = to_categorical(label_encoder.fit_transform(y))

# Hold out 20% of the samples for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Training subset: shape of the image array, then of the label array
print(np.shape(X_train), np.shape(y_train))

# Validation subset: same two shapes
print(np.shape(X_val), np.shape(y_val))

In [None]:
import matplotlib.pyplot as plt

# Show the first image of the training subset.
# The images were loaded with cv2.imread, which stores colour channels in BGR
# order, while Matplotlib interprets a 3-channel array as RGB. Reversing the
# last axis restores the true colours for display only; X_train is unchanged.
plt.imshow(X_train[0, :, :, ::-1])
plt.axis('off')
plt.show()

#### Feature extraction

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D
import tensorflow as tf
import numpy as np
import lightgbm as lgb


# Check for available GPUs and let TensorFlow allocate their memory on demand
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as error:
        # Memory growth must be set before the GPU is initialised, so this
        # raises when the cell is re-run in a live session; that is harmless.
        print(f"Could not enable memory growth on {gpu.name}: {error}")

# Build the feature extractor whether or not a GPU is present. Previously the
# model was only created inside the `if gpus:` branch, so every later cell
# failed with a NameError on CPU-only machines.
with tf.device('/GPU:0' if gpus else '/CPU:0'):
    # VGG16 convolutional base with ImageNet weights, without the dense head
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Add Global Average Pooling layer to reduce each feature map to one value
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    model = Model(inputs=base_model.input, outputs=x)

        
# Subfunction to extract features using VGG16
def extract_features(images, chunk_size=256):
    """Return the pooled VGG16 features of ``images``.

    The images are expected as produced by the loading cells of this notebook:
    224x224 colour images read with ``cv2.imread``, i.e. channels in BGR order.
    They are converted to RGB and passed through the VGG16 ``preprocess_input``
    function before prediction, which is the input format the ImageNet weights
    were trained with (the original code fed raw pixel values to the network).

    Args:
        images: Array-like of images, first axis is the sample axis.
        chunk_size: Number of images converted to float and preprocessed at a
            time, so the whole dataset is never duplicated in float32.

    Returns:
        2-D numpy array with one feature row per input image.

    Raises:
        ValueError: If ``images`` is empty.
    """
    # Local import so this function does not depend on edits to the import cell
    from tensorflow.keras.applications.vgg16 import preprocess_input

    images = np.asarray(images)
    if len(images) == 0:
        raise ValueError("extract_features() received no images")

    feature_chunks = []
    for start in tqdm(range(0, len(images), chunk_size)):
        # Reverse the channel axis (BGR -> RGB); astype makes a fresh float
        # copy, so preprocess_input never modifies the caller's array.
        rgb_chunk = images[start:start + chunk_size, ..., ::-1].astype(np.float32)
        feature_chunks.append(model.predict(preprocess_input(rgb_chunk), verbose=0))

    return np.concatenate(feature_chunks, axis=0)


# Save a diagram of the feature extractor, annotated with layer names and tensor shapes
plot_model(
    model,
    to_file='vgg_model.png',
    show_shapes=True,
    show_layer_names=True,
)


# Run the training subset, then the validation subset, through the extractor
X_train_features = extract_features(X_train)
X_val_features = extract_features(X_val)

In [None]:
# Report the shape of the extracted feature matrices
print("Training features:", np.shape(X_train_features))
print("Validation features:", np.shape(X_val_features))

In [None]:
# Collapse the one-hot rows back into integer class ids
y_train = y_train.argmax(axis=1)
y_val = y_val.argmax(axis=1)

In [None]:
# Count how often each class id occurs in the training subset
unique_elements, counts = np.unique(y_train, return_counts=True)

for position, value in enumerate(unique_elements):
    count = counts[position]
    print(f"The label {value} is found {count} times in the training subset.")

#### Classification using LightGBM

In [None]:
"""
from sklearn.model_selection import GridSearchCV

# Use of grid search for hyperparameters finetuning
# num_iterations does not matter because we'll be using early stopping
param_grid = {
    'num_iterations': 2500,
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 5, 7]
}

lgb_model = lgb.LGBMClassifier()
grid_search = GridSearchCV(lgb_model, param_grid, cv=3)
grid_search.fit(X_train_features, y_train)

print("Best parameters:", grid_search.best_params_)
"""

In [None]:
"""
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

# use of random search for hyperparameters finetuning
param_dist = {
    'num_iterations': 2500,
    'learning_rate': uniform(0.001, 0.2),
    'max_depth': randint(3, 10)
}

lgb_model = lgb.LGBMClassifier()
random_search = RandomizedSearchCV(lgb_model, param_distributions=param_dist, n_iter=10, cv=3)
random_search.fit(X_train_features, y_train)

print("Best parameters:", random_search.best_params_)
"""

In [None]:
# Definition of the LightGBM model using the finetuned hyperparameters.
# n_estimators is the scikit-learn-API name of num_iterations; it is only an
# upper bound here because early stopping decides the actual number of rounds.
lgb_model = lgb.LGBMClassifier(
    n_estimators=2500,
    learning_rate=0.1241,
    max_depth=4,
)

# Training of the model on the features extracted from the training subset and
# evaluation of the model using the validation subset.
# - eval_metric is an argument of fit(), not of the constructor, where it was
#   not a recognised model parameter. 'logloss' is the cross-entropy loss and
#   is mapped by LightGBM to the binary or multi-class variant as appropriate.
# - Early stopping is requested through a callback: the early_stopping_rounds
#   keyword of fit() is no longer accepted by LightGBM 4.
lgb_model.fit(
    X_train_features, y_train,
    eval_set=[(X_val_features, y_val), (X_train_features, y_train)],
    eval_metric='logloss',
    callbacks=[lgb.early_stopping(stopping_rounds=200)],
)

In [None]:
# Mean accuracy of the fitted classifier on the held-out validation features
accuracy = lgb_model.score(X=X_val_features, y=y_val)
print(f"Accuracy of the LightGBM model on the validation subset using the extracted features : {accuracy}")

In [None]:
import sklearn.datasets, sklearn.model_selection

# Plot how the metric recorded on the evaluation sets evolved during training
lgb.plot_metric(booster=lgb_model)

In [None]:
# Show the first two decision trees of the boosted ensemble, one per subplot row
fig, ax = plt.subplots(nrows=2, figsize=(16,8), sharex=True)
for tree_index, axis in enumerate(ax):
    lgb.plot_tree(lgb_model, tree_index=tree_index, dpi=300, ax=axis)

#### Testing the model's performance

In [None]:
import os
import cv2
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm


# Path toward the test data (keep exactly one line uncommented)
#data_path = '/kaggle/input/img-dataset/img_spamassassin/test'

#data_path = '/kaggle/input/img-dataset/img_duo/test'
#data_path = '/kaggle/input/img-dataset/img_duo2/test'
data_path = '/kaggle/input/img-dataset/img_duo3/test'

#data_path = '/kaggle/input/img-dataset/img_fusion/test'
#data_path = '/kaggle/input/img-dataset/img_fusion2/test'


# Initialization of the list for the images (X) and the labels (y)
X_test = []
y_test = []

# Every sub-folder of data_path is one label; stray top-level files are
# ignored and entries are visited in sorted order for a stable sample order.
mail_categories = sorted(
    entry for entry in os.listdir(data_path)
    if os.path.isdir(os.path.join(data_path, entry))
)

# Files OpenCV could not decode; reported once after the loop
skipped_files = []

for category in tqdm(mail_categories):
    # Get the list of the images in this label's folder
    category_dir = os.path.join(data_path, category)
    files = sorted(os.listdir(category_dir))

    for file in files:
        file_path = os.path.join(category_dir, file)

        # cv2.imread returns None (it does not raise) for unreadable files;
        # cv2.resize would then fail on None.
        image = cv2.imread(file_path)
        if image is None:
            skipped_files.append(file_path)
            continue
        image = cv2.resize(image, (224, 224))

        # No random transformation here: augmenting the test images would make
        # the reported metrics change from run to run and measure the model on
        # distorted inputs instead of the real test data.

        # Add the image to the list X and the label to the list y
        X_test.append(image)
        y_test.append(category)

if skipped_files:
    print(f"Skipped {len(skipped_files)} unreadable file(s), e.g. {skipped_files[0]}")
if not X_test:
    raise ValueError(f"No readable images found under {data_path}")


# Convert the lists X and y to numpy arrays
X_test = np.array(X_test)
y_test = np.array(y_test)


# Encode the labels with the encoder already fitted on the training labels, so
# a class always gets the same id as during training. Refitting on the test
# labels would silently renumber the classes if the test folders differ, while
# transform() raises on a category that was never seen in training.
y_test = label_encoder.transform(y_test)
# Transform the labels into one-hot vectors, one column per training class
y_test = to_categorical(y_test, num_classes=len(label_encoder.classes_))

In [None]:
# Run the test images through the VGG16 extractor and turn the one-hot test
# labels back into integer class ids
X_test_features = extract_features(X_test)
y_test = y_test.argmax(axis=1)

In [None]:
# Mean accuracy of the fitted classifier on the test features
accuracy = lgb_model.score(X=X_test_features, y=y_test)
print(f"Accuracy of the LightGBM model on the test dataset with the extracted features : {accuracy}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

# Class names in encoded order: LabelEncoder stores its classes sorted, and
# position i in classes_ is the name of integer label i. Deriving the names
# from the encoder (instead of hard-coding them) guarantees that each row,
# column and report line is attributed to the right class, for any number of
# classes.
target_names = [str(name) for name in label_encoder.classes_]
class_ids = np.arange(len(target_names))

predictions = lgb_model.predict(X_test_features)

# Show the confusion matrix; `labels` keeps one row/column per known class even
# if a class is absent from the test labels and from the predictions
conf_matrix = confusion_matrix(y_test, predictions, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

# Get the values of different evaluation metrics
print('Classification Report')
print(classification_report(y_test, predictions, labels=class_ids, target_names=target_names, digits=4))