# Image Classification

First, we import the required libraries.

In [1]:
# Dependencies for the whole notebook: file handling, image I/O, feature
# extraction, preprocessing and the SVM classifier.
import os
# skimage is used to read and write images in the flip-augmentation cell.
from skimage.io import imread, imsave
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
# NOTE(review): cuML's SVC presumably needs a RAPIDS/CUDA GPU environment — confirm.
from cuml.svm import SVC
# OpenCV is used for grayscale loading, resizing and the HOG descriptor.
import cv2
# gc is called explicitly after large arrays are deleted.
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
print("Load environment successfully")

Load environment successfully


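Next, we double the sample size by flipping every training image horizontally. The flipped images are saved to a new folder, and the labels for the original and flipped images are saved together in a new file.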

In [3]:
# Augmentation step: mirror every labelled training image left-to-right, store
# the copies in a separate folder, and write a label file that covers both the
# originals and the mirrored copies.
SRC_DIR = 'train_ims'
FLIPPED_DIR = 'train_ims_rever'
FLIP_PREFIX = 'reverse_'  # marks a mirrored copy; the feature-extraction cell keys on it

# create a new folder to store the flipped images
os.makedirs(FLIPPED_DIR, exist_ok=True)

# read the original label file
df = pd.read_csv('train.csv')

# Take the image list from the label file instead of os.listdir(): a directory
# listing also returns stray entries (hidden files, sub-folders) that imread
# cannot open, and only images named in the label file are ever used later on.
image_names = df['im_name'].tolist()

# flip images horizontally, add the prefix and save them to the new folder
for img_name in image_names:
    image = imread(os.path.join(SRC_DIR, img_name))
    image_reversed = np.fliplr(image)  # flip horizontally
    reversed_img_path = os.path.join(FLIPPED_DIR, FLIP_PREFIX + img_name)
    imsave(reversed_img_path, image_reversed)

print("All images have been horizontally flipped and saved to 'train_ims_rever' folder with 'reverse_' prefix.")

# create the labels for the flipped images: same label, prefixed file name
df_reversed = df.copy()
df_reversed['im_name'] = FLIP_PREFIX + df_reversed['im_name']

# combine the original and flipped labels
df_combined = pd.concat([df, df_reversed], ignore_index=True)

# save the combined label file
df_combined.to_csv('train_combined.csv', index=False)

print("Labels for flipped images have been added to 'train_combined.csv'.")

  imsave(reversed_img_path, image_reversed)


All images have been horizontally flipped and saved to 'train_ims_rever' folder with 'reverse_' prefix.
Labels for flipped images have been added to 'train_combined.csv'.


Load the labels and image names, encode the labels, and define the optimal HOG parameters.

In [2]:
# Load the combined label table (original + flipped images).
df = pd.read_csv('train_combined.csv')
image_names = df['im_name'].values

# Turn the class labels into integer codes; `le` is kept so that predictions
# can be mapped back to the original labels later.
le = LabelEncoder()
labels = le.fit_transform(df['label'].values)

# Chosen preprocessing / HOG / SVM settings.
best_params = dict(
    resize_size=128,
    pixels_per_cell=(16, 16),
    cells_per_block=(4, 4),
    orientations=10,
    C=10,
    kernel='rbf',
)

# Derive the descriptor geometry from best_params. Sizes are passed with
# index [1] first and index [0] second.
win_side = best_params['resize_size']
cell_size = (best_params['pixels_per_cell'][1], best_params['pixels_per_cell'][0])
block_size = (
    best_params['cells_per_block'][1] * best_params['pixels_per_cell'][1],
    best_params['cells_per_block'][0] * best_params['pixels_per_cell'][0],
)

hog = cv2.HOGDescriptor(
    _winSize=(win_side, win_side),
    _blockSize=block_size,
    _blockStride=cell_size,  # blocks advance by one cell at a time
    _cellSize=cell_size,
    _nbins=best_params['orientations'],
)
print("Successfully read the file")


Successfully read the file


Read each image from its folder (original or flipped) and extract its HOG features.
Standardize the features so that each one has a mean of 0 and a standard deviation of 1, which improves the performance and training speed of the model.
Finally, train the SVM model.

In [3]:
# Build the HOG feature matrix for every training image, standardize it and
# fit the SVM. Produces `scaler` and `svm`, which the prediction cell reuses.
features = []
print("Start extracting features...")
for img_name in image_names:
    # Flipped copies carry the 'reverse_' prefix and live in their own folder.
    # Test the prefix, not a substring, so that an original whose name merely
    # contains 'reverse_' is still looked up in 'train_ims'.
    if img_name.startswith('reverse_'):
        img_path = os.path.join('train_ims_rever', img_name)
    else:
        img_path = os.path.join('train_ims', img_name)
    image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread returns None instead of raising when a file is missing or
        # unreadable; fail here with the path rather than inside cv2.resize.
        raise FileNotFoundError(f"Could not read training image: {img_path}")
    image = cv2.resize(image, (best_params['resize_size'], best_params['resize_size']))
    hog_features = hog.compute(image).flatten()
    features.append(hog_features)

features = np.array(features)
print("features size: ", features.shape)
# Free the loop temporaries before the large arrays below are created.
del image, hog_features
gc.collect()

# NOTE(review): the 15% hold-out is assigned to `_` and never scored, so no
# validation metric is produced here. The flipped copies are added before the
# split, so a validation score on this split would see mirrored twins of
# training images.
X_train, _, y_train, _ = train_test_split(features, labels, test_size=0.15, random_state=42)
del features
gc.collect()

print('Start preprocessing data...')
# Fit the scaler on the training features only; it is reused for the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
del X_train
gc.collect()

# NOTE(review): probability=True is what triggers the "Random state is
# currently ignored by probabilistic SVC" warning. Only predict() is used
# later, so it may be unnecessary — confirm before changing, as it could
# affect the predictions.
svm = SVC(C=best_params['C'], kernel=best_params['kernel'], probability=True, random_state=42)
svm.fit(X_train_scaled, y_train)
print("Already trained SVM model")
del X_train_scaled, y_train
# The trailing semicolon stops the notebook from echoing gc.collect()'s return
# value as the cell output.
gc.collect();

Start extracting features...
features size:  (100000, 4000)
Start preprocessing data...
[W] [16:27:43.832240] Random state is currently ignored by probabilistic SVC
Already trained SVM model


770

Read the image names from the test file and load the test images. Use the same HOG descriptor to extract their features, standardize them with the scaler fitted on the training data, and predict their labels.
Save the result to test.csv.

In [4]:
# Predict labels for the test images with the trained pipeline
# (hog -> scaler -> svm) and write them back to the test file.

# read the test label file
test_names_df = pd.read_csv('test.csv')
test_names = test_names_df['im_name'].values

# Use the same input size as training instead of a hard-coded (128, 128), so
# the two cells cannot drift apart if best_params['resize_size'] changes.
input_size = (best_params['resize_size'], best_params['resize_size'])

test_features = []
for test_name in test_names:
    test_path = os.path.join('test_ims', test_name)
    img = cv2.imread(test_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None instead of raising when a file is missing or
        # unreadable; fail here with the path rather than inside cv2.resize.
        raise FileNotFoundError(f"Could not read test image: {test_path}")
    img = cv2.resize(img, input_size)
    # Flatten exactly as in training so each image gives one 1-D feature row.
    # Without it, a column-vector result would make the stacked array 3-D.
    hog_features = hog.compute(img).flatten()
    test_features.append(hog_features)

test_features = np.array(test_features)
print("Already get test features")

# Reuse the scaler fitted on the training features; do not refit on test data.
test_features_scaled = scaler.transform(test_features)

print("Before prediction")
test_predictions = svm.predict(test_features_scaled)
print("After prediction")

# save the predictions to a csv file
# NOTE(review): this overwrites the input file 'test.csv' in place.
test_names_df['label'] = le.inverse_transform(test_predictions)
test_names_df.to_csv('test.csv', index=False)

print("Predictions saved to test.csv")

Already get test features
Before prediction
After prediction
Predictions saved to test.csv
