# COMP5318 Assignment 1 - Clothes Classification

**Name**: Xing Xing <br />
**Title**: Comp5318 Assignment1 <br />
**ID**: 500390560 <br />
**Start Date**: 31 Aug 2021 <br />
**Finish Date**: 10 Sep 2021 <br />

**Applied algorithms**: <br />
    1. KNN, <br />
    2. SVM, <br />
    3. Logistic regression

# 1. Import Packages and system information

In [25]:
# import cv2
import pandas as pd
import numpy as np
import platform
import h5py
import time
import math

# ignore warnings for clean output 
import warnings
warnings.filterwarnings('ignore')

#===================================
# Hardware and software information 
#===================================
print("===== System Information =====")

import platform
import psutil

info = platform.uname()

print("- System: Mac " + info.system)
print("- Machine: "+info.machine)
print("- Processor: " + info.processor)





===== System Information =====
- System: Mac Darwin
- Machine: x86_64
- Processor: i386


# 2. Load Data
load_train_data(): Load 30000 training data    <br />
load_train_label(): Load 30000 training labels <br />
load_test_data(): Load all 5000 test data    <br />
load_test_labels(): Load all 5000 test labels    <br />

In [3]:
# load training data from ./Input/train

def load_train_data():
    with h5py.File('./Input/train/images_training.h5','r') as H:
        train_data = np.copy(H['datatrain'])
    return train_data

def load_train_label():
    with h5py.File('./Input/train/labels_training.h5','r') as H:
        train_label= np.copy(H['labeltrain'])
    return train_label

# load testing data from ./Input/test

def load_test_data():
    with h5py.File('./Input/test/images_testing.h5','r') as H:
        test_data = np.copy(H['datatest'])
    return test_data
        
def load_test_label():
    with h5py.File('./Input/test/labels_testing_2000.h5','r') as H:
        test_label= np.copy(H['labeltest'])
    return test_label


#===========
# load data
#===========
print("========================================")
print(" load_data(): Loading data")

# load testing data
test_data = load_test_data()
test_label = load_test_label()

# load training data
train_data = load_train_data()
train_label = load_train_label()

print(" load_data(): Loading complete")
print("========================================")

 load_data(): Loading data
 load_data(): Loading complete


# 3. Preprocessing and preparation functions

Preprocessing Functions:<br />
    1. show_one_image(row, n): showing 1d np array as an n*n image with pyplot <br />
    2. STD(data): standardise entire data <br />
    3. NORM(data): normalisation of entire data<br /> 
    4. PCA(data, n): dimension deduction to <br />
    5. preprocessing: data -> STD data -> NORM data -> PCA data -> ready to train

In [9]:
# Show an image
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer

#=================
# Show Image
#=================

# Showing one image
def show_one_img(row, n = 28):
    # reshape to n*n 2d np array
    img = np.reshape(row, (n, n))
    plt.imshow(img, cmap=plt.get_cmap('gray'))
    plt.show()
    
#=================
# Preprocessing
#=================

# Standardise entire data
def STD(data):
    print("   - STD (): start standardise ...")
    scaler = StandardScaler()
    scaler.fit(data)
    return scaler.transform(data)

# Normalise entire data 
def NORM(data):
    print("   - NORM (): start normalise ...")
    transformer = Normalizer().fit(data)
    return transformer.transform(data)

#=====================
# Dimension deduction
#=====================

# PCA dimension deduction with n*n size
def PCA_deduction(data, n):
    if (n > 1):
        pca = PCA(n_components = n).fit(data)
        data = pca.fit_transform(data)
    return pca

# find best pca parameter
def find_best_pca():
    # find out best n_componenets for PCA
    n_comp = [0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98]
    explain_list = []
    dim_list = []
    for i in n_comp:
        pca = PCA(n_components = i).fit(train_data)
        # cumsum variance explained 
        explain_list.append(np.sum(pca.explained_variance_))
        dim_list.append(pca.n_components_)

    # show elbow image
    plt.plot(dim_list, explain_list, color = "Blue", linewidth = 1.5)
    plt.show()
    
    # As we could see from image below, the elbow position is between 

#==========================
# Preprocess training data
#==========================
print("===================================================")
print("preprocessing(): start preprocessing train_data ...")

# Generate pca deductor = pca
# (deduct train and test seperately prevent data leak)

# standardisation on train_data
std_train_data = STD(train_data)

# normalisation on train_data
norm_train_data = NORM(std_train_data)

print("   - PCA(): start generting pca n = 0.90")
pca = PCA(n_components = 0.90).fit(norm_train_data)

print("   - PCA(): start PCA transform on train")
clean_train_data = pca.transform(norm_train_data)

#==========================
# Preprocess testing data
#==========================
print("preprocessing(): start preprocessing test_data ...")

# standardisation on test_data
std_test_data = STD(test_data)

# normalisation on test_data
norm_test_data = NORM(std_test_data)

# pca deduction on test_data
print("   - PCA(): start PCA transform")
clean_test_data = pca.transform(norm_test_data)


# check data shape
print("preprocessing(): clean up ...")
print("   - clean_train_data shape:" + str(clean_train_data.shape))
print("   - clean_test_data shape:" + str(clean_test_data.shape))

# finish preprocessing 
print("preprocessing(): preprocessing complete ...")
print("===================================================")



preprocessing(): start preprocessing train_data ...
   - STD (): start standardise ...
   - NORM (): start normalise ...
   - PCA(): start generting pca n = 0.90
   - PCA(): start PCA transform on train
preprocessing(): start preprocessing test_data ...
   - STD (): start standardise ...
   - NORM (): start normalise ...
   - PCA(): start PCA transform
preprocessing(): clean up ...
   - clean_train_data shape:(30000, 131)
   - clean_test_data shape:(5000, 131)
preprocessing(): preprocessing complete ...


# 4. Training Models 
1. KNN, <br />
2. SVM, <br />
3. Logistic Regression, <br />

In [10]:
# Helper functions and libraries 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

# 4.1 KNN

In [12]:
from sklearn.neighbors import KNeighborsClassifier

#================
# KNN Classifier 
#================
def KNN(train_data, train_label, test_data, test_label, para):
    # record start time
    start_time = time.time()
    
    # build classifier and training
    knn = KNeighborsClassifier(n_neighbors = para['n_neighbors'], metric = para['metric'], weights = para['weights'])
    knn.fit(train_data, train_label)
    
     # testing
    result = knn.predict(test_data)
    
    # record end time
    end_time = time.time()
    # record runing time
    print("KNN(): time cost = " + str(round((end_time - start_time)/60, 2)) + " Minutes...")
    
    # individual model result
    score = accuracy_score(result[0:2000], test_label)
    print("KNN(): test result: " + str(score*100) + "%\n")
    
    return result



In [8]:
#============================
# KNN Stuning hyperparameter
#============================

# find best KNN parameters use grid search with 10-fold stratified cross validation
def KNN_stun(train_data, train_label):
    print("=============================")
    print("KNN_stun(): Start KNN stun...")
    
    start_time = time.time()
    
    # select possible and reasonable parameters
    K = list(range(3,9))
    metric = ["minkowski","euclidean"]
    weights = ["distance", "uniform"]
    
    # aggregate parameters (K, metric, weight)
    param = { 'n_neighbors': K, 'metric': metric, 'weights': weights }
    
    # generate empty classifier 
    knn = KNeighborsClassifier(n_jobs = 8)
    
    # cv as parameter is default stratified n fold cross validation 
    GS_CV = GridSearchCV(estimator=knn, param_grid=param, scoring='accuracy', cv = 10, refit=True)
    
    # train models with different parameters 
    GS_CV.fit(train_data, train_label)
    
    # record
    end_time = time.time()
    print("KNN_stun(): time cost = " + str(round((end_time - start_time)/60, 2)) + " Minutes...")
    print("KNN_stun(): ")
    
    # Print result of the best parameter
    print("    Best score: " + str(GS_CV.best_score_))
    print("    Best parameter: " + str(GS_CV.best_params_))
    print("    Best index: " + str(GS_CV.best_index_))
    
    return GS_CV.best_params_

#========================
# Find KNN hyper parameter 
#========================
KNN_best_para = KNN_stun(clean_train_data, train_label)


KNN_stun(): Start KNN stun...
KNN_stun(): time cost = 6.28 Minutes...
KNN_stun(): 
    Best score: 0.8644333333333334
    Best parameter: {'metric': 'minkowski', 'n_neighbors': 6, 'weights': 'distance'}
    Best index: 6


# 4.2 SVM

In [16]:
# SVM
from sklearn.svm import SVC

# SVM predict on test data
def svm(train_data, train_label, test_data, test_label, para):
    # record start time
    start_time = time.time()
    
    # build classifier and training
    svm = SVC(C = para['C'], kernel = para['kernel'], gamma = para['gamma'], max_iter = 2000)
    svm.fit(train_data,train_label)
    
    # testing
    result = svm.predict(test_data[0:2000])
    
    # record end time
    end_time = time.time()
    # record runing time
    print("svm(): time cost = " + str(round((end_time - start_time)/60, 2)) + " Minutes...")
    
    # individual model result
    score = accuracy_score(result[0:2000], test_label)
    print("svm(): test result: " + str(score) + "% \n")
    
    return result

In [14]:
#============================
# SVM Stuning hyperparameter
#============================

# find best SVM parameters use grid search with 10-fold stratified cross validation
def svm_stun(clean_train_data, train_label):
    print("=============================")
    print("svm_stun(): Start svm stun...")
    
    # Record start time
    start_time = time.time()
    
    # Only keep the parameter set for last stun 
    # select reasonable parameters and aggregate (C, gamma, kernel)
    param_grid = {'C': [2.7, 2.75, 2.775], 
             'gamma': [2.6, 2.7, 2.8] ,
             'kernel': ['rbf']}
    
    # cv as parameter is default stratified n fold cross validation 
    # maximum iteration = 1500 times
    GS_CV = GridSearchCV(SVC(max_iter = 1500),param_grid, cv = 10, refit=True)
    GS_CV.fit(clean_train_data, train_label)
    
    # Record finish time
    end_time = time.time()
    
    # calculate time cost
    print("svm_stun(): time cost = " + str(round((end_time - start_time)/60, 2)) + " Minutes...")
    print("svm_stun(): result: ...")
    
    # print result
    print("    Best score: " + str(GS_CV.best_score_))
    print("    Best parameter: " + str(GS_CV.best_params_))
    print("    Best index: " + str(GS_CV.best_index_))
    
    return GS_CV.best_params_

# find hyper parameter of svm
SVM_best_para = svm_stun(clean_train_data, train_label)


svm_stun(): Start svm stun...
svm_stun(): time cost = 38.68 Minutes...
svm_stun(): result: ...
    Best score: 0.9019999999999999
    Best parameter: {'C': 2.75, 'gamma': 2.8, 'kernel': 'rbf'}
    Best index: 2


# 4.3 Logistic Regression 

In [14]:
# Logistic regression

from sklearn.linear_model import LogisticRegression

def logistic_regression(clean_train_data, train_label, clean_test_data,test_label, para):
    # record start time
    start_time = time.time()
    
    # build classifier and training
    
    logi = LogisticRegression(multi_class="multinomial",penalty = para["penalty"], solver=para["solver"], max_iter=1500)
    logi.fit(train_data,train_label)
    
    # testing
    result = logi.predict(test_data)
    
    # record end time
    end_time = time.time()
    # record runing time
    print("logistic_regression(): time cost = " + str(round((end_time - start_time)/60, 2)) + " Minutes...")
    
    # individual model result
    score = accuracy_score(result[0:2000], test_label)
    print("logistic_regression(): test result: " + str(score) + "%\n")
    
    return result
    

In [19]:

# Find hyper parameters for logistic regression 
def logistic_regression_stun(clean_train_data, train_label):
    print("=============================================================")
    print("logistic_regression_stun(): Start logistic_regression stun...")
    
    start_time = time.time()
    # select possible and reasonable parameters
    penalty = ['l2', 'elasticnet'] #default=’l2’
    solver = ["newton-cg", "lbfgs", "sag", "saga"]
    
    # aggregate parameters (K, metric, weight)
    param = { 'penalty': penalty, 'solver': solver }
    
    # generate empty classifier
    logi = LogisticRegression(max_iter = 100000, n_jobs = 1)
    
    # cv as parameter is default stratified n fold cross validation 
    GS_CV = GridSearchCV(estimator=logi, param_grid=param, cv = 10, refit=True)
    GS_CV.fit(clean_train_data, train_label)
    
    # Record end time
    end_time = time.time()
    print("logistic_regression_stun(): time cost = " + str(round((end_time - start_time)/60, 2)) + " Minutes...")
    print("logistic_regression_stun(): ")
    
    # Print result of the best parameter
    print("    Best score: " + str(GS_CV.best_score_))
    print("    Best parameter: " + str(GS_CV.best_params_))
    print("    Best index: " + str(GS_CV.best_index_))
    
    return GS_CV.best_params_


# get best parameters
LR_best_para = logistic_regression_stun(clean_train_data, train_label)


logistic_regression_stun(): Start logistic_regression stun...
logistic_regression_stun(): time cost = 2.02 Minutes...
logistic_regression_stun(): 
    Best score: 0.8471666666666667
    Best parameter: {'penalty': 'l2', 'solver': 'lbfgs'}
    Best index: 1


# 5 Comparation of KNN, SVM, Logistic Reg performance
**Default HyperParameters from previous running :**<br>
Just in case if markers did not run the stun proccess in Part 4 which could use a lot of time, this part will use the hyperparameters which are obtained from previous running. Related information of these variables are shown below:<br>

**KNN hyper parameter** <br>
Variable name: default_KNN_best_para<br>
{'metric': 'minkowski', 'n_neighbors': 6, 'weights': 'distance'} <br>
<br>
**SVM hyper parameter** <br>
Variable name: default_SVM_best_para<br>
{'C': 3, 'gamma': 3.0, 'kernel': 'rbf'} <br>
<br>
**Logistic Regression hyper parameter** <br>
Variable name: default_LR_best_para<br>
{'penalty': 'l2', 'solver': 'lbfgs'} <br>
<br>

In [17]:
#===============
# compare model
#===============

# If marker did not run the stun proccess in Part 4 
# It will use the hyperparameters which are obtained from previous running

# Else, default best parameters are obtained from Part4 
# which are KNN_stun(), SVM()_stun, logistic_regression_stun()



# KNN best parameter from previous running
default_KNN_best_para = {'metric': 'minkowski', 'n_neighbors': 6, 'weights': 'distance'}

# SVM best parameter from previous running
default_SVM_best_para = {'C': 2, 'gamma': 2.8, 'kernel': 'rbf'}

# Logistic regression best parameter from previous running
default_LR_best_para = {'penalty': 'l2', 'solver': 'lbfgs'} 


# ==========================
# Start KNN
# ==========================

# if stun result is kept
if ('KNN_best_para' in locals().keys()):
    print("Start KNN with HyperParameters: " + str(KNN_best_para))
    KNN_result = KNN(clean_train_data, train_label, clean_test_data, test_label, KNN_best_para)
else:
    print("Start KNN with HyperParameters: " + str(default_KNN_best_para))
    KNN_result = KNN(clean_train_data, train_label, clean_test_data, test_label, default_KNN_best_para)
    

# ==========================
# Start SVM
# ==========================

# SVM algorithm and get 5000 predicted result
if ('SVM_best_para' in locals().keys()):
    print("Start SVM with HyperParameters: " + str(SVM_best_para))
    svm_result = svm(clean_train_data, train_label, clean_test_data, test_label, SVM_best_para)
else:
    print("Start SVM with HyperParameters: " + str(default_SVM_best_para))
    svm_result = svm(clean_train_data, train_label, clean_test_data, test_label, default_SVM_best_para)
    

# ==========================
# Start Logistic regression
# ==========================

if ('LR_best_para' in locals().keys()):
    print("Start Logistic regression with HyperParameters: " + str(LR_best_para))
    Logi_result = logistic_regression(clean_train_data, train_label, clean_test_data, test_label, LR_best_para)
else:
    print("Start Logistic regression with HyperParameters: " + str(default_LR_best_para))
    Logi_result = logistic_regression(clean_train_data, train_label, clean_test_data, test_label, default_LR_best_para)
    






Start KNN with HyperParameters: {'metric': 'minkowski', 'n_neighbors': 6, 'weights': 'distance'}
KNN(): time cost = 0.05 Minutes...
KNN(): test result: 85.55%

Start SVM with HyperParameters: {'C': 2, 'gamma': 2.8, 'kernel': 'rbf'}
svm(): time cost = 0.47 Minutes...
svm(): test result: 0.8845% 

Start Logistic regression with HyperParameters: {'penalty': 'l2', 'solver': 'lbfgs'}
logistic_regression(): time cost = 0.81 Minutes...
logistic_regression(): test result: 0.8395%



# 6. Output 

In [36]:
# Save output generated by best model 



# Path as: Output/predicted_labels.h5

# assume output is the predicted labels from classifiers
# (5000,)
with h5py.File('Output/predicted_labels.h5','w') as H:
    print(svm_result.shape)
    H.create_dataset('Output',data=svm_result)


(2000,)
