In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split,KFold
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

In [None]:
def read_data():
    """Load Fashion-MNIST through ``tf.keras.datasets`` and return its four arrays.

    The loader already hands the data back as a ``(train, test)`` pair of
    ``(images, labels)`` tuples, so no further splitting is done here.

    Returns:
        tuple: ``(train_set, train_label, test_set, test_label)``.
    """
    (train_set, train_label), (test_set, test_label) = (
        tf.keras.datasets.fashion_mnist.load_data()
    )
    return train_set, train_label, test_set, test_label

def reshape_data(data_set):
    """Flatten every sample of ``data_set`` into a 1-D feature vector.

    For the 28x28 Fashion-MNIST images this turns an array of shape
    (N, 28, 28) into (N, 784). The per-sample size is derived from the input,
    so other image sizes are flattened the same way.

    Args:
        data_set: array-like whose first axis indexes the samples.

    Returns:
        numpy.ndarray: 2-D array of shape (N, product of the remaining axes).
        It may share memory with ``data_set`` when that is already an ndarray.
    """
    images = np.asarray(data_set)
    # Compute the flattened width explicitly instead of passing -1, because
    # numpy cannot infer a -1 dimension when the array holds zero samples.
    flat_width = int(np.prod(images.shape[1:]))
    return images.reshape(images.shape[0], flat_width)


def normalization(data_set):
    """Apply min-max feature scaling to ``data_set``.

    A fresh ``preprocessing.MinMaxScaler`` is fitted on ``data_set`` itself and
    the transformed data is returned; the fitted scaler is not kept.
    """
    return preprocessing.MinMaxScaler().fit_transform(data_set)


def shuffle_data_train(data_set, data_label):
    """Shuffle the training samples and labels with one shared, seeded permutation.

    Shuffling keeps the cross-validation folds representative of the overall
    distribution of the data. The permutation covers every row of
    ``data_set``; a fixed row count would silently drop samples from larger
    inputs and raise ``IndexError`` on smaller ones.

    Args:
        data_set: numpy array of samples, indexed along the first axis.
        data_label: numpy array of labels with the same length as ``data_set``.

    Returns:
        tuple: ``(shuffled_data, shuffled_labels)``, still aligned row by row.

    Raises:
        ValueError: if ``data_set`` and ``data_label`` differ in length.
    """
    n_samples = len(data_set)
    if n_samples != len(data_label):
        raise ValueError(
            "data_set and data_label must have the same length "
            "(got %d and %d)" % (n_samples, len(data_label))
        )
    # Seed the global numpy RNG so the shuffle can be traced back. Later
    # np.random calls in the same session continue from this seeded state.
    np.random.seed(42)
    shuffle_index = np.random.permutation(n_samples)
    return data_set[shuffle_index], data_label[shuffle_index]

def shuffle_data_test(data_set, data_label):
    """Shuffle the test samples and labels with one shared permutation.

    The permutation length is taken from ``data_set``, so the function works
    for any number of rows. It draws from numpy's global RNG without reseeding
    it, so the result depends on the RNG state at call time.

    Args:
        data_set: numpy array of samples, indexed along the first axis.
        data_label: numpy array of labels with the same length as ``data_set``.

    Returns:
        tuple: ``(shuffled_data, shuffled_labels)``, still aligned row by row.

    Raises:
        ValueError: if ``data_set`` and ``data_label`` differ in length.
    """
    n_samples = len(data_set)
    if n_samples != len(data_label):
        raise ValueError(
            "data_set and data_label must have the same length "
            "(got %d and %d)" % (n_samples, len(data_label))
        )
    shuffle_index = np.random.permutation(n_samples)
    return data_set[shuffle_index], data_label[shuffle_index]

def showImage(data):
    """Display a single sample as a 28x28 image.

    Used to check visually that an image still matches its label after
    shuffling. ``data`` must be reshapeable to (28, 28).
    """
    pixels = data.reshape(28, 28)  # back to the 28x28 pixel grid
    plt.imshow(pixels, cmap=matplotlib.cm.binary, interpolation="nearest")
    plt.axis("off")
    plt.show()

def prepare_data():
    """Build the final dataset: load, shuffle, then flatten the images.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)``, where the image arrays
        have been passed through ``reshape_data``.
    """
    train_images, train_labels, test_images, test_labels = read_data()

    # Shuffle the training split first (this seeds the RNG), then the test split.
    train_images, train_labels = shuffle_data_train(train_images, train_labels)
    test_images, test_labels = shuffle_data_test(test_images, test_labels)

    flat_train = reshape_data(train_images)
    flat_test = reshape_data(test_images)
    return flat_train, train_labels, flat_test, test_labels

# Materialise the shuffled, flattened train/test arrays once for the notebook.
X_train, y_train, X_test, y_test = prepare_data()


In [None]:
"""DECISION TREE"""

from sklearn.tree import DecisionTreeClassifier

# Note: scaling is not needed for the Decision Tree algorithm.

# Fit a depth-limited tree on the training data and predict on both splits.
dec_tree_clf = DecisionTreeClassifier(max_depth=50, random_state=42)
dec_tree_clf.fit(X_train, y_train)
y_train_predict = dec_tree_clf.predict(X_train)
y_test_pred = dec_tree_clf.predict(X_test)

# 5-fold cross-validated accuracy on the training data.
kfold = model_selection.KFold(n_splits = 5)
results = cross_val_score(dec_tree_clf,X_train,y_train,cv=kfold)
print("%0.2f accuracy with a standard deviation of %0.2f" % (results.mean(), results.std()))

"""Showing different classification metrics """
# NOTE: these metrics compare y_train with the predictions on X_train, so they
# describe the fit to the training data, not performance on unseen data.
dec_tree_accuracy = accuracy_score(y_train, y_train_predict)
dec_tree_precision = precision_score(y_train, y_train_predict, average='weighted')
dec_tree_recall = recall_score(y_train, y_train_predict, average='weighted')
dec_tree_f1_score = f1_score(y_train, y_train_predict, average='weighted')

print("Decision Tree Accuracy: ", dec_tree_accuracy)
print("Decision Tree Precision: ", dec_tree_precision)
# Print the recall value itself; this line previously printed the precision.
print("Decision Tree Recall: ", dec_tree_recall)
print("Decision Tree F1 Score: ", dec_tree_f1_score)

In [None]:
"""HYPERPARAMETER OPTIMIZATION through Grid Search"""
# Candidate hyperparameters for the decision tree.
paramsuse = {'criterion': ['gini', 'entropy'],
             'max_depth': [1, 10, 50, 100],
             'min_samples_split': [2, 3]}

# Stratified 3-fold cross-validation repeated twice; accuracy is the selection metric.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1)
grid = GridSearchCV(estimator=dec_tree_clf, param_grid=paramsuse, cv=cv, scoring='accuracy',error_score=0,verbose = 5)
gridfit = grid.fit(X_train, y_train)

# Build the positional list from explicit keys so it does not depend on the
# iteration order of best_params_:
# [criterion, max_depth, min_samples_split].
best_params = gridfit.best_params_
bestpar = [best_params[key] for key in ('criterion', 'max_depth', 'min_samples_split')]

In [None]:
"""Train using the best hyperparameters"""
# bestpar is consumed positionally: [criterion, max_depth, min_samples_split].
dec_tree_clf_best = DecisionTreeClassifier(criterion = bestpar[0],max_depth=bestpar[1],min_samples_split = bestpar[2], random_state=42)
dec_tree_clf_best.fit(X_train, y_train)
y_train_predict = dec_tree_clf_best.predict(X_train)
y_test_pred_best = dec_tree_clf_best.predict(X_test)
print(classification_report(y_test, y_test_pred_best)) #Classification Report of the metrics

"""Plotting the confusion matrix"""
matrix = confusion_matrix(y_test, y_test_pred_best)
labelNames = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
# `plot_confusion_matrix` is neither imported nor defined in this notebook, so
# the matrix is drawn with seaborn (already imported) as an annotated heatmap.
# Rows of `matrix` are true labels and columns are predicted labels.
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=labelNames, yticklabels=labelNames, ax=ax)
ax.set_title("Confusion Matrix for Decision Tree", fontsize = 20)
ax.tick_params(axis='both', which='major', labelsize=20)
ax.set_ylabel('True Label', fontsize=20)
ax.set_xlabel('Predicted Label', fontsize=20)
plt.show()

In [None]:
"""RANDOM FOREST"""

from sklearn import model_selection

# Baseline forest: 100 trees, depth capped at 50, fixed seed.
rnd_clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=50,
    random_state=42,
)
rnd_clf.fit(X_train, y_train)

# Predictions of the fitted baseline on both splits.
y_train_predict = rnd_clf.predict(X_train)
y_test_pred = rnd_clf.predict(X_test)

# 5-fold cross-validated accuracy on the training data.
kfold = model_selection.KFold(n_splits=5)
results = cross_val_score(rnd_clf, X_train, y_train, cv=kfold)
cv_mean, cv_std = results.mean(), results.std()
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_mean, cv_std))

In [None]:
"""HYPERPARAMETERS OPTIMIZATION"""
# Candidate hyperparameters for the random forest.
n_estimators = [10, 100, 500, 1000]
# 'auto' is left out: for classifiers it was an alias of 'sqrt' and recent
# scikit-learn releases reject it.
max_features = ['sqrt', 'log2']
max_depth = [10, 30, 50, 70, 90]
# Keep the parameter dict under its own name so `grid` only ever refers to
# the GridSearchCV object.
param_grid_rf = dict(n_estimators=n_estimators,max_features=max_features,max_depth = max_depth)

cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
# Search over the random forest defined in the previous cell. The original
# referenced undefined names (`model`, `X`, `y`) and fitted the search twice.
grid = GridSearchCV(estimator=rnd_clf, param_grid=param_grid_rf, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
gridfit = grid.fit(X_train, y_train)

# Read the best parameters from THIS search, in the positional order the next
# cell consumes: [max_depth, max_features, n_estimators].
best_params = gridfit.best_params_
bestpar = [best_params[key] for key in ('max_depth', 'max_features', 'n_estimators')]

In [None]:
"""Train using the best hyperparameters"""
# bestpar is consumed positionally: [max_depth, max_features, n_estimators].
rnd_clf_best = RandomForestClassifier(n_estimators=bestpar[2], max_features = bestpar[1], max_depth=bestpar[0],verbose = 5, random_state=42)
rnd_clf_best.fit(X_train, y_train)
y_test_pred_best = rnd_clf_best.predict(X_test)
print(classification_report(y_test, y_test_pred_best))

"""Plotting the confusion matrix"""
matrix = confusion_matrix(y_test, y_test_pred_best)
labelNames = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
# `plot_confusion_matrix` is neither imported nor defined in this notebook, so
# the matrix is drawn with seaborn (already imported) as an annotated heatmap.
# Rows of `matrix` are true labels and columns are predicted labels.
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=labelNames, yticklabels=labelNames, ax=ax)
# The title names the model evaluated in this cell (it previously said "Decision Tree").
ax.set_title("Confusion Matrix for Random Forest", fontsize = 20)
ax.tick_params(axis='both', which='major', labelsize=20)
ax.set_ylabel('True Label', fontsize=20)
ax.set_xlabel('Predicted Label', fontsize=20)
plt.show()

In [None]:
"""LOGISTIC REGRESSION"""

log_clf = LogisticRegression(multi_class="multinomial", solver="lbfgs",max_iter=2000)#, random_state=42)
# Note: this is Softmax Regression (a multi-class classification problem).
# 'C' is the hyperparameter controlling the strength of the L2 regularization;
# 'lbfgs' is the limited-memory Broyden-Fletcher-Goldfarb-Shanno (BFGS) algorithm.
# NOTE(review): X_train is used as returned by prepare_data(), which does not
# call normalization() - confirm whether feature scaling was intended here.
log_clf.fit(X_train, y_train)
y_test_pred = log_clf.predict(X_test)

# 5-fold cross-validated accuracy on the training data.
kfold = model_selection.KFold(n_splits = 5)
results = cross_val_score(log_clf,X_train,y_train,cv=kfold)
print("%0.2f accuracy with a standard deviation of %0.2f" % (results.mean(), results.std()))

"""HYPERPARAMETER OPTIMIZATION"""

from sklearn.model_selection import GridSearchCV
params = {
    "C" : [ 0.01, 0.1, 1,10,100],
    "penalty" : ["l2"],
    "solver" : ["newton-cg", "sag", "lbfgs"]
}

grid = GridSearchCV(log_clf, param_grid=params, cv=5,verbose = 5)
# Keep the fitted search of THIS cell. The original read `gridfit`, which still
# pointed at the result of an earlier model's grid search.
gridfit = grid.fit(X_train, y_train)

# Positional order consumed by the next cell: [C, penalty, solver].
best_params = gridfit.best_params_
bestpar = [best_params[key] for key in ("C", "penalty", "solver")]

In [None]:
# Refit logistic regression with the selected hyperparameters.
# bestpar is consumed positionally: [C, penalty, solver].
log_clf_best = LogisticRegression(multi_class='multinomial', C = bestpar[0], penalty = bestpar[1], solver=bestpar[2],max_iter=500,verbose=5, random_state=42)
log_clf_best.fit(X_train, y_train)
y_test_pred_best = log_clf_best.predict(X_test)

matrix = confusion_matrix(y_test, y_test_pred_best)
labelNames = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]
# `plot_confusion_matrix` is neither imported nor defined in this notebook, so
# the matrix is drawn with seaborn (already imported) as an annotated heatmap.
# Rows of `matrix` are true labels and columns are predicted labels.
fig, ax = plt.subplots(figsize=(20,10))
sns.heatmap(matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=labelNames, yticklabels=labelNames, ax=ax)
ax.set_title("Confusion Matrix for Logistic Regression",fontsize = 20)
ax.tick_params(axis='both', which='major', labelsize=20)
ax.set_ylabel('True Label', fontsize=20)
ax.set_xlabel('Predicted Label', fontsize=20)
plt.show()