In [None]:
"""IMPORTING NECESSARY LIBRARIES"""
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn import preprocessing
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split,KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV


In [None]:
def read_data():
    """Load the Fashion-MNIST dataset through ``tf.keras.datasets``.

    Returns:
        tuple: ``(train_set, train_label, test_set, test_label)`` taken, in that
        order, from the nested ``((train, train_labels), (test, test_labels))``
        structure returned by the loader.
    """
    # The loader already delivers the data split into a training pair and a testing pair.
    (train_set, train_label), (test_set, test_label) = (
        tf.keras.datasets.fashion_mnist.load_data()
    )
    return train_set, train_label, test_set, test_label

def reshape_data(data_set):
    """Flatten every image in ``data_set`` into a single feature vector.

    The per-image size is derived from the input instead of being hard-coded,
    so (N, 28, 28) becomes (N, 784) and any other (N, d1, d2, ...) input
    becomes (N, d1*d2*...).

    Args:
        data_set: Array-like whose first axis indexes the images.

    Returns:
        np.ndarray: A new 2-D array of shape (N, features). The result never
        shares memory with ``data_set``.
    """
    # np.array copies, so the caller's array is never aliased by the result.
    images = np.array(data_set)
    n_images = images.shape[0]
    # Computed explicitly (rather than with -1) so an empty input still reshapes cleanly.
    n_features = int(np.prod(images.shape[1:]))
    return images.reshape(n_images, n_features)


def normalization(data_set):
    """Apply min-max feature scaling to ``data_set``.

    A fresh ``MinMaxScaler`` is fitted on ``data_set`` itself and then used to
    transform that same data.

    Args:
        data_set: The data handed to ``MinMaxScaler.fit_transform``.

    Returns:
        The scaled data produced by ``fit_transform``.
    """
    return preprocessing.MinMaxScaler().fit_transform(data_set)


def shuffle_data_train(data_set, data_label):
    """Shuffle the training samples and labels with one shared permutation.

    The permutation covers every sample in ``data_set``. The previous
    hard-coded length of 48000 discarded any samples beyond that index and
    raised ``IndexError`` on smaller inputs.

    Args:
        data_set: Array of samples, indexed along the first axis.
        data_label: Array of labels aligned with ``data_set``.

    Returns:
        tuple: ``(x_data, x_label)`` - the samples and labels in shuffled order.

    Raises:
        ValueError: If ``data_set`` and ``data_label`` differ in length.
    """
    n_samples = len(data_set)
    if len(data_label) != n_samples:
        raise ValueError(
            "data_set and data_label must have the same length, got %d and %d"
            % (n_samples, len(data_label))
        )

    # Seed the global NumPy RNG so the shuffle order can be traced back between runs.
    np.random.seed(42)
    shuffle_index = np.random.permutation(n_samples)
    x_data, x_label = data_set[shuffle_index], data_label[shuffle_index]

    return x_data, x_label

def shuffle_data_test(data_set, data_label):
    """Shuffle the test samples and labels with one shared permutation.

    The permutation length follows the size of ``data_set`` instead of a
    hard-coded 10000, so the function works for any number of samples.

    This function does not seed the RNG itself: it draws from NumPy's global
    random state, so the order is reproducible only if the caller has seeded
    that state beforehand.

    Args:
        data_set: Array of samples, indexed along the first axis.
        data_label: Array of labels aligned with ``data_set``.

    Returns:
        tuple: ``(x_data, x_label)`` - the samples and labels in shuffled order.

    Raises:
        ValueError: If ``data_set`` and ``data_label`` differ in length.
    """
    n_samples = len(data_set)
    if len(data_label) != n_samples:
        raise ValueError(
            "data_set and data_label must have the same length, got %d and %d"
            % (n_samples, len(data_label))
        )

    shuffle_index = np.random.permutation(n_samples)
    x_data, x_label = data_set[shuffle_index], data_label[shuffle_index]

    return x_data, x_label

def showImage(data):
    """Display one article as a 28x28 image.

    Useful for checking by eye that an image still matches its label after
    shuffling.

    Args:
        data: Array holding 784 pixel values (any shape that can be reshaped
            to 28x28).
    """
    some_article_image = data.reshape(28, 28)  # back to the 28x28 pixel grid
    # The colormap is given by name: only `matplotlib.pyplot` is imported in this
    # notebook, so the bare `matplotlib` module name is not available here.
    plt.imshow(some_article_image, cmap="binary", interpolation="nearest")
    plt.axis("off")
    plt.show()

def prepare_data():
    """Build the final dataset: load it, shuffle both splits, flatten the images.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` where the image arrays
        have been passed through ``reshape_data`` after shuffling.
    """
    train_images, train_labels, test_images, test_labels = read_data()

    # The training split is shuffled before the test split, in that order.
    train_images, train_labels = shuffle_data_train(train_images, train_labels)
    test_images, test_labels = shuffle_data_test(test_images, test_labels)

    train_flat = reshape_data(train_images)
    test_flat = reshape_data(test_images)
    return train_flat, train_labels, test_flat, test_labels


In [None]:
def train_classifier(classifier_type, data_use):
    """Train a baseline model of the chosen type and report its cross-validated accuracy.

    Args:
        classifier_type: One of ``'Decision Tree'``, ``'Random Forest'`` or
            ``'Logistic'``.
        data_use: The 4-tuple ``(X_train, y_train, X_test, y_test)``; only the
            training pair is used here.

    Returns:
        The classifier fitted on the full training set.

    Raises:
        ValueError: If ``classifier_type`` is not one of the supported names.
    """
    if classifier_type == 'Decision Tree':
        clf = DecisionTreeClassifier(max_depth=50, random_state=42)
    elif classifier_type == 'Random Forest':
        clf = RandomForestClassifier(n_estimators=100, max_depth=50, random_state=42)
    elif classifier_type == 'Logistic':
        clf = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=2000, random_state=42)
    else:
        # Fail with a clear message instead of an unbound-variable error further down.
        raise ValueError(
            "Unknown classifier_type %r; choose from 'Decision Tree', 'Random Forest', 'Logistic'"
            % (classifier_type,)
        )

    X_train, y_train, _X_test, _y_test = data_use

    clf.fit(X_train, y_train)

    # 5-fold cross-validation on the training data gives the baseline accuracy estimate.
    kfold = model_selection.KFold(n_splits=5)
    results = cross_val_score(clf, X_train, y_train, cv=kfold)
    print("Results of the first train for the %s classifier: %0.2f accuracy with a standard deviation of %0.2f" % (classifier_type, results.mean(), results.std()))
    return clf

In [None]:
def hyperparameter_opt(classifier_type, classifier_model, data_use):
    """Grid-search the hyperparameters of ``classifier_model``.

    Args:
        classifier_type: One of ``'Decision Tree'``, ``'Random Forest'`` or
            ``'Logistic'``; selects the parameter grid.
        classifier_model: The estimator instance handed to ``GridSearchCV``.
        data_use: The 4-tuple ``(X_train, y_train, X_test, y_test)``; only the
            training pair is used for the search.

    Returns:
        list: The best parameter values, ordered by parameter name
        (alphabetically, uppercase first). That gives
        ``[criterion, max_depth, min_samples_split]`` for Decision Tree,
        ``[max_depth, max_features, n_estimators]`` for Random Forest and
        ``[C, penalty, solver]`` for Logistic.

    Raises:
        ValueError: If ``classifier_type`` is not one of the supported names.
    """
    if classifier_type == 'Decision Tree':
        params_use = {'criterion': ['gini', 'entropy'],
                      'max_depth': [1, 10, 50, 100],
                      'min_samples_split': [2, 3]}
    elif classifier_type == 'Random Forest':
        # 'auto' is omitted: for classifiers it was an alias of 'sqrt' and recent
        # scikit-learn releases reject it, which would only waste fits scored as 0.
        params_use = {'n_estimators': [10, 100, 500, 1000],
                      'max_features': ['sqrt', 'log2'],
                      'max_depth': [10, 30, 50, 70, 90]}
    elif classifier_type == 'Logistic':
        params_use = {"C": [0.01, 0.1, 1, 10, 100],
                      "penalty": ["l2"],
                      "solver": ["newton-cg", "sag", "lbfgs"]}
    else:
        raise ValueError(
            "Unknown classifier_type %r; choose from 'Decision Tree', 'Random Forest', 'Logistic'"
            % (classifier_type,)
        )

    X_train, y_train, _X_test, _y_test = data_use

    cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=2, random_state=1)
    grid = GridSearchCV(estimator=classifier_model, param_grid=params_use, cv=cv, scoring='accuracy', error_score=0, verbose=5)
    gridfit = grid.fit(X_train, y_train)

    # Sort by parameter name so the positional order of the returned list is
    # explicit rather than dependent on dict ordering.
    best_params = gridfit.best_params_
    bestpar = [best_params[name] for name in sorted(best_params)]

    return bestpar
    

In [None]:
def best_model(classifier_type, bestpar, data_use):
    """Train the model with the best parameters and report its test metrics.

    Args:
        classifier_type: One of ``'Decision Tree'``, ``'Random Forest'`` or
            ``'Logistic'``.
        bestpar: Sequence of three parameter values, read positionally:
            Decision Tree ``[criterion, max_depth, min_samples_split]``,
            Random Forest ``[max_depth, max_features, n_estimators]``,
            Logistic ``[C, penalty, solver]``.
        data_use: The 4-tuple ``(X_train, y_train, X_test, y_test)``.

    Returns:
        The predictions of the fitted model on ``X_test``.

    Raises:
        ValueError: If ``classifier_type`` is not one of the supported names.
    """
    if classifier_type == 'Decision Tree':
        # DecisionTreeClassifier accepts no `verbose` argument; passing one raises TypeError.
        clf_best = DecisionTreeClassifier(criterion=bestpar[0], max_depth=bestpar[1], min_samples_split=bestpar[2], random_state=42)
    elif classifier_type == 'Random Forest':
        clf_best = RandomForestClassifier(n_estimators=bestpar[2], max_features=bestpar[1], max_depth=bestpar[0], verbose=5, random_state=42)
    elif classifier_type == 'Logistic':
        clf_best = LogisticRegression(multi_class='multinomial', C=bestpar[0], penalty=bestpar[1], solver=bestpar[2], max_iter=500, verbose=5, random_state=42)
    else:
        raise ValueError(
            "Unknown classifier_type %r; choose from 'Decision Tree', 'Random Forest', 'Logistic'"
            % (classifier_type,)
        )

    X_train, y_train, X_test, y_test = data_use
    clf_best.fit(X_train, y_train)
    y_test_pred_best = clf_best.predict(X_test)
    print(classification_report(y_test, y_test_pred_best))  # Classification report: predicted vs expected
    return y_test_pred_best


In [None]:
def confusion_matrix(y_test_pred_best, data_use, classifier_type="Decision Tree"):
    """Plot the confusion matrix to show the most mismatched labels.

    Args:
        y_test_pred_best: Predicted labels for the test set.
        data_use: The 4-tuple ``(X_train, y_train, X_test, y_test)``; only
            ``y_test`` is used.
        classifier_type: Name shown in the plot title. Defaults to
            ``"Decision Tree"``.
    """
    # This function's own name shadows scikit-learn's `confusion_matrix` at module
    # level, so the library function is imported locally under an alias. Calling
    # the bare name here would recurse into this plotting function.
    from sklearn.metrics import ConfusionMatrixDisplay
    from sklearn.metrics import confusion_matrix as sk_confusion_matrix

    _X_train, _y_train, _X_test, y_test = data_use
    labelNames = ["T-shirt/top", "Trouser", "Pullover", "Dress", "Coat", "Sandal", "Shirt", "Sneaker", "Bag", "Ankle boot"]

    # Fix the label set explicitly so the matrix is always 10x10 and lines up with
    # labelNames even if some class is missing from both y_test and the predictions.
    matrix = sk_confusion_matrix(y_test, y_test_pred_best, labels=np.arange(len(labelNames)))

    fig, ax = plt.subplots(figsize=(20, 10))
    cm_display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labelNames)
    cm_display.plot(ax=ax, cmap="Blues", xticks_rotation=45)

    ax.set_title("Confusion Matrix for %s" % classifier_type, fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=20)
    ax.set_ylabel('True Label', fontsize=20)
    ax.set_xlabel('Predicted Label', fontsize=20)
    plt.show()

In [None]:
def run_program(classifier_use):
    """Run the whole program for one classifier.

    Steps: prepare the data, train a baseline model, grid-search its
    hyperparameters, retrain with the best parameters, and plot the confusion
    matrix of the resulting test predictions.

    Args:
        classifier_use: One of ``'Decision Tree'``, ``'Random Forest'`` or
            ``'Logistic'``.
    """
    data_use = prepare_data()
    initial_model = train_classifier(classifier_use, data_use)
    # The baseline estimator is the one whose hyperparameters get searched.
    bestparameters = hyperparameter_opt(classifier_use, initial_model, data_use)
    y_test_pred_best = best_model(classifier_use, bestparameters, data_use)
    confusion_matrix(y_test_pred_best, data_use)

In [None]:
"""CHOOSE FROM: Decision Tree, Random Forest, Logistic"""
# Name of the classifier to run; it is passed unchanged to run_program below.
classifier = 'Decision Tree'
# Entry point of the notebook: runs the whole program for the selected classifier.
run_program(classifier)