In [1]:
# -*- coding: utf-8 -*-
"""
Image Classification using Random Forest
"""

# setting up the data path
import os 


# Importing all the necessary libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.cross_validation import train_test_split


# Importing Train and Test datasets
train_data = pd.read_csv("fashion-mnist_train.csv")
final_test_data = pd.read_csv("fashion-mnist_test.csv")


# Splitting independent variables from the dependent variable in both training and testing
X_train = train_data.iloc[:,1:]
y_train = train_data.label.astype("str")

X_final_test = final_test_data.iloc[:,1:]
y_final_test = final_test_data.label.astype("str")



# Splitting train data into training and validation datasets
x_train, x_test, y_train_v, y_test_v = train_test_split(X_train,y_train, test_size = 0.3, random_state = 2)

# ================== Using Random Forest without hyper paramter tuning and clustering ===================
rf = RandomForestClassifier()

rf.fit(x_train,y_train_v)
# Predictions on training and validation
y_pred_train = rf.predict(x_train)
    # predictions for test
y_pred_test = rf.predict(x_test)
    # training metrics
print("Training metrics:")
print(sklearn.metrics.classification_report(y_true= y_train_v, y_pred= y_pred_train))
    
    # test data metrics
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true= y_test_v, y_pred= y_pred_test))


# Predictions on testset
y_pred_test = rf.predict(X_final_test)
    # test data metrics
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true= y_final_test, y_pred= y_pred_test))

# Results:
#    86% accuracy on both validation and test datasets






Training metrics:
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      4163
          1       1.00      1.00      1.00      4211
          2       0.99      1.00      0.99      4211
          3       1.00      1.00      1.00      4182
          4       0.99      1.00      0.99      4243
          5       1.00      1.00      1.00      4203
          6       1.00      0.97      0.98      4214
          7       1.00      1.00      1.00      4135
          8       1.00      1.00      1.00      4222
          9       1.00      1.00      1.00      4216

avg / total       1.00      1.00      1.00     42000

Test data metrics:
             precision    recall  f1-score   support

          0       0.77      0.86      0.81      1837
          1       0.99      0.96      0.97      1789
          2       0.73      0.80      0.76      1789
          3       0.86      0.89      0.87      1818
          4       0.73      0.78      0.75      1757
     

In [2]:

# =========================== Using Grid Search for hyper parameter tuning ===================================
clf = GridSearchCV(rf, param_grid={'n_estimators':[100,200],'min_samples_leaf':[2,3]})
model = clf.fit(x_train,y_train_v)


y_pred_train = model.predict(x_train)
    # predictions for test
y_pred_test = model.predict(x_test)
    # training metrics
print("Training metrics:")
print(sklearn.metrics.classification_report(y_true= y_train_v, y_pred= y_pred_train))
    
    # test data metrics
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true= y_test_v, y_pred= y_pred_test))


# Predictions on testset
y_pred_test = model.predict(X_final_test)
    # test data metrics
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true= y_final_test, y_pred= y_pred_test))

KeyboardInterrupt: 

In [2]:
# ==================== Using Clustering and hyper parameter tuning ============================
# K- means clustering
kmeans = KMeans(n_clusters=10, init='k-means++')

# fitting K means to X_train
kmeans.fit(X_train)
X_train["k_means_label"] = (kmeans.labels_)
X_train["k_means_label"] = X_train["k_means_label"].astype('str')

# Checking column type of K_means_label
X_train["k_means_label"].dtypes
X_train.k_means_label[0:10]
y_train[0:10]

# fitting K means to X_final_test
kmeans.fit(X_final_test)
X_final_test["k_means_label"] = (kmeans.labels_)
X_final_test["k_means_label"] = X_final_test["k_means_label"].astype('str')
y_final_test[0:10]

# Splitting train data into training and validation datasets
x_train, x_test, y_train_v, y_test_v = train_test_split(X_train,y_train, test_size = 0.3, random_state = 2)


    # test data metrics
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true= y_final_test, y_pred= y_pred_test))



Test data metrics:
             precision    recall  f1-score   support

          0       0.75      0.84      0.79      1000
          1       0.98      0.97      0.98      1000
          2       0.74      0.78      0.76      1000
          3       0.88      0.89      0.89      1000
          4       0.77      0.81      0.79      1000
          5       0.94      0.94      0.94      1000
          6       0.68      0.51      0.58      1000
          7       0.90      0.90      0.90      1000
          8       0.96      0.96      0.96      1000
          9       0.93      0.93      0.93      1000

avg / total       0.85      0.85      0.85     10000



In [None]:
# Hyper parameter tuning with new feature
clf = GridSearchCV(rf, param_grid={'n_estimators':[100,200],'min_samples_leaf':[2,3]})
model = clf.fit(x_train,y_train_v)

y_pred_train = model.predict(x_train)
    # predictions for test
y_pred_test = model.predict(x_test)
    # training metrics
print("Training metrics:")
print(sklearn.metrics.classification_report(y_true= y_train_v, y_pred= y_pred_train))
    
    # test data metrics
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true= y_test_v, y_pred= y_pred_test))


# Predictions on testset
y_pred_test = model.predict(X_final_test)

In [None]:

# =================== Using 5 Fold Cross Validation to check the consistency of the final model ====================
sk_fold = StratifiedKFold(n_splits=5, shuffle=True)

for train_index, test_index in sk_fold.split(x_train, y_train_v):
    train = [x_train.iloc[i,:] for i in train_index]
    y_trn_k = [y_train_v.iloc[i] for i in train_index]
    test = [x_train.iloc[i,:] for i in test_index]
    y_tst_k = [y_train_v.iloc[i] for i in test_index]
    # predictions for train
    model.fit(train, y_trn_k)
    y_pred_train = model.predict(train)
    # predictions for test
    y_pred_test = model.predict(test)
    # training metrics
    print("Training metrics:")
    print(sklearn.metrics.classification_report(y_true= y_trn_k, y_pred= y_pred_train))
    
    # test data metrics
    print("Test data metrics:")
    print(sklearn.metrics.classification_report(y_true= y_tst_k, y_pred= y_pred_test))
    

# predictions on train
y_pred_train = model.predict(X_train)
    # predictions for test
y_pred_test = model.predict(X_final_test)
    # training metrics
print("Training metrics:")
print(sklearn.metrics.classification_report(y_true= y_train, y_pred= y_pred_train))
    
    # test data metrics
print("Test data metrics:")
print(sklearn.metrics.classification_report(y_true= y_final_test, y_pred= y_pred_test))
