In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn import metrics
from scipy.spatial import distance


In [0]:
#Implementing my own classifier which creates a template for each class in target column
#Each template stores the average value of each feature for that class
#The classifier will select the template that is similar to the query using a distance function

class TemplateMatch(BaseEstimator):
    #step 1: get all the target levels
    #step 2: get the average of the instance for each level
    #step 3: store it as a template for each level
    def __init__(self, dist_metric = 'euclidean'):
        self.dist_metric = dist_metric
    
    def fit(self, X, y):
        
        X, y = check_X_y(X, y)
        
        #get each target level
        unique = np.unique(y)
        self.classes_ = unique
        
        #append the target column to the data
        X = np.c_[X, y]
        
        #This is to find all the instances related to each target class
        #convert to dataframe for readability
        X = pd.DataFrame(X)
        
        #index for target column
        target = len(X.columns) -1
        
        #store avg of each column per row
        rows_list = []
        
        for i in unique:
          row = []
      
          #get all the instances that contains i class
          instances = X[X[target] == i]
          
          #calculate the average for each column in i class
          for j in X.columns:
            if( j != target):
              col_sum = sum(instances[j])
              avg = col_sum / len(instances)
              row.append(avg)
          rows_list.append(row)
          
        self.template_ = np.array(rows_list)
       
        return self
    
    #find the template that is most similar to the query instance using euclidean
    def predict_euclidean(self, X):
      
      check_is_fitted(self, ['template_'])
      
      X = check_array(X)
      predictions = []
      
      for instance in X:
        #first assign the target class 0 as the template similar to the query
        match = len(X) - 1
        dist = distance.euclidean(instance, self.template_[0])
        for i in range(10):
          if( distance.euclidean(instance, self.template_[i]) < dist):
            match = i
            dist = distance.euclidean(instance, self.template_[i])
        predictions.append(match)
      
      return predictions
    
    #find the template that is most similar to the query instance using manhattan
    def predict_manhattan(self, X):
      check_is_fitted(self, ['template_'])
      
      X = check_array(X)
      predictions = []
      
      for instance in X:
        #first assign the target class 0 as the template similar to the query
        match = 0
        dist = distance.cityblock(instance, self.template_[0])
        for i in range(10):
          if( distance.cityblock(instance, self.template_[i]) < dist):
            match = i
            dist = distance.cityblock(instance, self.template_[i])
        predictions.append(match)
      
      return predictions
    
    def predict_chebyshev(self, X):
      check_is_fitted(self, ['template_'])
      
      X = check_array(X)
      predictions = []
      
      for instance in X:
        #first assign the target class 0 as the template similar to the query
        match = 0
        dist = distance.chebyshev(instance, self.template_[0])
        for i in range(10):
          if( distance.chebyshev(instance, self.template_[i]) < dist):
            match = i
            dist = distance.cityblock(instance, self.template_[i])
        predictions.append(match)
      
      return predictions
    
    #To be used with GridSearch
    def predict(self, X):
      check_is_fitted(self, ['template_'])
      
      X = check_array(X)
      predictions = []
      
      if(self.dist_metric == 'euclidean'):
        dist_type = self.getEuclidDistance
      elif(self.dist_metric == 'manhattan'):
        dist_type = self.getManhattanDistance
      elif(self.dist_metric == 'chebyshev'):
        dist_type = self.getChebyshevDistance
      
      for instance in X:
        #first assign the target class 0 as the template similar to the query
        match = 0
        dist = dist_type(instance, self.template_[0])
        for i in range(10):
          if(dist_type(instance, self.template_[i]) < dist):
            match = i
            dist = dist_type(instance, self.template_[i])
        predictions.append(match)
      
      return predictions
    
    
    def getEuclidDistance(self,x, y):
      return distance.euclidean(x, y)
    
    def getManhattanDistance(self,x, y):
      return distance.cityblock(x, y)
    
    def getChebyshevDistance(self,x, y):
      return distance.chebyshev(x, y)
        
       
     

In [16]:
import io
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
train_data = pd.read_csv('/content/drive/My Drive/Dataset/fashion-mnist_train.csv')
test_data = pd.read_csv('/content/drive/My Drive/Dataset/fashion-mnist_test.csv')

In [0]:

target = "label"
y = train_data["label"]
X =train_data[train_data.columns.difference(["label"])]

In [0]:
my_model = TemplateMatch()
my_model.fit(X, y)

TemplateMatch(dist_metric='euclidean')

In [0]:
#Find the optimal distance metric using GrideSearch CV
param_grid = [
    {'dist_metric':['euclidean', 'manhattan', 'chebyshev']}
]

my_tuned_model = GridSearchCV(TemplateMatch(), param_grid, cv=2, verbose = 0,\
                              scoring = metrics.make_scorer(metrics.accuracy_score))

my_tuned_model.fit(X, y)

# Print details
print("Best parameters set found on development set:")
print(my_tuned_model.best_params_)
print(my_tuned_model.best_score_)



In [0]:
#creae a sample of test_data for fast testing
data_sampling_rate = 0.1
test_data = test_data.sample(frac=data_sampling_rate)


In [0]:
#predicting using euclidean distance
X_test = test_data[test_data.columns.difference(["label"])]
y_test = np.array(test_data["label"])
y_pred = my_model.predict_euclidean(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " + str(accuracy))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.225
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       110
           1       0.00      0.00      0.00       101
           2       0.29      0.08      0.12        93
           3       0.00      0.00      0.00       103
           4       0.00      0.00      0.00       108
           5       0.18      0.93      0.30        90
           6       0.21      0.27      0.24       109
           7       0.57      0.44      0.50        89
           8       0.22      0.72      0.33        92
           9       0.00      0.00      0.00       105

   micro avg       0.23      0.23      0.23      1000
   macro avg       0.15      0.24      0.15      1000
weighted avg       0.14      0.23      0.14      1000



  'precision', 'predicted', average, warn_for)


In [0]:
#predicting using manhattan
X_test = test_data[test_data.columns.difference(["label"])]
y_test = np.array(test_data["label"])
y_pred = my_model.predict_manhattan(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: " + str(accuracy))
print(metrics.classification_report(y_test, y_pred))

Accuracy: 0.283
              precision    recall  f1-score   support

           0       0.90      0.27      0.41        98
           1       0.00      0.00      0.00       102
           2       0.26      0.19      0.22        85
           3       0.00      0.00      0.00       100
           4       0.32      0.17      0.22       113
           5       0.14      0.61      0.22       101
           6       0.29      0.17      0.22        98
           7       0.41      0.87      0.55       100
           8       0.69      0.34      0.46       100
           9       0.33      0.21      0.26       103

   micro avg       0.28      0.28      0.28      1000
   macro avg       0.33      0.28      0.26      1000
weighted avg       0.33      0.28      0.26      1000



In [0]:
#predicting using chebyshev
X_test = test_data[test_data.columns.difference(["label"])]
y = np.array(test_data["label"])
y_pred = my_model.predict_chebyshev(X_test)
accuracy = metrics.accuracy_score(y, y_pred)
print("Accuracy: " + str(accuracy))
print(metrics.classification_report(y, y_pred))

Accuracy: 0.111
              precision    recall  f1-score   support

           0       0.29      0.09      0.14        98
           1       0.00      0.00      0.00       102
           2       0.00      0.00      0.00        85
           3       0.00      0.00      0.00       100
           4       0.00      0.00      0.00       113
           5       0.00      0.00      0.00       101
           6       0.00      0.00      0.00        98
           7       0.00      0.00      0.00       100
           8       0.00      0.00      0.00       100
           9       0.11      0.99      0.19       103

   micro avg       0.11      0.11      0.11      1000
   macro avg       0.04      0.11      0.03      1000
weighted avg       0.04      0.11      0.03      1000



  'precision', 'predicted', average, warn_for)
