This analysis is for classification of collection of images of different objects such as pizza, sunflower, dalmatian, doller bill and soccer ball. The images are in 'jpg' format . 

In [5]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook
from sklearn import svm, metrics, datasets
from sklearn.utils import Bunch
from sklearn.model_selection import GridSearchCV, train_test_split

import skimage
from skimage.io import imread
from skimage.transform import resize

In [6]:
import pickle

In [7]:
def load_image_files(container_path, dimension=(64, 64)):
    """
    Load image files with categories as subfolder names 
    which performs like scikit-learn sample dataset
    
    Parameters
    ----------
    container_path : string or unicode
        Path to the main folder holding one subfolder per category
    dimension : tuple
        size to which image are adjusted to
        
    Returns
    -------
    Bunch
    """
    image_dir = Path(container_path)
    folders = [directory for directory in image_dir.iterdir() if directory.is_dir()]
    categories = [fo.name for fo in folders]

    descr = "A image classification dataset"
    images = []
    flat_data = []
    target = []
    for i, direc in enumerate(folders):
        for file in direc.iterdir():
            img = skimage.io.imread(file)
            img_resized = resize(img, dimension, anti_aliasing=True, mode='reflect')
            flat_data.append(img_resized.flatten()) 
            images.append(img_resized)
            target.append(i)
    flat_data = np.array(flat_data)
    target = np.array(target)
    images = np.array(images)

    return Bunch(data=flat_data,
                 target=target,
                 target_names=categories,
                 images=images,
                 DESCR=descr)

In [8]:
image_dataset = load_image_files("images2")

In [9]:
image_dataset

{'data': array([[0.58411746, 0.46254883, 0.38019589, ..., 0.66525735, 0.53851103,
         0.48010398],
        [0.9198223 , 0.92739258, 0.92034601, ..., 0.89019608, 0.91372549,
         0.90588235],
        [0.54910769, 0.64166667, 0.76985294, ..., 0.75551471, 0.0745098 ,
         0.08235294],
        ...,
        [0.25896523, 0.67526425, 0.86348039, ..., 0.06017157, 0.14658778,
         0.18249464],
        [0.03860294, 0.44644608, 0.65465686, ..., 0.54571078, 0.74963235,
         0.95061275],
        [0.01109069, 0.0973652 , 0.4346201 , ..., 0.39114583, 0.62677696,
         0.75316425]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1

In [10]:
## split the images data in to train and test set
X_train, X_test, y_train, y_test = train_test_split(
    image_dataset.data, image_dataset.target, test_size=0.3,random_state=109)

In [11]:
X_train.shape, X_test.shape

((216, 12288), (93, 12288))

In [12]:
## perform SVC and select the best hyper parameter. 
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
 ]
svc = svm.SVC()
clf = GridSearchCV(svc, param_grid)
clf.fit(X_train, y_train)

GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}])

In [13]:
clf.best_estimator_

SVC(C=10, gamma=0.001)

In [14]:
## the best model with the parameter 

In [15]:
y_pred = clf.predict(X_test)

In [16]:
print("Classification report for - \n{}:\n{}\n".format(
    clf, metrics.classification_report(y_test, y_pred)))

Classification report for - 
GridSearchCV(estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 1000], 'kernel': ['linear']},
                         {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001],
                          'kernel': ['rbf']}]):
              precision    recall  f1-score   support

           0       0.72      0.81      0.76        16
           1       0.74      0.74      0.74        23
           2       0.62      0.71      0.67        14
           3       1.00      0.59      0.74        17
           4       0.85      0.96      0.90        23

    accuracy                           0.77        93
   macro avg       0.79      0.76      0.76        93
weighted avg       0.79      0.77      0.77        93




Pizza classification has the highest precision (less false positive), followed by that of Sunflower.Dalmatian has the least precision.        
Sunflower has the highest recall (less false negative)followed by dollarbill.  
F1 score of sunflower classification is the highest.    


In [18]:
metrics.confusion_matrix(y_test, y_pred)

array([[13,  2,  1,  0,  0],
       [ 2, 17,  4,  0,  0],
       [ 1,  3, 10,  0,  0],
       [ 1,  1,  1, 10,  4],
       [ 1,  0,  0,  0, 22]])

In [17]:
metrics.multilabel_confusion_matrix(y_test, y_pred)

array([[[72,  5],
        [ 3, 13]],

       [[64,  6],
        [ 6, 17]],

       [[73,  6],
        [ 4, 10]],

       [[76,  0],
        [ 7, 10]],

       [[66,  4],
        [ 1, 22]]])

In [24]:
## save the model so that the model can be used to  classify images uploaded through the web.

pickle.dump(clf, open('Images_classification_model.sav', 'wb'))
