In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading Data

In [2]:
!curl http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz>cifar-10-python.tar.gz

curl: /opt/conda/lib/libcurl.so.4: no version information available (required by curl)
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  162M  100  162M    0     0  50.7M      0  0:00:03  0:00:03 --:--:-- 50.7M


In [3]:
!tar -xzvf cifar-10-python.tar.gz

cifar-10-batches-py/
cifar-10-batches-py/data_batch_4
cifar-10-batches-py/readme.html
cifar-10-batches-py/test_batch
cifar-10-batches-py/data_batch_3
cifar-10-batches-py/batches.meta
cifar-10-batches-py/data_batch_2
cifar-10-batches-py/data_batch_5
cifar-10-batches-py/data_batch_1


# Preparing Data

## Converting binary file to ndarrays

In [4]:
import pickle

def unpickle(file):
    with open(file, 'rb') as fo:
        data = pickle.load(fo, encoding='bytes')
    return data

def load_cifar10_data(data_dir):
    x_train = []
    y_train = []
    
    # Load training data
    for batch_id in range(1, 6):
        batch_data = unpickle(data_dir + f'data_batch_{batch_id}')
        x_train.append(batch_data[b'data'])
        y_train += batch_data[b'labels']

    # Load test data
    valid_data = unpickle(data_dir + 'test_batch')
    x_valid = valid_data[b'data']
    y_valid = valid_data[b'labels']
    
    # Convert to NumPy arrays
    x_train = np.vstack(x_train).reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
    y_train = np.array(y_train)
    x_valid = x_valid.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype('float')
    y_valid = np.array(y_valid)
    
    return x_train, y_train, x_valid, y_valid

# Specify the path to the directory where you extracted the CIFAR-10 dataset
data_dir = "/kaggle/working/cifar-10-batches-py/"

x_train, y_train, x_valid, y_valid = load_cifar10_data(data_dir)


In [5]:
len(x_train)

50000

## Preparing training data and test data

In [6]:
from sklearn.model_selection import StratifiedShuffleSplit


# Specify the number of splits and the test size (e.g., 20%)
n_splits = 1  # Number of splits
test_size = 0.08  # Size of the test set (percentage)
X = x_train
y = y_train
# Create an instance of StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)

# Perform the split
for train_index, test_index in sss.split(X, y):
    x_train_1, x_test_1 = X[train_index], X[test_index]
    y_train_1, y_test_1 = y[train_index], y[test_index]


In [7]:
len(x_test_1)

4000

In [8]:
# Specify the number of splits and the test size (e.g., 20%)
n_splits = 1  # Number of splits
test_size = 0.2  # Size of the test set (percentage)
X = x_test_1
y = y_test_1
# Create an instance of StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=42)

# Perform the split
for train_index, test_index in sss.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]


In [9]:
len(x_train)

3200

# Data Preprocessing

## Normalization
Normalization is the process of converting the pixel intensity values to a normal state.

It follows a normal distribution.

A normalized image has mean = 0 and variance = 1

In [10]:
def normalize(data, eps=1e-8): 
    data -= data.mean(axis=(1, 2, 3), keepdims=True) 
    std = np.sqrt(data.var(axis=(1, 2, 3), ddof=1, keepdims=True)) # calculating standard deviation
    std[std < eps] = 1. 
    data /= std 
    return data 
# calling the function
train_data = normalize(x_train) 
test_data = normalize(x_test) 
# prints the shape of train data and test data 
print('train_data: ', train_data.shape)
print('test_data: ', test_data.shape)


train_data:  (3200, 32, 32, 3)
test_data:  (800, 32, 32, 3)


## ZCA Whitening
Normalization is followed by a ZCA whitening process.

The main aim of whitening is to reduce data redundancy, which means the features are less correlated and have the same variance.

ZCA stands for zero-phase component analysis. ZCA whitened images resemble the normal image.



In [11]:
# Computing whitening matrix 
train_data_flat = train_data.reshape(train_data.shape[0], -1).T
test_data_flat = test_data.reshape(test_data.shape[0], -1).T
print('train_data_flat: ', train_data_flat.shape)
print('test_data_flat: ', test_data_flat.shape)
train_data_flat_t = train_data_flat.T
test_data_flat_t = test_data_flat.T

train_data_flat:  (3072, 3200)
test_data_flat:  (3072, 800)


## Principle Component Analysis (PCA)

The major function of PCA is to decompose a multivariate dataset into a set of successive orthogonal components. These orthogonal components explain a maximum amount of the variance.

PCA is a dimensionality reduction technique.

The whitened data is given as the input to PCA.



In [12]:
from sklearn.decomposition import PCA
# n_components specify the no.of components to keep
n_components_train = min(train_data_flat.shape[0], train_data_flat.shape[1])
n_components_test = min(test_data_flat.shape[0], test_data_flat.shape[1])
train_data_pca = PCA(n_components=n_components_train).fit_transform(train_data_flat)
test_data_pca = PCA(n_components=n_components_test).fit_transform(test_data_flat)
train_data_pca = train_data_pca.T
test_data_pca = test_data_pca.T

In [13]:
train_data_pca.shape

(3072, 3072)

# Model Training for Classification

## Classification Algorithms
There are various algorithms to solve the classification problems.
Few of them are as follows:
- Support Vector Machine Classifier (SVM)
- Naive Bayes Classifier
- Stochastic Gradient Descent Classifier

## Support Vector Machine (SVM)

Support Vector Machine (SVM) is effective in:
- High-dimensional spaces.
- In cases, where, the number of dimensions > the number of samples.
- In cases with a clear margin of separation.

In [14]:
from sklearn import svm #Creating a svm classifier model

clf = svm.SVC(gamma=.001,probability=True) #Model training
clf.fit(train_data_flat_t, y_train) #After being fitted, the model can then be used to predict the output.

predicted=clf.predict(test_data_flat_t)
score= clf.score(test_data_flat_t,y_test) #classification score.
print("score",score)

score 0.39125


# Evaluation

## Confusion Matrix


Confusion Matrix is a technique used to evaluate the performance of a classifier.

It visually depicts the performance in a tabular form that has two dimensions namely, actual and predicted sets of data.

The rows and columns of the table show the count of false positives, false negatives, true positives, and true negatives.

The first parameter shows true values and the second parameter shows predicted values.

In [15]:
from sklearn import metrics

conf_matrix=metrics.confusion_matrix(y_test,predicted)
print(conf_matrix)

[[35 11  5  1  3  3  2  3 15  2]
 [ 1 40  0  5  3  3  2  7  4 15]
 [ 8  4 27  9  5  6 13  3  4  1]
 [ 2  3  5 18  2 27  8  6  2  7]
 [ 4  5 18  3 23  8 10  7  1  1]
 [ 1  4  4 16  6 29  9  6  0  5]
 [ 2  5 10  7 10  6 31  5  1  3]
 [ 2  5  7  7 13  9  4 26  2  5]
 [ 4  5  1  1  0 12  0  3 47  7]
 [ 3 19  1  5  0  4  3  1  7 37]]


**Here, the diagonal elements of the confusion matrix shows the number of correctly classified labels.**

## Classification Accuracy
Classification accuracy is defined as the percentage of correct predictions.

To calculate class wise accuracy,
         CA = (correctly predicted images of a class/(Total images of the class)) * 100

In [16]:

#To see the accuracy of each class. 
accuracy=[]

leng = len(conf_matrix) #finding the length of confusion matrix
for i in range(leng): 
#each diagonal element (conf_matrix[i,i]) is divided by the sum of the elements of that particular row (conf_matrix[i].sum()).
    ac=(conf_matrix[i,i]/((conf_matrix[i].sum())+.0000001))*100 
    accuracy.append(ac)

print(accuracy)


[43.7499999453125, 49.99999993750001, 33.749999957812506, 22.499999971875003, 28.7499999640625, 36.2499999546875, 38.7499999515625, 32.499999959375, 58.7499999265625, 46.2499999421875]



Overall accuracy is given by, OA = Sum of class-wise accuracy/no of classes


In [17]:

summation=0
no_of_classes = 10

for i in range(0,len(accuracy)):
    summation+=accuracy[i]

overall_accuracy = summation/no_of_classes
print(overall_accuracy)

39.124999951093756
