In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from google.colab.patches import cv2_imshow
import cv2
from skimage.feature import hog
from sklearn import svm
from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
import warnings
warnings.filterwarnings("ignore")

In [3]:
data_directory = "/content/drive/MyDrive/Deep_learning"

Here we are extracting the features from our pictures using HOG, by resizing our image to 128 x 64, having 9 orientations, 8 x 8 pixels per cells and 2 x 2 cells per block, we get 3780 feature in total (excluding the label), so a column for each feature is made

In [4]:
columns=[]
for ctr in range(0,3780):
    columns.append(str(ctr))

In [5]:
categories = ["CatsTrain","DogsTrain","CatsTest","DogsTest"]

def create_data(category):
    path = os.path.join(data_directory,category)
    df = pd.DataFrame (columns=columns)
    for image in os.listdir(path):
        img = cv2.imread(os.path.join(path,image),cv2.IMREAD_GRAYSCALE)
        resized_img = cv2.resize(img,(128,64))
        fd,hog_img = hog(resized_img,orientations=9,pixels_per_cell=(8,8),cells_per_block=(2,2),visualize=True)
        df.loc[len(df)] = fd.tolist()
    return df


In [6]:
catsTrain = create_data(categories[0])
catsTrain['label'] = "cat"

In [7]:
dogsTrain = create_data(categories[1])
dogsTrain['label'] = "dog"

In [8]:
catsTest = create_data(categories[2])
catsTest['label'] = "cat"

In [9]:
dogsTest = create_data(categories[3])
dogsTest['label'] = "dog"

In [10]:
for data in (catsTrain, dogsTrain,catsTest,dogsTest):
   print("Data shape " , data.shape)

Data shape  (1000, 3781)
Data shape  (1000, 3781)
Data shape  (100, 3781)
Data shape  (100, 3781)


Merging each dataframe into its training/testing set

If the data is not randomized, the model will be biased to the first label that will exist in the first half of the dataset, then it will be biased to the second label of the second half of the dataset

In [None]:
def merge_ready_randomize():
  trainingSet = pd.DataFrame (columns=columns)
  testingSet = pd.DataFrame (columns=columns)

  trainingSet = pd.concat([catsTrain,dogsTrain],axis=0)
  testingSet = pd.concat([catsTest,dogsTest],axis=0)

  trainingSet['label'] = [1 if animal == 'cat' else 0 for animal in trainingSet['label']]
  testingSet['label'] = [1 if animal == 'cat' else 0 for animal in testingSet['label']]

  trainingSet = trainingSet.sample(frac=1,random_state = 42)
  testingSet = testingSet.sample(frac=1,random_state = 42)
  
  return trainingSet,testingSet

In [None]:
trainingSet, testingSet = merge_ready_randomize()

# There are multiple ways to select features
1. Pvalue
2. SelectKBest

# Why is feature selection necessary ?

In [54]:
for column in trainingSet.columns:
    if column != 'label':
        corr,pvalue = stats.pearsonr(trainingSet[column],trainingSet['label'])
        if pvalue >=0.05:
            trainingSet.drop(columns=[column],axis = 1,inplace=True)
            testingSet.drop(columns=[column],axis = 1,inplace=True) 

In [55]:
y_train = trainingSet['label']
x_train = trainingSet.drop(columns=['label'],axis=1)

In [56]:
y_test = testingSet['label']
x_test = testingSet.drop(columns=['label'],axis=1)

In [64]:
supportVectorMachines = {("Polynomial Kernel SVM",svm.SVC(kernel='poly')),
                         ("Linear Kernel SVM",svm.SVC(kernel='linear')),
                         ("RBF Kernel SVM",svm.SVC(kernel='rbf')),
                         ("Linear SVM",svm.LinearSVC())}

In [65]:
def trainSVMs(x_train,x_test,y_train,y_test):
   for name, model in supportVectorMachines:
     print("Training {modelName}".format(modelName = name))
     SVM = model.fit (x_train,y_train)
     training_error = accuracy_score(SVM.predict(x_train),y_train)
     testing_error = accuracy_score(SVM.predict(x_test),y_test)
     print("Training error is {training_error}".format(training_error=training_error))
     print("Testing error is {testing_error}".format(testing_error=testing_error))
     print('-'*45)
   print("Training an ensemble stacking classifier")
   classifier = StackingClassifier(estimators=supportVectorMachines,final_estimator=LogisticRegression(),cv=10).fit(x_train,y_train)
   print("Training error is {training_error}".format(training_error=training_error))
   print("Testing error is {testing_error}".format(testing_error=testing_error))
   print('-'*45)

In [66]:
trainSVMs(x_train,x_test,y_train,y_test)

Training Polynomial Kernel SVM
Training error is 1.0
Testing error is 0.735
---------------------------------------------
Training Linear SVM
Training error is 0.976
Testing error is 0.725
---------------------------------------------
Training Linear Kernel SVM
Training error is 0.919
Testing error is 0.73
---------------------------------------------
Training RBF Kernel SVM
Training error is 0.9485
Testing error is 0.725
---------------------------------------------
Training an ensemble stacking classifier
Training error is 0.9485
Testing error is 0.725
---------------------------------------------


In [None]:
selector = SelectKBest(chi2)

In [None]:
x_train = trainingSet.drop(columns=['label'],axis=1)
y_train = trainingSet['label']

In [None]:
selected_features = selector.fit_transform(x_train,y_train)

In [None]:
selected_features.shape

(2000, 10)

In [None]:
featureDiscarder = selector.get_support()

In [None]:
features = np.array(columns)

In [None]:
winningFeatures = features[featureDiscarder]

In [None]:
winningFeatures

array(['92', '587', '614', '879', '906', '987', '1014', '3473', '3491',
       '3626'], dtype='<U4')

In [None]:
for column in trainingSet:
    if column != 'label' :
        if column not in winningFeatures:
            trainingSet.drop(columns=[column],axis=1,inplace=True)
            testingSet.drop(columns=[column],axis=1,inplace=True)

In [None]:
y_train = trainingSet['label']
x_train = trainingSet.drop(columns=['label'],axis=1)

In [None]:
y_test = testingSet['label']
x_test = testingSet.drop(columns=['label'],axis=1)

In [None]:
svc = svm.SVC(kernel='linear').fit(x_train, y_train)
lin_svc = svm.LinearSVC().fit(x_train, y_train)
rbf_svc = svm.SVC(kernel='rbf').fit(x_train, y_train)
poly_svc = svm.SVC(kernel='poly').fit(x_train, y_train)

In [None]:
print(accuracy_score(svc.predict(x_test),y_test))
print(accuracy_score(lin_svc.predict(x_test),y_test))
print(accuracy_score(rbf_svc.predict(x_test),y_test))
print(accuracy_score(poly_svc.predict(x_test),y_test))

0.605
0.6
0.585
0.595


In [None]:
training_data =[]
def create_training_data():
    for category in categories:
        path = os.path.join(data_directory,category)
        class_index = categories.index(category)
        if class_index== 0 or class_index==2:
            class_index = 1
        else :
            class_index=0
        for images in os.listdir(path):
            try :
                img_array= cv2.imread(os.path.join(path,images),cv2.IMREAD_GRAYSCALE)
                new_image = cv2.resize(img_array,(128,64))
                training_data.append([new_image,class_index])
            except Exception as e:
                pass

In [None]:
create_training_data()

In [None]:
import random
random.shuffle(training_data)

In [None]:
X=[]
Y=[]
for features, labels in training_data:
    X.append(features)
    Y.append(labels)

In [None]:
X= np.array(X).reshape(-1,128,64,1)
Y = np.array(Y).reshape(2200,1)

In [None]:
X.shape

(2200, 128, 64, 1)

In [None]:
Y.shape

(2200, 1)

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.1, random_state=42,stratify=Y)

In [None]:
x_train = x_train/255
x_test=x_test/255

In [None]:
import tensorflow as tf
import keras

In [None]:
model = tf.keras.models.Sequential([tf.keras.layers.Conv2D(32,(3,3),activation = 'relu',input_shape=(128,64,1)),
                                    tf.keras.layers.MaxPool2D(2,2),
                                    tf.keras.layers.Conv2D(64,(3,3),activation='relu'),
                                  tf.keras.layers.Flatten(),
                                  tf.keras.layers.Dense(64,activation='relu'),
                                  tf.keras.layers.Dense(8,activation='relu'),
                                   tf.keras.layers.Dense(1,activation='sigmoid')])

In [None]:
model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.01,momentum=0.8),loss = tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

In [None]:
model.fit(x_train,y_train,batch_size=32,epochs=50,verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f167c1da050>

In [None]:
model.evaluate(x_test,y_test)



[2.5243170261383057, 0.581818163394928]