In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from google.colab.patches import cv2_imshow
import cv2
from skimage.feature import hog
from sklearn import svm
from scipy import stats
from sklearn.metrics import accuracy_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
import warnings
warnings.filterwarnings("ignore")

In [3]:
data_directory = "/content/drive/MyDrive/Deep_learning"

Here we are extracting the features from our pictures using HOG, by resizing our image to 128 x 64, having 9 orientations, 8 x 8 pixels per cells and 2 x 2 cells per block, we get 3780 feature in total (excluding the label), so a column for each feature is made

In [4]:
columns=[]
for ctr in range(0,3780):
    columns.append(str(ctr))

In [5]:
categories = ["CatsTrain","DogsTrain","CatsTest","DogsTest"]

def create_data(category):
    path = os.path.join(data_directory,category)
    df = pd.DataFrame (columns=columns)
    for image in os.listdir(path):
        img = cv2.imread(os.path.join(path,image),cv2.IMREAD_GRAYSCALE)
        resized_img = cv2.resize(img,(128,64))
        fd,hog_img = hog(resized_img,orientations=9,pixels_per_cell=(8,8),cells_per_block=(2,2),visualize=True)
        df.loc[len(df)] = fd.tolist()
    return df


In [6]:
def merge_ready_randomize():
    training_set = pd.DataFrame (columns=columns)
    testing_set = pd.DataFrame (columns=columns)

    training_set = pd.concat([catsTrain,dogsTrain],axis=0)
    testing_set = pd.concat([catsTest,dogsTest],axis=0)

    training_set['label'] = [1 if animal == 'cat' else 0 for animal in training_set['label']]
    testing_set['label'] = [1 if animal == 'cat' else 0 for animal in testing_set['label']]

    trainingSet = training_set.sample(frac=1,random_state = 26)
    testing_set = testing_set.sample(frac=1,random_state = 26)
  
    return training_set,testing_set

In [7]:
def split_data():
    y_train = training_set['label']
    x_train = training_set.drop(columns=['label'],axis=1)
    y_test = testing_set['label']
    x_test = testing_set.drop(columns=['label'],axis=1)
    return x_train, x_test, y_train,y_test

In [8]:
def trainSVMs(x_train,x_test,y_train,y_test):
    for name, model in supportVectorMachines:
        print("Training {modelName}".format(modelName = name))
        SVM = model.fit (x_train,y_train)
        training_error = accuracy_score(SVM.predict(x_train),y_train)
        testing_error = accuracy_score(SVM.predict(x_test),y_test)
        print("Training error is {training_error}".format(training_error=training_error))
        print("Testing error is {testing_error}".format(testing_error=testing_error))
        print('-'*45)
    print("Training an ensemble stacking classifier")
    classifier = StackingClassifier(estimators=supportVectorMachines,final_estimator=LogisticRegression(),cv=10).fit(x_train,y_train)
    print("Training error is {training_error}".format(training_error=training_error))
    print("Testing error is {testing_error}".format(testing_error=testing_error))
    print('-'*45)

In [9]:
image_dataset =[]
def create_image_dataset():
    for category in categories:
        path = os.path.join(data_directory,category)
        class_index = categories.index(category)
        if class_index== 0 or class_index==2:
            class_index = 1
        else :
            class_index=0
        for images in os.listdir(path):
            try :
                img_array= cv2.imread(os.path.join(path,images),cv2.IMREAD_GRAYSCALE)
                resized_image = cv2.resize(img_array,(128,64))
                image_dataset.append([resized_image,class_index])
            except Exception as e:
                pass

In [10]:
def pvalue_feature_selection():
    for column in training_set.columns:
        if column != 'label':
            corr,pvalue = stats.pearsonr(training_set[column],training_set['label'])
            if pvalue >=0.05:
                training_set.drop(columns=[column],axis = 1,inplace=True)
                testing_set.drop(columns=[column],axis = 1,inplace=True) 

In [11]:
def select_k_best_feature_selection(x_train,y_train):
    selector = SelectKBest(chi2)
    selected_features = selector.fit_transform(x_train,y_train)
    print(selected_features.shape)
    filter = selector.get_support()
    features = np.array(columns)
    winning_features = features[filter]
    print("Best 10 features are {features}".format(features = winning_features))
    for column in training_set:
        if column != 'label' :
            if column not in winning_features:
                training_set.drop(columns=[column],axis=1,inplace=True)
                testing_set.drop(columns=[column],axis=1,inplace=True) 

In [12]:
catsTrain = create_data(categories[0])
catsTrain['label'] = "cat"

In [13]:
dogsTrain = create_data(categories[1])
dogsTrain['label'] = "dog"

In [14]:
catsTest = create_data(categories[2])
catsTest['label'] = "cat"

In [15]:
dogsTest = create_data(categories[3])
dogsTest['label'] = "dog"

In [16]:
for data in (catsTrain, dogsTrain,catsTest,dogsTest):
    print("Data shape " , data.shape)

Data shape  (1000, 3781)
Data shape  (1000, 3781)
Data shape  (100, 3781)
Data shape  (100, 3781)


Merging each dataframe into its training/testing set

If the data is not randomized, the model will be biased to the first label that will exist in the first half of the dataset, then it will be biased to the second label that exist in the second half of the dataset

In [17]:
training_set, testing_set = merge_ready_randomize()

# There are multiple ways to select features
1. Pvalue
2. SelectKBest

# Why is feature selection necessary ?
 According to LaGrange interpolation techniques for finding a unique polynomial that 
passes through the observations, if the number of features (estimators/predictors) was 
more than the number of observations (data points), then the predictive line is not 
unique and the model is prone to overfitting, and this is the case here, 3780 features
and 2000 observations, so some features had to be removed 

1. Pvalue

In [18]:
pvalue_feature_selection()

In [19]:
x_train, x_test, y_train,y_test = split_data()

In [20]:
supportVectorMachines = {("Polynomial Kernel SVM",svm.SVC(kernel='poly')),
                         ("Linear Kernel SVM",svm.SVC(kernel='linear')),
                         ("RBF Kernel SVM",svm.SVC(kernel='rbf')),
                         ("Linear SVM",svm.LinearSVC())}

In [21]:
trainSVMs(x_train,x_test,y_train,y_test)

Training Polynomial Kernel SVM
Training error is 1.0
Testing error is 0.735
---------------------------------------------
Training Linear Kernel SVM
Training error is 0.9185
Testing error is 0.73
---------------------------------------------
Training RBF Kernel SVM
Training error is 0.9485
Testing error is 0.725
---------------------------------------------
Training Linear SVM
Training error is 0.976
Testing error is 0.725
---------------------------------------------
Training an ensemble stacking classifier
Training error is 0.976
Testing error is 0.725
---------------------------------------------


2. SelectKBest

In [22]:
training_set, testing_set = merge_ready_randomize()

In [23]:
x_train, x_test, y_train,y_test = split_data()

In [24]:
select_k_best_feature_selection(x_train,y_train)

(2000, 10)
Best 10 features are ['92' '587' '614' '879' '906' '987' '1014' '3473' '3491' '3626']


In [25]:
trainSVMs(x_train,x_test,y_train,y_test)

Training Polynomial Kernel SVM
Training error is 1.0
Testing error is 0.705
---------------------------------------------
Training Linear Kernel SVM
Training error is 0.997
Testing error is 0.655
---------------------------------------------
Training RBF Kernel SVM
Training error is 0.965
Testing error is 0.745
---------------------------------------------
Training Linear SVM
Training error is 1.0
Testing error is 0.675
---------------------------------------------
Training an ensemble stacking classifier
Training error is 1.0
Testing error is 0.675
---------------------------------------------


Trying a CNN, but it is prone to overfitting due to the very small size of the dataset

In [26]:
create_image_dataset()

In [27]:
import random
random.shuffle(image_dataset)

In [28]:
X=[]
Y=[]
for features, labels in image_dataset:
    X.append(features)
    Y.append(labels)

In [29]:
X= np.array(X).reshape(-1,128,64,1)
Y = np.array(Y).reshape(2200,1)

In [30]:
X.shape

(2200, 128, 64, 1)

In [31]:
Y.shape

(2200, 1)

In [32]:
x_train,x_test,y_train,y_test= train_test_split(X,Y,test_size=0.1, random_state=42,stratify=Y)

In [33]:
x_train = x_train/255
x_test=x_test/255

In [34]:
model = tf.keras.models.Sequential([tf.keras.layers.Conv2D(32,(3,3),activation = 'relu',input_shape=(128,64,1)),
                                    tf.keras.layers.MaxPool2D(2,2),
                                    tf.keras.layers.Conv2D(64,(3,3),activation='relu'),
                                  tf.keras.layers.Flatten(),
                                  tf.keras.layers.Dense(64,activation='relu'),
                                  tf.keras.layers.Dense(8,activation='relu'),
                                   tf.keras.layers.Dense(1,activation='sigmoid')])

In [35]:
model.compile(optimizer=tf.keras.optimizers.SGD(lr=0.01,momentum=0.8),loss = tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

In [36]:
model.fit(x_train,y_train,batch_size=32,epochs=30,verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f508e27ca10>

In [37]:
model.evaluate(x_test,y_test)



[1.5957133769989014, 0.5]