In [339]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from random import shuffle
import os
import cv2
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm 
import seaborn as sns


np.set_printoptions(precision=5)  # print floats with 5 digits after the decimal point

In [340]:
# Dataset locations. Each default can be overridden through an environment
# variable so the notebook also runs on machines without this folder layout.
TRAIN_DIR = os.environ.get(
    "DOGS_VS_CATS_TRAIN_DIR",
    r"C:\Users\noa12\OneDrive\מסמכים\GitHub\Dogs_Vs_Cats_Project\train",
)
TEST_DIR = os.environ.get(
    "DOGS_VS_CATS_TEST_DIR",
    r"C:\Users\noa12\OneDrive\מסמכים\GitHub\Dogs_Vs_Cats_Project\test",
)
# Every image is resized to imgSize x imgSize pixels before modelling.
imgSize = 50

In [341]:
def label_img(img):
    """Map an image file name to its class label.

    The class word is the third dot-separated field counted from the end of
    ``img`` (``cat.0.jpg`` -> ``cat``).

    Returns:
        ``"0"`` for a cat, ``"1"`` for a dog, and ``None`` for any other
        class word.

    Raises:
        IndexError: if ``img`` has fewer than three dot-separated fields.
    """
    class_word = img.split('.')[-3]
    # Unknown class words fall through to None, as dict.get does by default.
    return {'cat': "0", 'dog': "1"}.get(class_word)

In [342]:
def create_train_data():
    """Build the labelled training set from the images in ``TRAIN_DIR``.

    Every readable image is loaded as grayscale, resized to
    ``imgSize`` x ``imgSize`` and paired with the label from ``label_img``.
    The samples are shuffled, saved to ``train_data.npy`` and returned.

    Returns:
        list: ``[pixels, label]`` pairs, where ``pixels`` is the resized
        grayscale image array and ``label`` is a NumPy array wrapping the
        value returned by ``label_img``.
    """
    training_data = []
    skipped = 0
    for img in tqdm(os.listdir(TRAIN_DIR)):
        # Read from the directory that was actually listed. The previous
        # relative "train\\" path only resolved on Windows with the project
        # folder as the working directory.
        path = os.path.join(TRAIN_DIR, img)
        pixels = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if pixels is None:
            # cv2.imread returns None for files it cannot decode; passing
            # that on to cv2.resize would abort the whole run.
            skipped += 1
            continue
        label = label_img(img)
        pixels = cv2.resize(pixels, (imgSize, imgSize))
        training_data.append([np.array(pixels), np.array(label)])
    if skipped:
        print(f"Skipped {skipped} unreadable file(s) in {TRAIN_DIR}")
    shuffle(training_data)

    # Pack the pairs into an explicit (n_samples, 2) object array before
    # saving: recent NumPy versions refuse to build an array implicitly from
    # rows whose elements have different shapes.
    packed = np.empty((len(training_data), 2), dtype=object)
    for row, (sample_pixels, sample_label) in enumerate(training_data):
        packed[row, 0] = sample_pixels
        packed[row, 1] = sample_label
    np.save('train_data.npy', packed)
    return training_data

In [343]:
def process_test_data():
    """Build the unlabelled test set from the images in ``TEST_DIR``.

    Every readable image is loaded as grayscale, resized to
    ``imgSize`` x ``imgSize`` and paired with its image number (the part of
    the file name before the first dot). The samples are shuffled, saved to
    ``test_data.npy`` and returned.

    Returns:
        list: ``[pixels, img_num]`` pairs, where ``pixels`` is the resized
        grayscale image array and ``img_num`` is a string.
    """
    testing_data = []
    skipped = 0
    for img in tqdm(os.listdir(TEST_DIR)):
        # Read from the directory that was actually listed. The previous
        # relative "test\\" path only resolved on Windows with the project
        # folder as the working directory.
        path = os.path.join(TEST_DIR, img)
        img_num = img.split('.')[0]
        pixels = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if pixels is None:
            # cv2.imread returns None for files it cannot decode; passing
            # that on to cv2.resize would abort the whole run.
            skipped += 1
            continue
        pixels = cv2.resize(pixels, (imgSize, imgSize))
        testing_data.append([np.array(pixels), img_num])
    if skipped:
        print(f"Skipped {skipped} unreadable file(s) in {TEST_DIR}")

    shuffle(testing_data)

    # Pack the pairs into an explicit (n_samples, 2) object array before
    # saving: recent NumPy versions refuse to build an array implicitly from
    # rows whose elements have different shapes.
    packed = np.empty((len(testing_data), 2), dtype=object)
    for row, (sample_pixels, sample_num) in enumerate(testing_data):
        packed[row, 0] = sample_pixels
        packed[row, 1] = sample_num
    np.save('test_data.npy', packed)
    return testing_data

```python
train_data = create_train_data()
``` 

In [344]:
# Reuse the cached dataset when it exists; otherwise build it from the raw
# images (create_train_data also writes train_data.npy for the next run), so
# the notebook survives Restart & Run All on a fresh checkout.
# allow_pickle=True is required because the file holds pickled Python objects;
# only load .npy files you created yourself, since unpickling can run code.
if os.path.exists('train_data.npy'):
    train_data = np.load('train_data.npy', allow_pickle=True)
else:
    train_data = create_train_data()

In [345]:
# Each sample is a (pixels, label) pair: collect the images into X and the
# labels into Y, keeping the sample order.
X = np.array([pixels for pixels, _ in train_data])
Y = np.array([label for _, label in train_data])

In [346]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [362]:
# Scale pixel intensities by 1/255. The dtype guard makes this cell safe to
# re-run: after the first pass the arrays are floating point, so a second
# execution leaves them alone instead of dividing by 255 again.
if not np.issubdtype(X_train.dtype, np.floating):
    X_train = X_train / 255.0
if not np.issubdtype(X_test.dtype, np.floating):
    X_test = X_test / 255.0

In [363]:
# Peek at the held-out labels (string class ids '0' / '1').
y_test

array(['0', '1', '0', ..., '1', '0', '1'], dtype='<U1')

In [364]:
# Report the shape of every split, features and labels alike.
for split_name, split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name}.shape :", split.shape)

X_train.shape : (20000, 2500)
y_train.shape : (20000,)
X_test.shape : (5000, 2500)
y_test.shape : (5000,)


In [365]:
# Flatten every image into a single feature row. The row count is taken from
# the arrays themselves and the width from imgSize, so the cell keeps working
# when the number of samples or the image size changes.
X_train = X_train.reshape((X_train.shape[0], imgSize * imgSize))
X_test = X_test.reshape((X_test.shape[0], imgSize * imgSize))
print("X_train.shape :", X_train.shape)
print("X_test.shape :", X_test.shape)

X_train.shape : (20000, 2500)
X_test.shape : (5000, 2500)


In [366]:
# Integer class id -> readable name. NOTE: label_img produces the strings
# "0"/"1", so convert a label with int(...) before looking it up here.
class_names = dict(enumerate(("cat", "dog")))

In [367]:
# Check that both splits contain roughly as many cats as dogs.
for split_name, split_labels in (("y_train", y_train), ("y_test", y_test)):
    unique, counts = np.unique(split_labels, return_counts=True)
    print(f"{split_name} :", dict(zip(unique, counts)))

y_train : {'0': 10051, '1': 9949}
y_test : {'0': 2449, '1': 2551}


In [368]:
# Baseline: a DummyClassifier that always predicts the most frequent training
# class. The real models below should score higher than this.
from sklearn.dummy import DummyClassifier
# Imported here because this cell runs before the cell that imports the
# sklearn metrics; without it a fresh kernel raises NameError.
from sklearn.metrics import accuracy_score

dummy_clf = DummyClassifier(strategy="most_frequent", random_state=42)
dummy_clf.fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)
# accuracy_score expects (y_true, y_pred).
print("The DummyClassifier score is:", accuracy_score(y_test, y_pred))

The DummyClassifier score is: 0.4898


In [369]:
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [370]:
# Logistic Regression on the flattened pixels.
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(random_state=42)
LR.fit(X_train, y_train)

y_pred = LR.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5258


In [371]:
# k-nearest neighbours with scikit-learn's default settings.
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier()
KNN.fit(X_train, y_train)

y_pred = KNN.predict(X_test)
# Both scores are rounded to three decimals for display.
weighted_precision = metrics.precision_score(y_test, y_pred, average="weighted")
precision = round(weighted_precision, 3)
accuracy = round(metrics.accuracy_score(y_test, y_pred), 3)
print("precision_score:", precision)
print("Accuracy:", accuracy)

precision_score: 0.476
Accuracy: 0.477


In [372]:
# Gaussian Naive Bayes on the flattened pixels.
from sklearn.naive_bayes import GaussianNB

NB = GaussianNB()
NB.fit(X_train, y_train)

y_pred = NB.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.4898


In [None]:
# Random forest with a fixed seed so the result is reproducible.
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(random_state=42)
RF.fit(X_train, y_train)

y_pred = RF.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for whichever model most recently assigned y_pred:
# actual classes on the y axis, predicted classes on the x axis.
cf_matrix = confusion_matrix(y_test, y_pred)
tick_names = ["cat", "dog"]

fig, ax = plt.subplots(figsize=(6, 4))
# Draw onto the axes created above; fmt='' prints the raw counts.
sns.heatmap(cf_matrix, annot=True, fmt='', cmap='Greens', ax=ax)
ax.set_title('Confusion Matrix\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
ax.xaxis.set_ticklabels(tick_names)
ax.yaxis.set_ticklabels(tick_names, rotation=0)

plt.show()

In [291]:
from xgboost import XGBClassifier

# Gradient-boosted trees. Two fixes compared with the earlier version:
#  * eval_metric='rmse' is no longer passed to fit(). It is a regression
#    metric, it is only used to score an eval_set (none is given here), and
#    recent XGBoost releases do not accept eval_metric in fit() at all.
#  * The labels are the strings '0'/'1', while recent XGBoost releases require
#    integer class ids, so the model is trained on an integer copy.
XGB = XGBClassifier(random_state=42)  # seeded like the other models
XGB.fit(X_train, y_train.astype(int))
# Convert predictions back to strings so they stay comparable with y_test.
y_pred = XGB.predict(X_test).astype(str)

precision = round(metrics.precision_score(y_test, y_pred, average="weighted"), 3)
accuracy = round(metrics.accuracy_score(y_test, y_pred), 3)
print("precision_score:", precision)
print("Accuracy:", accuracy)

precision_score: 0.656
Accuracy: 0.656


In [None]:
# Class frequencies: predictions first, then the true test labels.
for label_values in (y_pred, y_test):
    print(pd.array(label_values).value_counts())

In [None]:
# Compare the first 15 true labels with the model's predictions. The second
# line shows predictions, so it is labelled 'Predicted:' rather than 'False:'.
print('True:', y_test[0:15])
print('Predicted:', y_pred[0:15])