<a href="https://colab.research.google.com/github/SebastianArriagadaS/unsupervised_ml/blob/main/Supervised_Image_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Image classification using deep clustering
The purpose of the project is to use unsupervised image classification techniques on the German traffic sign dataset. A comparison between different state-of-the-art models is required regarding the accuracy, A theoretical comparison justifying the choice of the model is also acceptable.

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
import tensorflow as tf
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import Conv2D,MaxPool2D,Dropout,Flatten,Dense,BatchNormalization

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
nb_class = 43

## Train

In [5]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
datagen = ImageDataGenerator(rescale=1/255.0, validation_split=0.2)

In [6]:
path = '/content/gdrive/MyDrive/Cours/MAM5A/ELTE/Advanced Machine Learning/Unsupervized Clustering on Image/data/'
df_train = pd.read_csv(path + 'Train.csv')
df_train.head() 

Unnamed: 0,Width,Height,Roi.X1,Roi.Y1,Roi.X2,Roi.Y2,ClassId,Path
0,27,26,5,5,22,20,20,Train/20/00020_00000_00000.png
1,28,27,5,6,23,22,20,Train/20/00020_00000_00001.png
2,29,26,6,5,24,21,20,Train/20/00020_00000_00002.png
3,28,27,5,6,23,22,20,Train/20/00020_00000_00003.png
4,28,26,5,5,23,21,20,Train/20/00020_00000_00004.png


In [7]:
print(df_train.shape)
df_train = df_train[df_train['ClassId'] < nb_class]
print(df_train.shape)

(39209, 8)
(39209, 8)


In [8]:
df_train['Path']=df_train.Path.apply(lambda x: path + x)
df_train['ClassId']=df_train.ClassId.astype(str)

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39209 entries, 0 to 39208
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Width    39209 non-null  int64 
 1   Height   39209 non-null  int64 
 2   Roi.X1   39209 non-null  int64 
 3   Roi.Y1   39209 non-null  int64 
 4   Roi.X2   39209 non-null  int64 
 5   Roi.Y2   39209 non-null  int64 
 6   ClassId  39209 non-null  object
 7   Path     39209 non-null  object
dtypes: int64(6), object(2)
memory usage: 2.7+ MB


In [10]:
width, height = 50,50
trainDatagen = datagen.flow_from_dataframe(df_train, directory=None, x_col='Path', y_col='ClassId',
                                           target_size=(width,height), class_mode = 'categorical', batch_size = 16, 
                                           subset='training')

x, y = trainDatagen.next()
x.shape, y.shape

Found 31368 validated image filenames belonging to 43 classes.


((16, 50, 50, 3), (16, 43))

## Test

In [11]:
path = '/content/gdrive/MyDrive/Cours/MAM5A/ELTE/Advanced Machine Learning/Unsupervized Clustering on Image/data/'
df_test = pd.read_csv(path + 'Test.csv')
df_test.head() 

Unnamed: 0,Width,Height,Roi.X1,Roi.Y1,Roi.X2,Roi.Y2,ClassId,Path
0,53,54,6,5,48,49,16,Test/00000.png
1,42,45,5,5,36,40,1,Test/00001.png
2,48,52,6,6,43,47,38,Test/00002.png
3,27,29,5,5,22,24,33,Test/00003.png
4,60,57,5,5,55,52,11,Test/00004.png


In [12]:
print(df_test.shape)
df_test = df_test[df_test['ClassId'] < nb_class]
print(df_test.shape)

(12630, 8)
(12630, 8)


In [13]:
df_test['Path']=df_test.Path.apply(lambda x: path + x)
df_test['ClassId']=df_test.ClassId.astype(str)

In [14]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39209 entries, 0 to 39208
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Width    39209 non-null  int64 
 1   Height   39209 non-null  int64 
 2   Roi.X1   39209 non-null  int64 
 3   Roi.Y1   39209 non-null  int64 
 4   Roi.X2   39209 non-null  int64 
 5   Roi.Y2   39209 non-null  int64 
 6   ClassId  39209 non-null  object
 7   Path     39209 non-null  object
dtypes: int64(6), object(2)
memory usage: 2.7+ MB


In [15]:
width, height = 50,50
testDatagen = datagen.flow_from_dataframe(df_test, directory=None, x_col='Path', y_col='ClassId',
                                           target_size=(width,height), class_mode = 'categorical', batch_size = 16, 
                                           subset='training')

x, y = testDatagen.next()
x.shape, y.shape

Found 10104 validated image filenames belonging to 43 classes.


((16, 50, 50, 3), (16, 43))

## Different Models

### Arbiratory CNN

In [16]:
from tensorflow.keras import models, layers

In [17]:
model = models.Sequential() #Sequential Model

#ConvLayer(64 filters) + MaxPooling + BatchNormalization + Dropout
model.add(layers.Conv2D(filters=32,kernel_size=3,activation='relu',padding='same',input_shape=(50, 50, 3)))
model.add(layers.MaxPool2D(strides=2))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.3))

#Flatten
model.add(layers.Flatten())

#Dense layer with 1000 hidden units
model.add(layers.Dense(1000,activation='relu'))

#Softmax layer for output
model.add(layers.Dense(nb_class,activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 50, 50, 32)        896       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 25, 25, 32)       0         
 )                                                               
                                                                 
 batch_normalization (BatchN  (None, 25, 25, 32)       128       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 25, 25, 32)        0         
                                                                 
 flatten (Flatten)           (None, 20000)             0         
                                                                 
 dense (Dense)               (None, 1000)              2

In [None]:
model.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

history= model.fit(trainDatagen, epochs=20, batch_size=64,
                 validation_data=testDatagen)

Epoch 1/20


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline 
plt.plot(history.history["accuracy"])
plt.plot(history.history["val_accuracy"])
plt.title("Modelaccuracy")
plt.ylabel("Accuracy")
plt.xlabel("Epoch")
plt.legend(["Train","Test"],loc="upper left")
plt.show()

plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("Model loss")
plt.ylabel("loss")
plt.xlabel("Epoch")
plt.legend(["Train","Test"],loc="upper left")
plt.show()