<a href="https://colab.research.google.com/github/SebastianArriagadaS/unsupervised_ml/blob/main/Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import statistics as st
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
import cv2
import random

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import Conv2D,MaxPool2D,Dropout,Flatten,Dense,BatchNormalization

In [3]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
df_train=pd.read_csv("/content/drive/MyDrive/Deep Learning/data/Train.csv")
df_train['Path']=df_train.Path.apply(lambda x: "/content/drive/MyDrive/Deep Learning/data/"+x) ## change and normalize path
df_train['ClassId']=df_train.ClassId.astype(str)

# Trying with random data

In [9]:
##df_train_small=df_train.head(500) au cas ou 
c=random.sample((range(len(df_train))), 2000)
df_train_small=df_train.iloc[c,:]

In [11]:
def weighted_knn_classification(x_test, x_train, y_train,minlen,k=20):
  y_pred=pd.DataFrame(columns=["ClassId"])
  for i in x_test.index.to_list():
    dist=np.linalg.norm(x_train-x_test.loc[i],axis=1)
    indexs=dist.argsort()[:k]
    dist=np.sort(dist)[:k]
    weights=1./dist
    a=np.bincount(y_train.iloc[indexs],weights=weights,minlength=minlen)
    class_pred=a.argsort()[-1]
    y_pred.loc[i]=class_pred
  return y_pred

This is a weighted KNN classification, it means that it counts the K-nearest-neighbors of our data value, and add a "weight" to it : the distance. Then, the more a point is to our data point, the more chance it has to be counted in its class (and this is "logic" : if you are close to lot of points in the same class, there is a big chance that you are in the same class too).

In [12]:
def image_to_feature_vector(image, size=(32, 32)):
	# resize the image to a fixed size, then flatten the image into a list of raw pixel intensities
	return cv2.resize(image, size).flatten()

In [13]:
def img_to_array(df):
  im = cv2.imread(df_train.iloc[0,7])
  df_im=image_to_feature_vector(im)
  for i in range(1,len(df)):
    im = cv2.imread(df_train_small.iloc[i,7]) ## read the image path
    im=image_to_feature_vector(im)
    df_im=np.vstack([df_im,im])
  return(df_im)

In [None]:
df_train_small
## data frame of random images flattened.

In [None]:
%%time 
#df_im=img_to_array(df_train_small) ## takes 2m30 for 500 images

In [None]:
#x_train_small, x_test_small, y_train_small, y_test_small = train_test_split(df_im,df_train_small.iloc[:,6],test_size=0.3)

In [None]:
#y_test_small=pd.DataFrame(y_test_small,columns=["ClassId"])
#y_test_small=y_test_small.astype(int)

In [None]:
#x_train_small=pd.DataFrame(x_train_small)
#x_test_small=pd.DataFrame(x_test_small)
#y_pred_small=weighted_knn_regression(x_test_small,x_train_small,y_train_small)

In [None]:
#y_test_small.index=range(len(y_test_small))

In [None]:
#abs(y_pred_small-y_test_small).value_counts()

As we can see, our classification function is not that good. The main reason could be that our "random" data images can disturb the way the algorithm is learning : the KNN classification is based on the distance and the class of a lot of points. If we take random values like in this case, there is a big chance that the data are **scattered**, so it could be difficult to have enough neighbors to have a good estimation of our data point.

# Trying with a reduce dataframe and less classes.

As we saw previously, the "random" way has poor results. Maybe of we take a reduced dataframe, wth 2000 data and less classes (let's say 6 instead of 42), we could have better results. 

In [14]:
df_test=pd.read_csv("/content/drive/MyDrive/Deep Learning/data/Test.csv")
df_test['Path']=df_test.Path.apply(lambda x: "/content/drive/MyDrive/Deep Learning/data/"+x) ## change and normalize path
df_test['ClassId']=df_test.ClassId.astype(str)

In [15]:
df_test["ClassId"].value_counts()

2     750
1     720
13    720
38    690
12    690
4     660
10    660
5     630
9     480
25    480
3     450
8     450
7     450
11    420
35    390
18    390
17    360
31    270
14    270
15    210
33    210
26    180
30    150
16    150
28    150
6     150
23    150
22    120
36    120
34    120
24     90
29     90
20     90
40     90
21     90
39     90
42     90
32     60
27     60
41     60
19     60
0      60
37     60
Name: ClassId, dtype: int64

In [None]:
df_train["ClassId"].value_counts()

Let's keep the classes 2, 1, 12, 13 and 38, because we have a lot of test values. As we just want for the moment to use a little data (to see if our classification model is "good", we will keep around 10% of our test dataframe, and keep around 2000 images from our train dataframe :

In [16]:
list_of_values=["1","2","12","13","38"]
df_train_small_2 = df_train[df_train['ClassId'].isin(list_of_values)]
df_test_small_2 = df_test[df_test['ClassId'].isin(list_of_values)]

In [17]:
df_train_small_2.index=range(len(df_train_small_2))
df_train_small_2

Unnamed: 0,Width,Height,Roi.X1,Roi.Y1,Roi.X2,Roi.Y2,ClassId,Path
0,56,57,6,5,51,52,1,/content/drive/MyDrive/Deep Learning/data/Trai...
1,55,57,5,6,50,52,1,/content/drive/MyDrive/Deep Learning/data/Trai...
2,56,56,5,6,51,51,1,/content/drive/MyDrive/Deep Learning/data/Trai...
3,58,58,6,6,53,52,1,/content/drive/MyDrive/Deep Learning/data/Trai...
4,59,59,6,6,54,53,1,/content/drive/MyDrive/Deep Learning/data/Trai...
...,...,...,...,...,...,...,...,...
10795,72,72,7,7,66,66,38,/content/drive/MyDrive/Deep Learning/data/Trai...
10796,81,82,7,8,74,75,38,/content/drive/MyDrive/Deep Learning/data/Trai...
10797,87,93,7,9,80,85,38,/content/drive/MyDrive/Deep Learning/data/Trai...
10798,107,107,9,9,98,98,38,/content/drive/MyDrive/Deep Learning/data/Trai...


In [18]:
df_test_small_2.index=range(len(df_test_small_2))
df_test_small_2

Unnamed: 0,Width,Height,Roi.X1,Roi.Y1,Roi.X2,Roi.Y2,ClassId,Path
0,42,45,5,5,36,40,1,/content/drive/MyDrive/Deep Learning/data/Test...
1,48,52,6,6,43,47,38,/content/drive/MyDrive/Deep Learning/data/Test...
2,52,56,5,5,47,51,38,/content/drive/MyDrive/Deep Learning/data/Test...
3,32,33,5,5,26,28,12,/content/drive/MyDrive/Deep Learning/data/Test...
4,38,37,6,5,33,32,12,/content/drive/MyDrive/Deep Learning/data/Test...
...,...,...,...,...,...,...,...,...
3565,64,64,6,6,59,59,38,/content/drive/MyDrive/Deep Learning/data/Test...
3566,34,34,5,5,29,29,1,/content/drive/MyDrive/Deep Learning/data/Test...
3567,63,57,6,5,58,52,13,/content/drive/MyDrive/Deep Learning/data/Test...
3568,35,35,6,5,29,30,12,/content/drive/MyDrive/Deep Learning/data/Test...


We have around 25% test and 75% train data here. Our training dataframe has around 9000 data, so it's still long to compute (around 20 min for 2000 data). Let's say we keep 20% of our data first.

In [None]:
df_train_small_2["ClassId"].value_counts()/5

In [19]:
class1_count=0
class2_count=0
class12_count=0
class13_count=0
class38_count=0
df_train_smaller=pd.DataFrame()
for i in range(len(df_train_small_2)):
  if df_train_small_2.iloc[i,6]=="1" and class1_count < 450:
    df_train_smaller=df_train_smaller.append(df_train_small_2.iloc[i,:])
    class1_count=class1_count+1
  if df_train_small_2.iloc[i,6]=="2" and class2_count < 444:
    df_train_smaller=df_train_smaller.append(df_train_small_2.iloc[i,:])
    class2_count=class2_count+1
  if df_train_small_2.iloc[i,6]=="12" and class12_count < 432:
    df_train_smaller=df_train_smaller.append(df_train_small_2.iloc[i,:])
    class12_count=class12_count+1
  if df_train_small_2.iloc[i,6]=="13" and class13_count < 420:
    df_train_smaller=df_train_smaller.append(df_train_small_2.iloc[i,:])
    class13_count=class13_count+1
  if df_train_small_2.iloc[i,6]=="38" and class38_count < 414:
    df_train_smaller=df_train_smaller.append(df_train_small_2.iloc[i,:])
    class38_count=class38_count+1

In [20]:
df_train_smaller.index=range(len(df_train_smaller))
df_train_smaller

Unnamed: 0,Width,Height,Roi.X1,Roi.Y1,Roi.X2,Roi.Y2,ClassId,Path
0,56.0,57.0,6.0,5.0,51.0,52.0,1,/content/drive/MyDrive/Deep Learning/data/Trai...
1,55.0,57.0,5.0,6.0,50.0,52.0,1,/content/drive/MyDrive/Deep Learning/data/Trai...
2,56.0,56.0,5.0,6.0,51.0,51.0,1,/content/drive/MyDrive/Deep Learning/data/Trai...
3,58.0,58.0,6.0,6.0,53.0,52.0,1,/content/drive/MyDrive/Deep Learning/data/Trai...
4,59.0,59.0,6.0,6.0,54.0,53.0,1,/content/drive/MyDrive/Deep Learning/data/Trai...
...,...,...,...,...,...,...,...,...
2155,38.0,40.0,5.0,6.0,33.0,34.0,38,/content/drive/MyDrive/Deep Learning/data/Trai...
2156,41.0,41.0,5.0,5.0,36.0,36.0,38,/content/drive/MyDrive/Deep Learning/data/Trai...
2157,43.0,44.0,5.0,6.0,38.0,39.0,38,/content/drive/MyDrive/Deep Learning/data/Trai...
2158,46.0,47.0,6.0,6.0,41.0,42.0,38,/content/drive/MyDrive/Deep Learning/data/Trai...


In [21]:
def img_to_array(df):
  im = cv2.imread(df.iloc[0,7])
  df_im=image_to_feature_vector(im)
  for i in range(1,len(df)):
    im = cv2.imread(df.iloc[i,7]) ## read the image path
    im=image_to_feature_vector(im)
    df_im=np.vstack([df_im,im])
  return(df_im)

In [22]:
%%time
df_im2=img_to_array(df_train_smaller) 

CPU times: user 7.43 s, sys: 1.62 s, total: 9.05 s
Wall time: 9min 3s


In [23]:
df_im2.shape

(2160, 3072)

In [24]:
df_test_small_2["ClassId"].value_counts()/5

2     150.0
1     144.0
13    144.0
38    138.0
12    138.0
Name: ClassId, dtype: float64

In [25]:
class1_count=0
class2_count=0
class12_count=0
class13_count=0
class38_count=0
df_test_smaller=pd.DataFrame()
for i in range(len(df_test_small_2)):
  if df_test_small_2.iloc[i,6]=="1" and class1_count < 150:
    df_test_smaller=df_test_smaller.append(df_test_small_2.iloc[i,:])
    class1_count=class1_count+1
  if df_test_small_2.iloc[i,6]=="2" and class2_count < 144:
    df_test_smaller=df_test_smaller.append(df_test_small_2.iloc[i,:])
    class2_count=class2_count+1
  if df_test_small_2.iloc[i,6]=="12" and class12_count < 138:
    df_test_smaller=df_test_smaller.append(df_test_small_2.iloc[i,:])
    class12_count=class12_count+1
  if df_test_small_2.iloc[i,6]=="13" and class13_count < 144:
    df_test_smaller=df_test_smaller.append(df_test_small_2.iloc[i,:])
    class13_count=class13_count+1
  if df_test_small_2.iloc[i,6]=="38" and class38_count < 138:
    df_test_smaller=df_test_smaller.append(df_test_small_2.iloc[i,:])
    class38_count=class38_count+1

In [26]:
df_test_smaller.index=range(len(df_test_smaller))

In [27]:
%%time
df_im2_test=img_to_array(df_test_smaller) 

CPU times: user 1.77 s, sys: 443 ms, total: 2.22 s
Wall time: 2min 42s


In [28]:
y_train2=df_train_smaller["ClassId"]
y_test2=df_test_smaller["ClassId"]

In [29]:
x_train2=pd.DataFrame(df_im2)
x_test2=pd.DataFrame(df_im2_test)
y_pred2=weighted_knn_classification(x_test2,x_train2,y_train2,5)

In [30]:
y_test2=pd.DataFrame(y_test2,columns=["ClassId"])
y_test2=y_test2.astype(int)
y_test2

Unnamed: 0,ClassId
0,1
1,38
2,38
3,12
4,12
...,...
709,1
710,1
711,1
712,1


In [31]:
abs(y_pred2-y_test2).value_counts()/len(y_pred2)

ClassId
0          0.406162
1          0.154062
11         0.089636
26         0.088235
36         0.061625
37         0.061625
12         0.057423
25         0.056022
10         0.025210
dtype: float64

In [35]:
y_pred2.value_counts()

ClassId
1          220
38         207
13         120
12          90
2           77
dtype: int64