In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the labelled training table and the test table from the working directory.
# Both files are expected to provide an `Image` (file name) and a `Label` column.

train_df = pd.read_csv(filepath_or_buffer="train.csv")
test_df = pd.read_csv(filepath_or_buffer="test.csv")

In [4]:
# Preview the first five rows of the training table.
train_df.head(n=5)

Unnamed: 0,Image,Label
0,r40fgp34fag8uesz98fz.jpg,7
1,fy3h2r5zsea89vcy131g.jpg,4
2,3bxkbk15zjligtg4aymr.jpg,0
3,lygdwa0bio4fitq48n35.jpg,0
4,240le483tt2ebwtuki7d.jpg,0


In [5]:
# Preview the first five rows of the test table.
test_df.head(n=5)

Unnamed: 0,Image,Label
0,kc0yq3aoedc1b0v05gwp.jpg,0
1,udxt71dqhpo3je6alnu3.jpg,0
2,n05xwwbvd8t0k2s6vxim.jpg,0
3,fe2udf0166l7e914798x.jpg,0
4,royn55uoa0oc330cmivt.jpg,0


In [7]:
# Number of training images per class label, most frequent first.
train_df["Label"].value_counts()

Label
4    3719
0    1519
3    1335
7    1236
2    1127
9    1122
8     989
1     719
5     678
6     268
Name: count, dtype: int64

In [8]:
# Split the labelled data into a training part (70%) and a validation part (30%).
# NOTE: `X_test` / `y_test` hold the *validation* split carved out of train.csv;
# they are unrelated to `test_df`.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_df["Image"],
    train_df["Label"],
    test_size=0.3,
    random_state=42,              # fixed seed so the split is reproducible
    stratify=train_df["Label"],   # keep the class proportions in both parts
)

# Report the size of every piece of the split.
for split_label, split_values in (
    ("Shape of the X_train: ", X_train),
    ("Shape of the y_train: ", y_train),
    ("Shape of the X_test: ", X_test),
    ("Shape of the y_test: ", y_test),
):
    print(split_label, split_values.shape)

Shape of the X_train:  (8898,)
Shape of the y_train:  (8898,)
Shape of the X_test:  (3814,)
Shape of the y_test:  (3814,)


In [9]:
# Re-attach each split's labels to its image file names, side by side as columns.
train_df_concat = pd.concat(objs=[X_train, y_train], axis="columns")
valid_df_concat = pd.concat(objs=[X_test, y_test], axis="columns")

# Show both frames (pandas truncates the printed rows).
for split_frame in (train_df_concat, valid_df_concat):
    print(split_frame)

                          Image  Label
8717   m5ck4bguqh4g2fztahqd.jpg      5
6085   dp4rgofraohw80j4qym6.jpg      3
250    uuq0xmlbkykc4tndpupr.jpg      4
10365  v3pi554xr81yuu3ckrnu.jpg      0
3554   fqcljihfbvj72u0vptk0.jpg      7
...                         ...    ...
4417   q58qs4z5hgddmsw4pwnp.jpg      2
11998  7c66my688bcz5tw5cpq8.jpg      9
2793   zqr8d3m62lk03ijhq13d.jpg      4
773    oo7mq4gwqt9eoxw97q9m.jpg      3
5010   mk2hk6pbnk1m0dwxqlk6.jpg      8

[8898 rows x 2 columns]
                          Image  Label
9476   skey3g1zox7akwnf0tza.jpg      3
10710  2ljlb26p0sjqlfovyd01.jpg      4
3970   bz6p02mhz6syt67pyau0.jpg      4
7514   illk609tl7kz0082al6l.jpg      8
8960   ftkv3jqtic9m0o8p7zx1.jpg      6
...                         ...    ...
11146  hxz9efal5w2sfnoclwa9.jpg      3
12276  tccxpq7o32s5kp3g4f3i.jpg      5
10915  ow4ldjzey5ffm1hx5j3c.jpg      8
3447   2zvlzz9xu47s9jh52qn3.jpg      3
11711  kjlc7g054czhjhlpygne.jpg      4

[3814 rows x 2 columns]


In [12]:
# Sanity check: (rows, columns) of the training and validation frames.
for split_frame in (train_df_concat, valid_df_concat):
    print(split_frame.shape)

(8898, 2)
(3814, 2)


In [28]:
# Function to convert images to 1D vector
import os
from PIL import Image
from tqdm import tqdm

def function_1dVector(path, dataset, size=(128, 128)):
    """Load every image listed in ``dataset.Image`` as a flat, normalized vector.

    Each file is opened from ``path``, converted to mode ``"L"`` (single-channel
    grayscale), resized to ``size``, inverted (``255 - pixel``), scaled by
    ``1 / 255`` and flattened to one dimension.

    Parameters
    ----------
    path : str
        Directory that contains the image files.
    dataset : object with an ``Image`` attribute
        Typically a DataFrame whose ``Image`` column holds the file names; the
        names are iterated in order.
    size : tuple of int, optional
        Target ``(width, height)`` passed to ``resize``. Defaults to
        ``(128, 128)``, the value that was previously hard-coded.

    Returns
    -------
    list of numpy.ndarray
        One 1-D float array per image, in the same order as ``dataset.Image``,
        each of length ``width * height``.
    """
    vectors = []
    for file in tqdm(dataset.Image):
        # The context manager guarantees the file handle is released even when
        # decoding fails part-way; convert() reads the pixel data, so the
        # grayscale copy stays usable after the file has been closed.
        with Image.open(os.path.join(path, file)) as img:
            grayscale = img.convert("L")
        image_array = np.array(grayscale.resize(size))
        # Invert the intensities, then scale them into [0, 1].
        image_array = 255 - image_array
        normalized_image = image_array / 255.0
        vectors.append(normalized_image.flatten())

    return vectors

In [30]:
# Vectorize the images of the training split (files live in the 'train' folder).
train_1dData = function_1dVector(path='train', dataset=train_df_concat)

100%|██████████| 8898/8898 [00:25<00:00, 343.25it/s]


In [31]:
# Vectorize the validation split; it was carved out of train.csv, so its images
# are read from the same 'train' folder.
val_1dData = function_1dVector(path='train', dataset=valid_df_concat)

100%|██████████| 3814/3814 [00:06<00:00, 582.49it/s]
