# Creating Dataset

This notebook downloads the dataset used to train our model. It then stores the model weights on Google Cloud Storage (GCS).

## Configuring the Google Cloud project in this notebook



In [1]:
# Authenticate the current Colab user; presumably this is what lets the
# later `gsutil` cell read from the GCS bucket -- TODO confirm.
from google.colab import auth
auth.authenticate_user()

Updated property [core/project].


## Downloading the dataset from GCS

In [2]:
# Show the working directory that the download/extract cells below operate in.
!pwd

/content


In [None]:
# Copy the dataset archive from the GCS bucket into the current directory.
!gsutil cp gs://judgebook/judgebook-dataset-imgs_sheets.tar.gz .
# Unpack it in place (x = extract, z = gunzip, v = list every file, f = archive name).
!tar xzvf judgebook-dataset-imgs_sheets.tar.gz

In [9]:
# Root folder of the dataset on this runtime; every later path is built from it.
# (Presumably created by unpacking the archive above -- TODO confirm.)
judgebook_dir="/content/judgebook"
# Peek at the first lines of the label sheet shipped with the dataset.
!head {judgebook_dir}/dataset.csv

filename,ambitious,brave,cheerful,confiden,elegan,generous,loyal,sharp,shy,strong,wise,witty,compassionate,competitive,creative,effective,energetic,enthusiastic,hardworking,intelligen,kind,reliable,responsible,sympathetic,ctive,nalytical,calm,charming,curious,determined,diligen,friendly,hones,lovely,passionate,playful
35079884_1994-08-07_2012.jpg,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4478357_1945-01-12_1972.jpg,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1439517_1986-06-03_2015.jpg,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35336978_1990-11-28_2011.jpg,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.

## Installing dependencies

In [11]:
!pip install opencv-python



In [12]:
import os
import re
import PIL
import tarfile
import numpy as np
import pandas as pd
from datetime import datetime
from shutil import copy as copy_file
# `tqdm.tqdm_notebook` is deprecated and emits a TqdmDeprecationWarning on
# every call; `tqdm.notebook.tqdm` is the supported drop-in replacement.
from tqdm.notebook import tqdm
from IPython.display import Image, display
from keras.models import Model, load_model
from keras.layers import Dense, Flatten
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.applications.vgg19 import VGG19, preprocess_input
from tensorflow.keras.utils import load_img, img_to_array
# NOTE: this rebinds `Image` to PIL's module and shadows the
# IPython.display.Image imported above; from here on `Image` is `PIL.Image`.
from PIL import Image
import cv2

# Constants
# Flat folder that holds every face image used by the cells below.
imgs_folder_path = f"{judgebook_dir}/dataset/faces/images"

In [14]:
# Unpack the wiki_crop archive next to the images folder; the archive is
# closed automatically when the `with` block ends.
wiki_crop_archive = f'{judgebook_dir}/images/wiki_crop.tar'
faces_root = f'{judgebook_dir}/dataset/faces'

with tarfile.open(wiki_crop_archive) as tar:
  tar.extractall(path=faces_root)

## Dataset processing

In [15]:
# Move every file found under the extracted wiki_crop tree into the single
# flat folder `imgs_folder_path`, counting the moves that fail.

# os.rename fails if the destination directory is missing, so create it first.
os.makedirs(imgs_folder_path, exist_ok=True)

exp_count = 0

for _rootdir, _, _files in tqdm(os.walk(f"{judgebook_dir}/dataset/faces/wiki_crop")):
  for _file in _files:
    try:
      os.rename(os.path.join(_rootdir, _file), os.path.join(imgs_folder_path, _file))
    except OSError as _err:
      # Only filesystem errors are tolerated; anything else (including a
      # keyboard interrupt) must surface instead of being swallowed.
      if exp_count < 5:
        # Show the first few failures so that a non-zero count can be diagnosed.
        print(f"Could not move {_file}: {_err}")
      exp_count += 1
print("Exceptions encountered: {}".format(exp_count))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _rootdir, _, _files in tqdm(os.walk(f"{judgebook_dir}/dataset/faces/wiki_crop")):


0it [00:00, ?it/s]

Exceptions encountered: 0


In [16]:
# Report how many entries the flat image folder contains.
print(f'Number of images: {len(os.listdir(imgs_folder_path))}')

Number of images: 62329


In [17]:
# Build two parallel lists: every image filename and the birth-year string
# parsed from it. Files whose name has no "_"-separated date part are deleted.
list_of_files = []
list_of_birthyears = []
for _file in tqdm(os.listdir(imgs_folder_path)):
  # Example:- 23300_1962-06-19_2011.jpg --> Split: ["23300", "1962-06-19", "2011.jpg"]
  file_name = _file.split("_")
  try:
    # Year component of the person's date of birth
    birthyear = file_name[1].split("-")[0]
  except IndexError:
    # The name does not follow the <id>_<birthdate>_<year> pattern: remove the
    # image. Only this specific failure triggers the deletion -- a bare
    # `except` would also delete a file on e.g. a keyboard interrupt.
    os.remove(os.path.join(imgs_folder_path, _file))
    continue

  list_of_files.append(_file)
  list_of_birthyears.append(birthyear)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for _file in tqdm(os.listdir(imgs_folder_path)):


  0%|          | 0/62329 [00:00<?, ?it/s]

In [18]:
# Number of filenames that were parsed successfully in the previous cell.
print("Number of files: {}".format(len(list_of_files)))

Number of files: 62328


In [19]:
# Map each image filename to the birth-year string parsed from its name.
dict_images = {
  img_name: birth_year
  for img_name, birth_year in zip(list_of_files, list_of_birthyears)
}

len(dict_images)

62328

In [20]:
# First lines of a long-format listing of the image folder
# (-t/-r: sorted by modification time, reversed; -h: human-readable sizes).
!ls -ltrh {imgs_folder_path}/ | head

total 857M
-rw------- 1 3059 73504  20K Feb 21  2016 100039_1904-12-07_1982.jpg
-rw------- 1 3059 73504 8.2K Feb 21  2016 10003541_1937-09-27_1971.jpg
-rw------- 1 3059 73504 8.3K Feb 21  2016 10002702_1960-11-09_2012.jpg
-rw------- 1 3059 73504 5.7K Feb 21  2016 10002116_1971-05-31_2012.jpg
-rw------- 1 3059 73504  15K Feb 21  2016 10001965_1930-05-23_1961.jpg
-rw------- 1 3059 73504  24K Feb 21  2016 100012_1948-07-03_2008.jpg
-rw------- 1 3059 73504 8.5K Feb 21  2016 10000548_1925-04-04_1964.jpg
-rw------- 1 3059 73504  13K Feb 21  2016 10000217_1981-05-05_2009.jpg
-rw------- 1 3059 73504 2.1K Feb 21  2016 10035505_1968-09-03_1960.jpg


In [21]:
# One row per image: its filename and its (still string-typed) birth year.
df = pd.DataFrame(dict(filename=list_of_files, year=list_of_birthyears))
len(df)

62328

In [22]:
# Check if each element in the 'column_name' can be converted to numeric (integers)
is_numeric = pd.to_numeric(df['year'], errors='coerce').notnull()

# Use boolean indexing to select rows where 'is_numeric' is True
df = df[is_numeric]

In [23]:
# Display the filtered filename/year frame.
df

Unnamed: 0,filename,year
0,11332596_1970-11-05_2008.jpg,1970
1,262806_1967-10-16_2009.jpg,1967
2,2749305_1975-09-19_2009.jpg,1975
3,28866361_1989-12-13_2015.jpg,1989
4,5736781_1961-02-17_2008.jpg,1961
...,...,...
62323,4185859_1947-01-09_2006.jpg,1947
62324,921527_1987-12-31_2005.jpg,1987
62325,2559383_1905-01-25_1945.jpg,1905
62326,5492397_1981-07-26_2013.jpg,1981


In [24]:
# Cast the birth year from string to int64 so it can be joined with the
# integer 'year' column of the zodiac table. Calling astype on the frame
# returns a new frame instead of assigning into a possibly filtered one,
# which avoids SettingWithCopyWarning.
df = df.astype({'year': 'int64'})
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62328 entries, 0 to 62327
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  62328 non-null  object
 1   year      62328 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.4+ MB


In [25]:
# Load the year -> zodiac sign / traits lookup table (semicolon-separated).
zodiac_csv_path = os.path.join(judgebook_dir, "zodiac_sign.csv")
df_zodiac = pd.read_csv(zodiac_csv_path, sep=";")
df_zodiac

Unnamed: 0,year,zodiac_sign,trait_1,trait_2,trait_3
0,1900,rat,witty,creative,curious
1,1901,ox,strong,reliable,determined
2,1902,tiger,brave,competitive,charming
3,1903,rabbit,elegant,responsible,friendly
4,1904,dragon,ambitious,enthusiastic,passionate
...,...,...,...,...,...
120,2020,rat,witty,creative,curious
121,2021,ox,strong,reliable,determined
122,2022,tiger,brave,competitive,charming
123,2023,rabbit,elegant,responsible,friendly


In [26]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense array so the result can be assigned straight to DataFrame
# columns. Newer scikit-learn releases name this option `sparse_output` and
# reject the old `sparse` keyword, while older releases only know `sparse`.
# Try the new name first and fall back to the old one.
try:
  ohe = OneHotEncoder(sparse_output = False)
except TypeError:
  ohe = OneHotEncoder(sparse = False)

# Learn one category list per trait column.
ohe.fit(df_zodiac[['trait_1', 'trait_2', 'trait_3']])

print(f"The categories detected by the OneHotEncoder are {ohe.categories_}")

The categories detected by the OneHotEncoder are [array(['ambitious', 'brave', 'cheerful', 'confident', 'elegant',
       'generous', 'loyal', 'sharp', 'shy', 'strong', 'wise', 'witty'],
      dtype=object), array(['compassionate', 'competitive', 'creative', 'effective',
       'energetic', 'enthusiastic', 'hardworking', 'intelligent', 'kind',
       'reliable', 'responsible', 'sympathetic'], dtype=object), array(['active', 'analytical', 'calm', 'charming', 'curious',
       'determined', 'diligent', 'friendly', 'honest', 'lovely',
       'passionate', 'playful'], dtype=object)]




In [27]:
# Display the generated names
print(f"The column names for the encoded values are {ohe.get_feature_names_out()}")

# Transform the current "['trait_1', 'trait_2', 'trait_3']" columns
df_zodiac[ohe.get_feature_names_out()] = ohe.transform(df_zodiac[['trait_1', 'trait_2', 'trait_3']])

# Drop the columns "['trait_1', 'trait_2', 'trait_3']" which have been encoded
df_zodiac.drop(columns = ['trait_1', 'trait_2', 'trait_3'], inplace = True)


# Show the dataset

df_zodiac.head(3)

The column names for the encoded values are ['trait_1_ambitious' 'trait_1_brave' 'trait_1_cheerful'
 'trait_1_confident' 'trait_1_elegant' 'trait_1_generous' 'trait_1_loyal'
 'trait_1_sharp' 'trait_1_shy' 'trait_1_strong' 'trait_1_wise'
 'trait_1_witty' 'trait_2_compassionate' 'trait_2_competitive'
 'trait_2_creative' 'trait_2_effective' 'trait_2_energetic'
 'trait_2_enthusiastic' 'trait_2_hardworking' 'trait_2_intelligent'
 'trait_2_kind' 'trait_2_reliable' 'trait_2_responsible'
 'trait_2_sympathetic' 'trait_3_active' 'trait_3_analytical'
 'trait_3_calm' 'trait_3_charming' 'trait_3_curious' 'trait_3_determined'
 'trait_3_diligent' 'trait_3_friendly' 'trait_3_honest' 'trait_3_lovely'
 'trait_3_passionate' 'trait_3_playful']


Unnamed: 0,year,zodiac_sign,trait_1_ambitious,trait_1_brave,trait_1_cheerful,trait_1_confident,trait_1_elegant,trait_1_generous,trait_1_loyal,trait_1_sharp,...,trait_3_calm,trait_3_charming,trait_3_curious,trait_3_determined,trait_3_diligent,trait_3_friendly,trait_3_honest,trait_3_lovely,trait_3_passionate,trait_3_playful
0,1900,rat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1901,ox,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1902,tiger,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Strip the encoder's "trait_<n>_" prefixes so each column is named after the
# bare trait (same prefix order as before: 3, 1, 2).
for _prefix in ('trait_3_', 'trait_1_', 'trait_2_'):
  df_zodiac.columns = df_zodiac.columns.str.removeprefix(_prefix)

# If the same trait occurred in more than one trait column, stripping the
# prefixes would yield duplicate column names and ambiguous labels downstream.
assert df_zodiac.columns.is_unique, "duplicate trait names after stripping prefixes"

# No argument: the stray '' previously passed as `verbose` was falsy and left
# the default listing in effect, so the output is unchanged.
df_zodiac.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125 entries, 0 to 124
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   year           125 non-null    int64  
 1   zodiac_sign    125 non-null    object 
 2   ambitious      125 non-null    float64
 3   brave          125 non-null    float64
 4   cheerful       125 non-null    float64
 5   confident      125 non-null    float64
 6   elegant        125 non-null    float64
 7   generous       125 non-null    float64
 8   loyal          125 non-null    float64
 9   sharp          125 non-null    float64
 10  shy            125 non-null    float64
 11  strong         125 non-null    float64
 12  wise           125 non-null    float64
 13  witty          125 non-null    float64
 14  compassionate  125 non-null    float64
 15  competitive    125 non-null    float64
 16  creative       125 non-null    float64
 17  effective      125 non-null    float64
 18  energetic 

In [29]:
# Attach the zodiac sign and one-hot trait columns to every image row by
# birth year. how='left' keeps every row of df; a year with no entry in
# df_zodiac gets NaN in the zodiac columns.
merged_df = df.merge(df_zodiac, how='left', on='year')
merged_df.head()

Unnamed: 0,filename,year,zodiac_sign,ambitious,brave,cheerful,confident,elegant,generous,loyal,...,calm,charming,curious,determined,diligent,friendly,honest,lovely,passionate,playful
0,11332596_1970-11-05_2008.jpg,1970,dog,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,262806_1967-10-16_2009.jpg,1967,goat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2749305_1975-09-19_2009.jpg,1975,rabbit,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,28866361_1989-12-13_2015.jpg,1989,snake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5736781_1961-02-17_2008.jpg,1961,ox,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Re-index the merged table by image filename so rows can be looked up with
# df.loc[<filename>]. Note that this rebinds `df` to the merged frame.
df = merged_df.set_index(keys='filename')
df

Unnamed: 0_level_0,year,zodiac_sign,ambitious,brave,cheerful,confident,elegant,generous,loyal,sharp,...,calm,charming,curious,determined,diligent,friendly,honest,lovely,passionate,playful
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11332596_1970-11-05_2008.jpg,1970,dog,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
262806_1967-10-16_2009.jpg,1967,goat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2749305_1975-09-19_2009.jpg,1975,rabbit,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
28866361_1989-12-13_2015.jpg,1989,snake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5736781_1961-02-17_2008.jpg,1961,ox,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4185859_1947-01-09_2006.jpg,1947,pig,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
921527_1987-12-31_2005.jpg,1987,rabbit,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2559383_1905-01-25_1945.jpg,1905,snake,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5492397_1981-07-26_2013.jpg,1981,rooster,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [31]:
# Persist the labelled table (filename index included) inside the dataset folder.
final_csv_path = f"{judgebook_dir}/final_df.csv"
df.to_csv(final_csv_path)

In [33]:
# Look up one image by its filename index and view its row as a NumPy array.
sample_labels = df.loc['40140357_2001-03-31_2014.jpg']
sample_labels.to_numpy()

array([2001, 'snake', 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
      dtype=object)

In [34]:
# Same directory as `imgs_folder_path` ("<judgebook_dir>/dataset/faces/images");
# reuse that constant instead of spelling the path out a second time.
path_to_pics = imgs_folder_path

In [35]:
# Open one sample picture, resize it to 256x256 and check the array shape.
path_to_pic = os.path.join(path_to_pics, "35079884_1994-08-07_2012.jpg")
# The context manager closes the underlying file handle; the resized copy is
# a separate image object that is still usable after the `with` block.
with Image.open(path_to_pic) as _source_image:
  image = _source_image.resize((256,256))
np.array(image).shape

(256, 256, 3)