# Take the imdb.mat file and reshape its data into a DataFrame for age and gender classification, to be fed into SSD

In [1]:
from data_generator.dataset_generator import *
from datetime import datetime, timedelta

## Reading dataset

In [2]:
# Load the contents of the IMDB .mat file through the project helper.
mat_path = '../../ssd_keras/dataset/csv/imdb_csv/imdb.mat'
data = read_data_from_mat(mat_path)

In [3]:
# Build the working DataFrame from the loaded data.
# NOTE(review): 'imdb' presumably selects the top-level entry of the loaded
# dictionary — confirm against get_dataframe_from_dictionary.
dataset_name = 'imdb'
df = get_dataframe_from_dictionary(data, dataset_name)

In [4]:
# Preview the first five rows of the raw frame (head() defaults to n=5).
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


## Setting xmin, xmax, ymin, ymax from face_location (the face_location column is removed later)

In [5]:
# Adds the xmin/xmax/ymin/ymax columns taken from face_location; the sample
# output below shows face_location is ordered [xmin, ymin, xmax, ymax].
# face_location itself is still present after this call and is only dropped
# in the "Dropping unnecessary columns" section.
df = set_face_locations(df)

In [6]:
# Spot-check five random rows to verify the new bounding-box columns.
df.sample(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
372272,723324,2014,[12/nm1460812_rm4183152128_1980-5-23_2014.jpg],1.0,[Lane Garrison],"[[526.336, 264.192, 1114.112, 851.968]]",3.860799,,11386,526.336,1114.112,264.192,851.968
344522,720072,2015,[68/nm0553468_rm3404657152_1971-6-28_2015.jpg],1.0,[Benito Martinez],"[[296.4746592027799, 79.79324245407463, 472.75...",4.228124,,1908,296.474659,472.759455,79.793242,256.078038
152622,716184,2008,[80/nm0004980_rm2024770304_1960-11-4_2008.jpg],0.0,[Kathy Griffin],"[[1, 1, 240, 360]]",-inf,,10659,1.0,240.0,1.0,360.0
183692,721450,2004,[85/nm0103785_rm3650456576_1975-4-6_2004.jpg],1.0,[Zach Braff],"[[943.061692413571, 152.61019078617136, 1279.7...",3.93027,,20170,943.061692,1279.778622,152.610191,489.32712
144368,720728,2004,[78/nm0004778_rm61312000_1973-4-14_2004.jpg],1.0,[Adrien Brody],"[[870.4, 362.496, 1015.808, 507.904]]",2.537934,2.219809,211,870.4,1015.808,362.496,507.904


## Fixing string columns: full_path and name

In [7]:
# Rewrites full_path from a one-element container (e.g. [01/nm...jpg]) into a
# plain string prefixed with 'imdb/' (e.g. imdb/01/nm...jpg), as the preview
# below shows.
# NOTE(review): re-running this cell on an already-fixed frame may not be
# idempotent — confirm against fix_full_path_no_thread.
df = fix_full_path_no_thread(df, 'imdb')

In [8]:
# Unwrap the one-element `name` container (e.g. [Fred Astaire]) into a plain
# string. Values that are already strings are left untouched, so re-running
# this cell no longer truncates every name to its first character
# ('Fred Astaire'[0] == 'F').
df.loc[:, 'name'] = df['name'].apply(
    lambda item: item if isinstance(item, str) else item[0]
)

In [9]:
# Confirm that full_path and name are now plain strings.
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168


## Getting age

In [10]:
# Convert every serial `dob` value into a date with the project helper
# (e.g. 693726 becomes 1899-05-10 in the preview below).
# NOTE(review): dob looks like a MATLAB datenum — confirm in set_dob_datetime.
df['dob_date'] = df['dob'].apply(set_dob_datetime)

In [11]:
# Age is approximated at year granularity: the year the photo was taken minus
# the birth year (photo_taken only carries a year, not a full date).
birth_year = df['dob_date'].dt.year
df['age'] = df['photo_taken'] - birth_year

In [12]:
# Preview the first five rows, now including dob_date and age.
df.head(5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax,dob_date,age
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696,1899-05-10,69.0
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76,1899-05-10,71.0
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572,1899-05-10,69.0
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006,1899-05-10,69.0
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168,1899-05-10,69.0


## Dropping unnecessary columns and NaN data

In [13]:
# face_location and dob have been replaced by xmin/xmax/ymin/ymax and
# dob_date, so they are no longer needed.
# errors='ignore' plus rebinding (instead of inplace=True) keeps the cell
# idempotent: re-running it after the columns are gone no longer raises
# KeyError.
df = df.drop(['face_location', 'dob'], axis=1, errors='ignore')

In [14]:
# Discard every row that lacks one of the values the labels depend on.
required_columns = ['age', 'dob_date', 'gender']
df.dropna(axis=0, subset=required_columns, inplace=True)

In [15]:
# Count the remaining missing values per column.
df.isnull().sum(axis=0)

photo_taken               0
full_path                 0
gender                    0
name                      0
face_score                0
second_face_score    242679
celeb_id                  0
xmin                      0
xmax                      0
ymin                      0
ymax                      0
dob_date                  0
age                       0
dtype: int64

## Setting the feature (column) order

In [16]:
# Final column layout: identity, image path, detection scores, bounding box,
# dates, and finally the age and gender values.
column_order = [
    'celeb_id', 'name', 'full_path',
    'face_score', 'second_face_score',
    'xmin', 'xmax', 'ymin', 'ymax',
    'dob_date', 'photo_taken', 'age', 'gender',
]
df = df[column_order]

## Saving complete clean dataset

In [17]:
# Build the age- and gender-labelled frames, then shuffle the rows and
# renumber the index from 0.
# The shuffle is seeded so that Restart & Run All writes the same row order to
# the CSV files every time; previously sample(frac=1) was unseeded and the
# saved datasets differed between runs.
# NOTE(review): if the set_labels_* helpers modify `df` in place, the second
# call and the later df.to_csv would see the first call's changes — confirm.
SHUFFLE_SEED = 42
df_age = (
    set_labels_for_age(df)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
df_gender = (
    set_labels_for_gender(df)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows of the shuffled age-labelled frame.
df_age.head()

In [None]:
# Preview the first five rows of the shuffled gender-labelled frame.
df_gender.head()

In [None]:
# Write each labelled frame to its own CSV file (the index is written too,
# as with a plain to_csv call).
for frame, csv_path in (
    (df_age, '../../ssd_keras/dataset/csv/imdb_csv/imdb_age.csv'),
    (df_gender, '../../ssd_keras/dataset/csv/imdb_csv/imdb_gender.csv'),
):
    frame.to_csv(csv_path)

In [None]:
# Persist the cleaned frame `df` itself alongside the two labelled variants.
df.to_csv(path_or_buf='../../ssd_keras/dataset/csv/imdb_csv/imdb.csv')