# Take the imdb.mat file and reshape its data into DataFrames of age and gender labels to be fed into SSD

In [1]:
from data_generator.dataset_generator import *
from datetime import datetime, timedelta

## Reading dataset

In [2]:
# Read the contents of the imdb.mat file through the project helper.
imdb_mat_path = '../../ssd_keras/dataset/csv/imdb_csv/imdb.mat'
data = read_data_from_mat(imdb_mat_path)

In [3]:
# Build a DataFrame from the loaded .mat contents.
# NOTE(review): 'imdb' is presumably the key of the top-level struct inside
# the .mat file — confirm against get_dataframe_from_dictionary.
df = get_dataframe_from_dictionary(data, 'imdb')

In [4]:
# Preview the first five rows of the freshly built frame.
df.head(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


## Setting xmin, xmax, ymin, ymax from the face_location column (face_location itself is dropped later)

In [5]:
# Add xmin/xmax/ymin/ymax columns derived from face_location.
# Judging by the sample output, face_location is laid out as
# [xmin, ymin, xmax, ymax]; the face_location column itself is still
# present after this call and is only dropped further down.
df = set_face_locations(df)

In [6]:
# Spot-check five random rows (unseeded, so the rows shown differ on every run).
df.sample(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
124352,712775,1996,[91/nm0001691_rm3735534080_1951-7-6_1996.jpg],1.0,[Geoffrey Rush],"[[1, 1, 1352, 2048]]",-inf,,6765,1.0,1352.0,1.0,2048.0
132451,720048,2006,[64/nm0001864_rm2942212352_1971-6-4_2006.jpg],1.0,[Noah Wyle],"[[224.8, 218.4, 281.6, 275.2]]",1.700289,1.650477,14686,224.8,281.6,218.4,275.2
203776,722957,2010,[72/nm0702572_rm252884992_1979-5-22_2010.jpg],0.0,[Maggie Q],"[[183.7205698523275, 43.377199632209745, 237.9...",0.797714,,12387,183.72057,237.973541,43.3772,97.630171
185882,722716,2010,[01/nm1107001_rm3959849216_1978-9-23_2010.jpg],1.0,[Anthony Mackie],"[[98.88004986325024, 66.13003324216682, 245.62...",3.998843,,1321,98.88005,245.625125,66.130033,212.875108
36504,719647,2004,[35/nm0000235_rm2711203840_1970-4-29_2004.jpg],0.0,[Uma Thurman],"[[640.8292765712268, 321.4386382856134, 1117.8...",2.816085,,19486,640.829277,1117.867234,321.438638,798.476596


## Fixing strings: unwrapping full_path and name into plain strings

In [7]:
# Turn full_path into a plain string. In the outputs the value changes from
# "[01/nm...jpg]" to "imdb/01/nm...jpg", i.e. it is unwrapped and prefixed
# with the dataset name passed as the second argument.
df = fix_full_path_no_thread(df, 'imdb')

In [8]:
# Unwrap each name from its single-element container (rendered as
# "[Fred Astaire]" in the first preview) into a plain string.
# Values that are already strings are left untouched, so re-running this cell
# does not truncate every name to its first character.
df['name'] = df['name'].apply(
    lambda item: item if isinstance(item, str) else item[0]
)

In [9]:
# Confirm that full_path and name are now plain strings.
df.head(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168


## Getting age

In [10]:
# Convert every raw `dob` value to a date by handing the helper straight to
# apply. In the outputs 693726 becomes 1899-05-10, so `dob` is presumably a
# MATLAB serial date number (TODO confirm in set_dob_datetime).
df['dob_date'] = df['dob'].apply(set_dob_datetime)

In [11]:
# Age when the photo was taken, at year granularity only: photo_taken is a
# bare year and only the year of dob_date is used, so the value can be off by
# one compared with the true age on the day of the photo.
birth_year = df['dob_date'].dt.year
df['age'] = df['photo_taken'] - birth_year

In [12]:
# Preview the new dob_date and age columns.
df.head(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax,dob_date,age
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696,1899-05-10,69.0
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76,1899-05-10,71.0
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572,1899-05-10,69.0
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006,1899-05-10,69.0
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168,1899-05-10,69.0


## Dropping unnecessary columns and NaN data

In [13]:
# Remove the raw columns that have been replaced by derived ones:
# face_location -> xmin/xmax/ymin/ymax, dob -> dob_date.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# the cell idempotent: re-running it no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['face_location', 'dob'], errors='ignore')

In [14]:
# Keep only rows with a known age, birth date and gender.
# second_face_score is deliberately not part of the subset, so its NaN values
# remain in the frame.
# NOTE(review): rows whose face_score is -inf (one appears in the sample
# output, with a box spanning the whole image) are not removed here — confirm
# that this is intended.
required_columns = ['age', 'dob_date', 'gender']
df = df.dropna(axis=0, subset=required_columns)

In [15]:
# Count the remaining missing values per column.
df.isna().sum()

photo_taken               0
full_path                 0
gender                    0
name                      0
face_score                0
second_face_score    242679
celeb_id                  0
xmin                      0
xmax                      0
ymin                      0
ymax                      0
dob_date                  0
age                       0
dtype: int64

## Set features order

In [16]:
# Reorder the columns: identity and path first, then scores, bounding box,
# dates, and finally the age and gender targets.
column_order = [
    'celeb_id', 'name', 'full_path',
    'face_score', 'second_face_score',
    'xmin', 'xmax', 'ymin', 'ymax',
    'dob_date', 'photo_taken', 'age', 'gender',
]
df = df[column_order]

In [17]:
# Preview the frame with its final column order.
df.head(n=5)

Unnamed: 0,celeb_id,name,full_path,face_score,second_face_score,xmin,xmax,ymin,ymax,dob_date,photo_taken,age,gender
0,6488,Fred Astaire,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.459693,1.118973,1072.926,1214.784,161.838,303.696,1899-05-10,1968,69.0,1.0
1,6488,Fred Astaire,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,2.543198,1.852008,477.184,622.592,100.352,245.76,1899-05-10,1970,71.0,1.0
2,6488,Fred Astaire,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,3.455579,2.98566,114.969643,451.686572,114.969643,451.686572,1899-05-10,1968,69.0,1.0
3,6488,Fred Astaire,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.872117,,622.885506,844.339008,424.217504,645.671006,1899-05-10,1968,69.0,1.0
4,6488,Fred Astaire,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.158766,,1013.859002,1201.586128,233.882042,421.609168,1899-05-10,1968,69.0,1.0


## Building shuffled age/gender label sets and saving the complete clean dataset

In [18]:
# Build one label table per task (the previews show the columns image_name,
# xmin, xmax, ymin, ymax, class_id), then shuffle all rows and renumber the
# index from zero.
# A fixed seed makes the shuffle — and therefore the CSV files written below —
# reproducible across kernel restarts.
SHUFFLE_SEED = 42
df_age = (
    set_labels_for_age(df)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
df_gender = (
    set_labels_for_gender(df)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

In [19]:
# Preview the age label set (class_id holds the age).
df_age.head(n=5)

Unnamed: 0,image_name,xmin,xmax,ymin,ymax,class_id
0,imdb/56/nm3041056_rm2519500800_1979-3-9_2009.jpg,215.67,328.64,101.91,214.88,30.0
1,imdb/36/nm0000136_rm2024969984_1963-6-9_2008.jpg,90.528434,179.968869,60.533623,149.974057,45.0
2,imdb/94/nm1475594_rm1668853760_1980-4-26_2011.jpg,1.0,1362.0,1.0,2048.0,31.0
3,imdb/67/nm1706767_rm3823278336_1983-12-20_2007...,334.012418,383.36908,78.433031,127.789693,24.0
4,imdb/05/nm0005305_rm4022917376_1976-10-31_2010...,256.8,371.2,77.6,192.0,34.0


In [20]:
# Preview the gender label set (class_id holds the gender value).
df_gender.head(n=5)

Unnamed: 0,image_name,xmin,xmax,ymin,ymax,class_id
0,imdb/23/nm1724323_rm2806575104_1979-7-16_2015.jpg,1.0,2560.0,1.0,1728.0,0.0
1,imdb/17/nm0057217_rm1699188224_1980-2-24_2009.jpg,289.330097,450.743902,109.032537,270.446341,1.0
2,imdb/61/nm0919361_rm668377856_1943-5-27_1983.jpg,67.584,212.992,198.656,344.064,1.0
3,imdb/49/nm0080049_rm1792904192_1973-12-11_2006...,1125.88393,1439.914786,283.006983,597.037838,1.0
4,imdb/86/nm0001086_rm4002413568_1965-1-27_2015.jpg,657.408,802.816,247.808,393.216,1.0


In [21]:
# Write both label sets next to the source .mat file.
# to_csv is called with its defaults, so the row index is written as the
# first, unnamed column of each file.
for label_frame, csv_path in (
    (df_age, '../../ssd_keras/dataset/csv/imdb_csv/imdb_age.csv'),
    (df_gender, '../../ssd_keras/dataset/csv/imdb_csv/imdb_gender.csv'),
):
    label_frame.to_csv(csv_path)

In [22]:
# Persist the full cleaned frame (all columns); the row index is written too.
df.to_csv(path_or_buf='../../ssd_keras/dataset/csv/imdb_csv/imdb.csv')