# Load the imdb.mat file and reshape its data into a DataFrame for the age and gender classifiers to be fed into SSD

In [18]:
from data_generator.dataset_generator import *
from datetime import datetime, timedelta

## Reading dataset

In [2]:
# Load the raw contents of the IMDB metadata file (imdb.mat) with the project
# helper read_data_from_mat. The path is relative to the notebook's working
# directory.
# NOTE(review): the result is presumably a dict, since the next cell hands it
# to get_dataframe_from_dictionary — confirm against the helper.
data = read_data_from_mat('../../ssd_keras/dataset/csv/imdb_csv/imdb.mat')

In [3]:
# Build the working DataFrame from the loaded .mat data.
# NOTE(review): 'imdb' presumably selects the dataset / top-level key inside
# `data` — confirm against get_dataframe_from_dictionary.
df = get_dataframe_from_dictionary(data, 'imdb')

In [4]:
# Preview the first five rows of the freshly built frame.
df.head(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


## Setting xmin, xmax, ymin, ymax from the face_location column (the column itself is dropped later)

In [5]:
# Add the bounding-box columns to the frame. Judging by the sample output
# below, this yields xmin, xmax, ymin and ymax, while face_location is still
# present afterwards.
# NOTE(review): whether the helper mutates its argument or returns a new
# frame is not visible here — confirm.
df = set_face_locations(df)

In [6]:
# Spot-check five random rows (unseeded, so the rows differ between runs).
df.sample(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
437094,727257,2011,[61/nm0092961_rm872328192_1991-2-28_2011.jpg],0.0,[Sarah Bolger],"[[1, 1, 426, 640]]",-inf,,17189,1.0,426.0,1.0,640.0
294884,718984,2012,[21/nm0836121_rm557233152_1968-7-5_2012.jpg],1.0,[Michael Stuhlbarg],"[[690.176, 133.12, 983.04, 425.984]]",1.040922,0.846652,13727,690.176,983.04,133.12,425.984
119556,720269,2013,[05/nm0001605_rm3385501184_1972-1-11_2013.jpg],0.0,[Amanda Peet],"[[1292.9964843675587, 183.58763061418796, 1472...",4.538516,3.065267,760,1292.996484,1472.488115,183.587631,363.079261
139144,705263,2012,[62/nm0004462_rm2042410240_1930-12-11_2012.jpg],1.0,[Jean-Louis Trintignant],"[[405.4694013648621, 486.15368163783455, 1129....",4.442067,,8651,405.469401,1129.579924,486.153682,1210.264204
107112,713359,2009,[54/nm0001354_rm1924827904_1953-2-9_2009.jpg],1.0,[Ciarán Hinds],"[[283.85424128182973, 54.004295240343076, 362....",1.770514,,3750,283.854241,362.455684,54.004295,132.605738


## Fixing strings --> full path and name

In [7]:
# Normalise the full_path column. Per the preview below, values go from a
# one-element list such as [01/...jpg] to a plain string such as
# imdb/01/...jpg.
# NOTE(review): presumably 'imdb' is the prefix that gets prepended — confirm
# against fix_full_path_no_thread.
df = fix_full_path_no_thread(df, 'imdb')

In [8]:
# Unwrap each name from its one-element container (e.g. [Fred Astaire] ->
# Fred Astaire). Values that are already plain strings are left untouched, so
# re-running this cell does not truncate names to their first character.
df.loc[:, 'name'] = df.loc[:, 'name'].apply(
    lambda item: item if isinstance(item, str) else item[0]
)

In [9]:
# Check the cleaned-up string columns on the first five rows.
df.head(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168


## Getting age

In [10]:
# Derive a calendar birth date from the numeric `dob` value by running every
# entry through the project helper set_dob_datetime.
df['dob_date'] = df['dob'].apply(set_dob_datetime)

In [11]:
# Age at photo time, approximated as a plain difference of calendar years
# (month and day of birth are not taken into account).
df['age'] = df['photo_taken'].sub(df['dob_date'].dt.year)

In [12]:
# Preview the new dob_date and age columns on the first five rows.
df.head(n=5)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax,dob_date,age
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696,1899-05-10,69.0
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76,1899-05-10,71.0
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572,1899-05-10,69.0
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006,1899-05-10,69.0
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168,1899-05-10,69.0


## Dropping unnecessary columns and NaN data

In [13]:
# Drop the columns that have been superseded: face_location (replaced by
# xmin/xmax/ymin/ymax) and dob (replaced by dob_date). Rebinding instead of
# mutating in place, together with errors='ignore', keeps the cell safe to
# re-run after the columns are already gone.
df = df.drop(columns=['face_location', 'dob'], errors='ignore')

In [14]:
# Keep only the rows where age, birth date and gender are all known.
# The result is rebound to `df` rather than mutated with inplace=True, so the
# cell produces its output purely from its input.
df = df.dropna(subset=['age', 'dob_date', 'gender'], axis=0)

In [19]:
# Count the remaining missing values per column.
df.isna().sum()

photo_taken               0
full_path                 0
gender                    0
name                      0
face_score                0
second_face_score    242679
celeb_id                  0
xmin                      0
xmax                      0
ymin                      0
ymax                      0
dob_date                  0
age                       0
dtype: int64

## Set features order

In [None]:
# Select and reorder the columns: identity, image path, detection scores,
# bounding box, then the date/age fields, with gender last.
df = df.loc[:, [
    'celeb_id', 'name', 'full_path',
    'face_score', 'second_face_score',
    'xmin', 'xmax', 'ymin', 'ymax',
    'dob_date', 'photo_taken', 'age', 'gender',
]]

## Saving complete clean dataset

In [20]:
# Label the cleaned frame once per task (age, gender), shuffle the rows and
# renumber the index from zero.
# This cell needs `df` from the cells above: run the notebook top to bottom,
# otherwise a fresh kernel raises NameError here.
# A fixed seed makes the shuffle — and therefore the saved CSVs — reproducible.
SHUFFLE_SEED = 42
df_age = (
    set_labels_for_age(df)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
df_gender = (
    set_labels_for_gender(df)
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

NameError: name 'df' is not defined

In [None]:
# Preview the shuffled age-labelled frame.
df_age.head(n=5)

In [None]:
# Preview the shuffled gender-labelled frame.
df_gender.head(n=5)

In [None]:
# Persist the labelled, shuffled frames in the same directory as the source
# .mat file. The default to_csv settings are kept, so the row index is written
# as the first (unnamed) column.
df_age.to_csv(path_or_buf='../../ssd_keras/dataset/csv/imdb_csv/imdb_age.csv')
df_gender.to_csv(path_or_buf='../../ssd_keras/dataset/csv/imdb_csv/imdb_gender.csv')

In [None]:
# Persist the `df` frame itself as well. Default to_csv settings are kept, so
# the row index is written as the first (unnamed) column.
df.to_csv(path_or_buf='../../ssd_keras/dataset/csv/imdb_csv/imdb.csv')