# Takes the imdb.mat file and writes it to a CSV file

In [1]:
from data_generator.dataset_generator import *
from datetime import datetime, timedelta

## Reading dataset

In [2]:
# Read the raw contents of the imdb.mat metadata file.
imdb_mat_path = '../../ssd_keras/dataset/csv/imdb_csv/imdb.mat'
data = read_data_from_mat(imdb_mat_path)

In [3]:
# Turn the loaded .mat contents into a DataFrame with one row per image.
# NOTE(review): 'imdb' presumably names the top-level entry inside the loaded
# data — confirm against get_dataframe_from_dictionary.
df = get_dataframe_from_dictionary(data, 'imdb')

In [4]:
# Preview the raw frame (head() shows five rows by default).
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


## Setting xmin, xmax, ymin, ymax from face_location (the face_location column itself is dropped later)

In [5]:
# Add xmin, xmax, ymin and ymax columns derived from face_location.
# In the displayed rows face_location reads [xmin, ymin, xmax, ymax]; the
# face_location column is still present after this call.
df = set_face_locations(df)

In [6]:
# Spot-check a few random rows; the fixed seed keeps the displayed sample
# reproducible across kernel restarts.
df.sample(5, random_state=0)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
385860,718836,1978,[41/nm0171041_rm377014272_1968-2-8_1978.jpg],1.0,[Gary Coleman],"[[774.9252945580075, 1216.025462876869, 1268.1...",2.074168,,6654,774.925295,1268.162984,1216.025463,1709.263152
369831,722705,2014,[70/nm1360270_rm3117479936_1978-9-12_2014.jpg],1.0,[Ben McKenzie],"[[1266.363421534013, 212.7672369223355, 1422.3...",1.908203,,1876,1266.363422,1422.354849,212.767237,368.758665
235745,724387,2014,[21/nm1813221_rm693423360_1983-4-21_2014.jpg],0.0,[Gugu Mbatha-Raw],"[[454.51807090461904, 534.3657304760225, 692.0...",3.430632,2.106942,7193,454.518071,692.01305,534.36573,771.860709
331502,717962,2006,[19/nm0151419_rm1943705856_1965-9-17_2006.jpg],1.0,[Kyle Chandler],"[[168.6339224768461, 69.75396807870135, 257.08...",4.061372,,11288,168.633922,257.087881,69.753968,158.207927
71928,713783,2002,[98/nm0000598_rm1386715392_1954-4-9_2002.jpg],1.0,[Dennis Quaid],"[[273.5566489955124, 68.8916622488781, 341.108...",0.951778,,4997,273.556649,341.108311,68.891662,136.443324


## Fixing strings --> full path and name

In [7]:
# Normalise full_path to a plain string. In the displayed rows the value goes
# from a one-element container such as [01/<file>.jpg] to 'imdb/01/<file>.jpg',
# i.e. the second argument shows up as the leading directory.
df = fix_full_path_no_thread(df, 'imdb')

In [8]:
# Each 'name' cell appears to hold a one-element container; keep only the
# string stored inside it.
df.loc[:, 'name'] = df.loc[:, 'name'].map(lambda names: names[0])

In [9]:
# Check that full_path and name are now plain strings (five rows by default).
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168


## Getting age

In [10]:
# Convert every numeric 'dob' value into a calendar date
# (presumably MATLAB serial date numbers — confirm in set_dob_datetime).
df['dob_date'] = df['dob'].apply(set_dob_datetime)

In [11]:
# Age is the photo year minus the birth year; month and day are not considered.
birth_year = df['dob_date'].dt.year
df['age'] = df['photo_taken'].sub(birth_year)

In [16]:
# Preview the frame with the new dob_date and age columns.
# NOTE(review): the saved output already shows the dropped/reordered column
# set, so this cell appears to have last been run after the later drop and
# reorder cells — restart the kernel and run all cells to refresh it.
df.head()

Unnamed: 0,celeb_id,name,full_path,face_score,second_face_score,xmin,xmax,ymin,ymax,dob_date,photo_taken,age,gender
0,6488,Fred Astaire,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.459693,1.118973,1072.926,1214.784,161.838,303.696,1899-05-10,1968,69.0,1.0
1,6488,Fred Astaire,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,2.543198,1.852008,477.184,622.592,100.352,245.76,1899-05-10,1970,71.0,1.0
2,6488,Fred Astaire,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,3.455579,2.98566,114.969643,451.686572,114.969643,451.686572,1899-05-10,1968,69.0,1.0
3,6488,Fred Astaire,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.872117,,622.885506,844.339008,424.217504,645.671006,1899-05-10,1968,69.0,1.0
4,6488,Fred Astaire,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.158766,,1013.859002,1201.586128,233.882042,421.609168,1899-05-10,1968,69.0,1.0


## Dropping unnecessary columns

In [13]:
# Remove the raw columns now that xmin/xmax/ymin/ymax and dob_date exist.
# Reassigning instead of using inplace=True, and ignoring columns that are
# already gone, keeps this cell safe to re-run without a KeyError.
df = df.drop(columns=['face_location', 'dob'], errors='ignore')

## Set features order

In [17]:
# Final column layout: identity, image path, face scores, bounding box,
# then the date, age and gender fields.
column_order = [
    'celeb_id', 'name', 'full_path',
    'face_score', 'second_face_score',
    'xmin', 'xmax', 'ymin', 'ymax',
    'dob_date', 'photo_taken', 'age', 'gender',
]
df = df[column_order]

In [18]:
# Confirm the final column order before the frame is written to disk.
df.head()

Unnamed: 0,celeb_id,name,full_path,face_score,second_face_score,xmin,xmax,ymin,ymax,dob_date,photo_taken,age,gender
0,6488,Fred Astaire,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.459693,1.118973,1072.926,1214.784,161.838,303.696,1899-05-10,1968,69.0,1.0
1,6488,Fred Astaire,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,2.543198,1.852008,477.184,622.592,100.352,245.76,1899-05-10,1970,71.0,1.0
2,6488,Fred Astaire,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,3.455579,2.98566,114.969643,451.686572,114.969643,451.686572,1899-05-10,1968,69.0,1.0
3,6488,Fred Astaire,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.872117,,622.885506,844.339008,424.217504,645.671006,1899-05-10,1968,69.0,1.0
4,6488,Fred Astaire,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.158766,,1013.859002,1201.586128,233.882042,421.609168,1899-05-10,1968,69.0,1.0


## Saving complete clean dataset

In [19]:
# Write the cleaned frame next to the source .mat file, using the same relative
# dataset directory as the read step instead of a user-specific absolute path.
# NOTE(review): assumes '../../ssd_keras' resolves to the same checkout the old
# absolute path pointed at — confirm before relying on the output location.
# The default index is still written as the first CSV column, as before.
df.to_csv('../../ssd_keras/dataset/csv/imdb_csv/imdb.csv')