# Takes the imdb.mat file and exports it to a CSV file

In [1]:
from data_generator.dataset_generator import *
from datetime import datetime, timedelta

## Reading dataset

In [2]:
# Load the raw IMDB metadata from the MATLAB .mat file.
imdb_mat_path = '/home/nicoli/github/ssd_keras/dataset/csv/imdb_csv/imdb.mat'
data = read_data_from_mat(imdb_mat_path)

In [3]:
# Convert the loaded .mat contents into a DataFrame (one row per image, see preview below).
# NOTE(review): 'imdb' presumably names the dataset/struct to extract — confirm against
# get_dataframe_from_dictionary.
df = get_dataframe_from_dictionary(data, 'imdb')

In [4]:
# Preview the first five raw rows (head() defaults to n=5).
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id
0,693726,1968,[01/nm0000001_rm124825600_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488
1,693726,1970,[01/nm0000001_rm3343756032_1899-5-10_1970.jpg],1.0,[Fred Astaire],"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488
2,693726,1968,[01/nm0000001_rm577153792_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488
3,693726,1968,[01/nm0000001_rm946909184_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488
4,693726,1968,[01/nm0000001_rm980463616_1899-5-10_1968.jpg],1.0,[Fred Astaire],"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488


## Setting xmin, xmax, ymin, ymax from face_location (the face_location column itself is dropped later)

In [5]:
# Derive xmin/xmax/ymin/ymax columns from face_location. Per the sample output below,
# face_location holds [[xmin, ymin, xmax, ymax]] and is still present after this step.
df = set_face_locations(df)

In [6]:
# Spot-check five random rows. The seed is fixed so the same rows are displayed on
# every run (keeps the output reproducible under Restart & Run All).
df.sample(5, random_state=42)

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
366203,721871,2003,[99/nm0268199_rm809080832_1976-5-31_2003.jpg],1.0,[Colin Farrell],"[[110.71297244015217, 110.71297244015217, 307....",3.562032,,3914,110.712972,307.280323,110.712972,307.280323
129243,720145,2002,[94/nm0001794_rm2793052928_1971-9-9_2002.jpg],1.0,[Henry Thomas],"[[60.97872305968401, 81.12229741291202, 241.72...",4.835529,,7554,60.978723,241.722892,81.122297,261.866467
439082,723344,2011,[03/nm2092503_rm389657344_1980-6-12_2011.jpg],0.0,[Carly Craig],"[[458.4672137293982, 458.4672137293982, 969.89...",4.029924,,2947,458.467214,969.890829,458.467214,969.890829
44878,710288,1999,[02/nm0000302_rm3376126720_1944-9-13_1999.jpg],0.0,[Jacqueline Bisset],"[[120.96634595104916, 40.68611531701639, 165.5...",3.271023,3.186312,8080,120.966346,165.577976,40.686115,85.297745
157170,719216,2015,[48/nm0005048_rm1425076224_1969-2-22_2015.jpg],1.0,[Thomas Jane],"[[1, 1, 4800, 3600]]",-inf,,18847,1.0,4800.0,1.0,3600.0


## Fixing string columns: full_path and name

In [7]:
# Turn full_path from a single-element list/array (e.g. [01/xxx.jpg]) into a plain string
# prefixed with the dataset folder (e.g. imdb/01/xxx.jpg), as the preview below shows.
df = fix_full_path_no_thread(df, 'imdb')

In [8]:
# Unwrap each name from its single-element container (e.g. [Fred Astaire] -> Fred Astaire).
# Values that are already plain strings are left untouched, so re-running this cell
# does not truncate names to their first character.
df.loc[:, 'name'] = df.loc[:, 'name'].apply(
    lambda item: item if isinstance(item, str) else item[0]
)

In [9]:
# Preview the first five rows to check the cleaned full_path and name columns.
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168


## Getting age

In [10]:
# Convert each raw `dob` serial number into a date (e.g. 693726 -> 1899-05-10).
# The helper is passed directly; wrapping it in a lambda adds nothing.
df['dob_date'] = df['dob'].apply(set_dob_datetime)

In [11]:
# Approximate age in years: year the photo was taken minus birth year.
# Month and day of birth are ignored, so the value can be off by up to one year.
birth_year = df['dob_date'].dt.year
df['age'] = df['photo_taken'] - birth_year

In [12]:
# Preview the first rows to check the new dob_date and age columns.
df.head()

Unnamed: 0,dob,photo_taken,full_path,gender,name,face_location,face_score,second_face_score,celeb_id,xmin,xmax,ymin,ymax,dob_date,age
0,693726,1968,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1072.926, 161.838, 1214.7839999999999, 303.6...",1.459693,1.118973,6488,1072.926,1214.784,161.838,303.696,1899-05-10,69.0
1,693726,1970,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,1.0,Fred Astaire,"[[477.184, 100.352, 622.592, 245.76]]",2.543198,1.852008,6488,477.184,622.592,100.352,245.76,1899-05-10,71.0
2,693726,1968,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[114.96964308962852, 114.96964308962852, 451....",3.455579,2.98566,6488,114.969643,451.686572,114.969643,451.686572,1899-05-10,69.0
3,693726,1968,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[622.8855056426588, 424.21750383700805, 844.3...",1.872117,,6488,622.885506,844.339008,424.217504,645.671006,1899-05-10,69.0
4,693726,1968,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.0,Fred Astaire,"[[1013.8590023603723, 233.8820422075853, 1201....",1.158766,,6488,1013.859002,1201.586128,233.882042,421.609168,1899-05-10,69.0


## Dropping unnecessary columns

In [13]:
# Drop the raw columns that have been replaced by derived ones
# (face_location -> xmin/xmax/ymin/ymax, dob -> dob_date).
# Reassigning instead of mutating in place, together with errors='ignore', keeps this
# cell re-runnable: a second execution no longer raises KeyError for missing columns.
df = df.drop(columns=['face_location', 'dob'], errors='ignore')

## Setting feature order

In [14]:
# Final column layout: identity, image path, detection scores, bounding box, dates and labels.
feature_order = [
    'celeb_id', 'name', 'full_path',
    'face_score', 'second_face_score',
    'xmin', 'xmax', 'ymin', 'ymax',
    'dob_date', 'photo_taken', 'age', 'gender',
]
df = df[feature_order]

In [15]:
# Preview the first rows to confirm the final column order.
df.head()

Unnamed: 0,celeb_id,name,full_path,face_score,second_face_score,xmin,xmax,ymin,ymax,dob_date,photo_taken,age,gender
0,6488,Fred Astaire,imdb/01/nm0000001_rm124825600_1899-5-10_1968.jpg,1.459693,1.118973,1072.926,1214.784,161.838,303.696,1899-05-10,1968,69.0,1.0
1,6488,Fred Astaire,imdb/01/nm0000001_rm3343756032_1899-5-10_1970.jpg,2.543198,1.852008,477.184,622.592,100.352,245.76,1899-05-10,1970,71.0,1.0
2,6488,Fred Astaire,imdb/01/nm0000001_rm577153792_1899-5-10_1968.jpg,3.455579,2.98566,114.969643,451.686572,114.969643,451.686572,1899-05-10,1968,69.0,1.0
3,6488,Fred Astaire,imdb/01/nm0000001_rm946909184_1899-5-10_1968.jpg,1.872117,,622.885506,844.339008,424.217504,645.671006,1899-05-10,1968,69.0,1.0
4,6488,Fred Astaire,imdb/01/nm0000001_rm980463616_1899-5-10_1968.jpg,1.158766,,1013.859002,1201.586128,233.882042,421.609168,1899-05-10,1968,69.0,1.0


## Saving complete clean dataset

In [16]:
# Persist the cleaned dataset. With pandas' default settings the row index is written
# as the first (unnamed) column of the CSV.
imdb_csv_path = '/home/nicoli/github/ssd_keras/dataset/csv/imdb_csv/imdb.csv'
df.to_csv(imdb_csv_path)