In [22]:
import copy
import json
from itertools import groupby

In [23]:
def merge_two_dicts(x, y):
    """Return a new dict containing x's entries overlaid with y's.

    On a key clash the value from y wins. Neither input is modified, and the
    values themselves are shared with the inputs (shallow merge).
    """
    merged = x.copy()
    merged.update(y)
    return merged

In [24]:
# Read the raw movie dump; every element of the parsed JSON becomes one entry of `data`.
data = []

with open('../../data/raw/themoviedb-4-16-2017.json') as raw_file:
    data += json.load(raw_file)

print(len(data))

51512


In [None]:
# Record, on every movie, how many crew members, cast members, genres and
# release dates it lists. A missing list counts as zero.

for movie in data:
    for count_key, source_key in (
        ('crew_count', 'crew'),
        ('cast_count', 'cast'),
        ('genre_count', 'genres'),
        ('release_date_count', 'release_dates'),
    ):
        movie[count_key] = len(movie.get(source_key, []))

In [None]:
# Nested list fields that are replaced by flat columns and then dropped.
NESTED_KEYS = (
    'cast',
    'crew',
    'spoken_languages',
    'production_countries',
    'production_companies',
    'release_dates',
)


def _flatten_crew(crew):
    """Map every crew entry to a flat column holding that person's name.

    The first person seen for a (department, job) pair is stored under
    "<department>.<job>"; each further person with the same pair is stored
    under "<department>.<n>.<job>" with n = 1, 2, ...

    People are counted per pair across the whole list rather than with
    itertools.groupby, which only groups *consecutive* items: with groupby,
    crew members sharing a role but not adjacent in the list would all map
    to the same column and all but one would be lost.
    """
    seen = {}      # (department, job) -> number of people already emitted
    columns = {}
    for person in crew:
        department, job = person['department'], person['job']
        position = seen.get((department, job), 0)
        seen[(department, job)] = position + 1
        if position == 0:
            column = "{}.{}".format(department, job)
        else:
            column = "{}.{}.{}".format(department, position, job)
        # Keep the first name if two entries ever format to the same column.
        columns.setdefault(column, person['name'])
    return columns


def _flatten_cast(cast):
    """Map every cast entry to a "cast.<order>" column holding its name.

    Consecutive entries sharing an `order` collapse into a single column
    (the last name of the run is kept); a column that already exists is
    never overwritten by a later run.
    """
    columns = {}
    for _, run in groupby(cast, key=lambda member: member['order']):
        run_columns = {"cast.{}".format(member['order']): member['name'] for member in run}
        # Existing columns are passed second so they take precedence.
        columns = merge_two_dicts(run_columns, columns)
    return columns


def _indicator_columns(prefix, items, field):
    """Return {"<prefix>.<item[field]>": 1} for every item in `items`."""
    return {"{}.{}".format(prefix, item[field]): 1 for item in items}


def _flatten_movie(movie):
    """Return a copy of `movie` with its nested lists turned into flat columns.

    Column groups are merged in the order crew, cast, spoken languages,
    production companies, production countries, released countries; on a name
    clash a later group overrides an earlier one (and the movie's own keys).
    The nested source lists are removed from the copy. `movie` itself is not
    modified.
    """
    column_groups = (
        _flatten_crew(movie.get('crew', [])),
        _flatten_cast(movie.get('cast', [])),
        _indicator_columns('spoken', movie.get('spoken_languages', []), 'iso_639_1'),
        _indicator_columns('production_company', movie.get('production_companies', []), 'name'),
        _indicator_columns('production_country', movie.get('production_countries', []), 'iso_3166_1'),
        _indicator_columns('released_country', movie.get('release_dates', []), 'iso_3166_1'),
    )

    flat = movie
    for columns in column_groups:
        # merge_two_dicts returns a new dict, so the input record is never mutated.
        flat = merge_two_dicts(flat, columns)

    for key in NESTED_KEYS:
        flat.pop(key, None)
    return flat


# Replace every record in place with its flattened version.
for index, movie in enumerate(data):
    data[index] = _flatten_movie(movie)

print(len(data))

In [None]:
# Expand the dataset so that every (movie, genre) pair becomes its own record,
# tagged with the genre's name under the flat 'genre' key. A movie with no
# genres contributes no record.
memo = []

for sample in data:
    # .get() tolerates records without a 'genres' key, matching how the
    # genre_count column is computed earlier in the notebook.
    for genre in sample.get('genres', []):
        # Deep copy so each output record owns its nested values.
        sample_copy = copy.deepcopy(sample)
        sample_copy['genre'] = genre['name']
        memo.append(sample_copy)

In [None]:
# Number of per-genre records in memo (one per movie/genre pair).
len(memo)

In [None]:
# Persist the per-genre records as one JSON array.
with open('../../data/processed/themoviedb-4-18-2017.json', 'w') as processed_file:
    json.dump(memo, processed_file)

In [None]:
# Spot-check one record: the fifth from the end of memo.
memo[-5]