In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

# Load both line-delimited, gzip-compressed JSON dumps (one JSON record per line).
# 'records' is the documented orient value for this layout; the previous
# 'record' spelling is not a listed option for read_json.
wikidata = pd.read_json('wikidata-movies.json.gz', orient='records', lines=True, encoding="utf8")
rotten_tomato = pd.read_json('rotten-tomatoes.json.gz', orient='records', lines=True)


In [2]:
# A movie is labelled "good" when its Rotten Tomatoes critic score is at
# least 80% (the comparison is inclusive).
# Built as a single chain with .assign so the new column is added to a fresh
# frame rather than written onto a column-subset of the original, which can
# trigger pandas' SettingWithCopyWarning.
rotten_tomato = (
    rotten_tomato[['rotten_tomatoes_id', 'critic_percent']]
    .assign(good=lambda scores: scores['critic_percent'] >= 80)
    .drop(columns=['critic_percent'])
    .set_index('rotten_tomatoes_id')
)
# rotten_tomato

In [3]:
# Keep only the movies that have a cast_member value recorded.
wikidata_with_cast = wikidata.dropna(subset=['cast_member'])

In [4]:
# wikidata_with_cast

Unnamed: 0,based_on,cast_member,country_of_origin,director,enwiki_title,filming_location,genre,imdb_id,label,made_profit,main_subject,metacritic_id,original_language,publication_date,rotten_tomatoes_id,series,wikidata_id
0,,"[Q228931, Q235384]",Q145,[Q43079418],If I Were You (2012 Canadian film),,[Q859369],tt1587309,If I Were You,,,,Q1860,2012-01-01,m/if_i_were_you_2012,,Q43079072
2,,"[Q106303, Q467957, Q3345693, Q3340838, Q316419...",Q142,[Q724208],Le Brio,[Q50742],[Q157443],tt6462462,,,,,Q150,2017-01-01,m/le_brio,,Q43400054
6,,"[Q5126010, Q3390414, Q5676024, Q237021]",Q29,[Q51892574],Orbiter 9,,"[Q24925, Q21010853]",tt3469798,Orbiter 9,0.0,,,,2017-04-07,m/orbiter_9,,Q42577704
7,Q18214949,"[Q202381, Q7367121, Q179576, Q20630818, Q28474...",Q30,[Q18608206],The Kindergarten Teacher (2018 film),,[Q130232],tt6952960,The Kindergarten Teacher,,,,Q1860,2018-01-01,m/the_kindergarten_teacher_2018,,Q47461695
10,,"[Q41548, Q15712136, Q1104856]",Q219,[Q768543],Acts of Vengeance (film),,[Q188473],tt6288694,Acts of Vengeance,,,movie/acts-of-vengeance,Q1860,2017-01-01,m/acts_of_vengeance,,Q42289722
12,,"[Q41548, Q104514, Q172261, Q431038]",Q219,[Q15439821],Bullet Head,,"[Q959790, Q182015]",tt6204340,Bullet Head,,,movie/bullet-head,Q1860,2017-01-01,m/bullet_head,,Q42843713
13,,[Q42101],Q30,[Q465139],The Equalizer 2,,"[Q2484376, Q188473]",tt3766354,The Equalizer 2,,,,Q1860,2018-08-30,m/the_equalizer_2,,Q41493467
14,,"[Q229291, Q296616, Q940891, Q2558129, Q19661737]",Q30,,Hereditary (film),,[Q200092],tt7784604,Hereditary,,,movie/hereditary,Q1860,2018-01-21,m/hereditary,,Q47524071
16,,"[Q4120019, Q20973952, Q29622423, Q27044068, Q3...",Q30,[Q27044648],Monsters and Men,,[Q130232],tt7689906,Monsters and Men,,,,Q1860,2018-01-01,m/monsters_and_men,,Q47461687
19,,"[Q254274, Q12225006, Q12249859, Q4165417, Q309...",Q79,[Q312611],The Choice (1970 film),,[Q130232],tt0065879,The Choice,,,,Q13955,1970-01-01,m/alikhtiyar_the_choice,,Q38905900


In [5]:
# Narrow the frame to the cast list and the Rotten Tomatoes key used for joining.
cast_members_by_movie = wikidata_with_cast.loc[:, ['cast_member', 'rotten_tomatoes_id']]

In [6]:
# cast_members_by_movie.merge(rotten_tomato, on='rotten_tomatoes_id')

Unnamed: 0,cast_member,rotten_tomatoes_id,good
0,"[Q228931, Q235384]",m/if_i_were_you_2012,False
1,"[Q106303, Q467957, Q3345693, Q3340838, Q316419...",m/le_brio,True
2,"[Q5126010, Q3390414, Q5676024, Q237021]",m/orbiter_9,False
3,"[Q202381, Q7367121, Q179576, Q20630818, Q28474...",m/the_kindergarten_teacher_2018,True
4,"[Q41548, Q15712136, Q1104856]",m/acts_of_vengeance,False
5,"[Q41548, Q104514, Q172261, Q431038]",m/bullet_head,False
6,[Q42101],m/the_equalizer_2,False
7,"[Q229291, Q296616, Q940891, Q2558129, Q19661737]",m/hereditary,True
8,"[Q4120019, Q20973952, Q29622423, Q27044068, Q3...",m/monsters_and_men,True
9,"[Q254274, Q12225006, Q12249859, Q4165417, Q309...",m/alikhtiyar_the_choice,False


In [7]:
# Sample a small number for testing
# cast_members_by_movie = cast_members_by_movie.head(200)

# Reshape from one row per movie (cast_member holding a list) to one row per
# (rotten_tomatoes_id, cast_member) pair.
# DataFrame.explode does this directly; the previous apply(pd.Series) + merge
# + melt route first built a wide frame with one column per cast position,
# which is slow and memory-hungry on the full data set.
# Rows missing either the movie id or the cast member are dropped, as before.
# Rows now come out grouped by movie instead of by cast position; the order
# of cast members within a movie is unchanged.
cast_members_by_movie = (
    cast_members_by_movie[['rotten_tomatoes_id', 'cast_member']]
    .explode('cast_member')
    .dropna()
    .reset_index(drop=True)
)

In [8]:
# Index the (movie, cast member) pairs by movie id so they can be joined on it.
cast_members_by_movie = cast_members_by_movie.set_index(keys='rotten_tomatoes_id')

In [9]:
# Index-on-index left join: every rated movie is kept, and movies without
# any cast rows get a missing cast_member.
cast_members_by_movie_with_rating = rotten_tomato.join(other=cast_members_by_movie, how='left')

In [10]:
# Discard every row that has a missing value in any column.
cast_members_by_movie_with_rating = cast_members_by_movie_with_rating.dropna(axis='index', how='any')

In [11]:
# Preview the first rows only instead of rendering the whole joined frame.
cast_members_by_movie_with_rating.head(10)

Unnamed: 0_level_0,good,cast_member
rotten_tomatoes_id,Unnamed: 1_level_1,Unnamed: 2_level_1
m/033,False,Q7377514
m/0815_1954,False,Q78119
m/0815_1954,False,Q119256
m/0815_1954,False,Q2059082
m/0815_1954,False,Q1314547
m/0815_1954,False,Q101059
m/0815_1954,False,Q123674
m/0815_1954,False,Q67723
m/0815_1954,False,Q2571363
m/0815_1954,False,Q61322


In [12]:
# One-hot encode the cast members: one indicator column per distinct
# cast_member value, one row per (movie, cast member) pair.
# NOTE: the original author reported roughly 26 GB of memory use at this
# step and suggested converting it to a Spark job.
categorical_rep_of_cast_in_movies = pd.get_dummies(
    data=cast_members_by_movie_with_rating.cast_member
)

In [13]:
# Preview the first rows only; the one-hot frame is far too large to display whole.
categorical_rep_of_cast_in_movies.head(10)

Unnamed: 0_level_0,Q1000015,Q100028,Q1000309,Q1000408,Q1000626,Q1000748,Q1001130,Q1001175,Q1001214,Q100122,...,Q99913,Q999232,Q999260,Q999296,Q999318,Q999332,Q999769,Q999866,Q999901,Q999960
rotten_tomatoes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m/033,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/0815_1954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Collapse the per-(movie, cast member) indicator rows into one row per movie:
# a column is 1 when that cast member appears in any of the movie's rows.
# The result only ever holds 0/1, so it is stored as uint8 (1 byte per cell)
# instead of the platform int (typically 8 bytes per cell). The recorded run
# of this cell ended in a MemoryError.
categorical_rep_of_cast_in_movies = (
    categorical_rep_of_cast_in_movies
    .groupby('rotten_tomatoes_id')
    .any()
    .astype('uint8')
)

MemoryError: 

In [None]:
# Show the movies whose indicator for cast member Q101059 is set.
categorical_rep_of_cast_in_movies.loc[
    categorical_rep_of_cast_in_movies['Q101059'].eq(1)
]

In [None]:
# Persist the movie-by-cast-member indicator matrix to disk as CSV.
categorical_rep_of_cast_in_movies.to_csv(path_or_buf='categorized_actors.csv')