In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

wikidata = pd.read_json('wikidata-movies.json.gz', orient='record', lines=True, encoding="utf8")
rotten_tomato = pd.read_json('rotten-tomatoes.json.gz', orient='record', lines=True)


In [2]:
# Here we will consider a movie good if it has a critic rating > 80% on rotten tomatoes.
# Drop all records with sub 30% critic score
rotten_tomato = rotten_tomato[rotten_tomato['critic_percent'] >= 30]


rotten_tomato = rotten_tomato[['rotten_tomatoes_id', 'critic_percent']]
rotten_tomato['good'] = (rotten_tomato['critic_percent'] >= 80)
rotten_tomato = rotten_tomato.drop(columns=['critic_percent'])
rotten_tomato = rotten_tomato.set_index('rotten_tomatoes_id')


In [3]:
# rotten_tomato

In [4]:
wikidata_with_cast = wikidata[wikidata.cast_member.notna()]
wikidata_with_cast = wikidata_with_cast[wikidata_with_cast.cast_member.map(len)>3]

In [5]:
cast_members_by_movie = wikidata_with_cast[['cast_member', 'rotten_tomatoes_id']]

In [6]:
# cast_members_by_movie.merge(rotten_tomato, on='rotten_tomatoes_id')

In [7]:
# Sample a small number for testing
cast_members_by_movie = cast_members_by_movie.head(20000)


cast_members_by_movie = cast_members_by_movie.cast_member.apply(pd.Series) \
    .merge(cast_members_by_movie, left_index = True, right_index = True) \
    .drop(["cast_member"], axis = 1) \
    .melt(id_vars = ['rotten_tomatoes_id'], value_name = "cast_member") \
    .drop('variable', axis = 1) \
    .dropna()

In [8]:
# cast_members_by_movie = cast_members_by_movie.set_index('rotten_tomatoes_id')

In [9]:
cast_members_by_movie = cast_members_by_movie.sort_values(['rotten_tomatoes_id'], ascending=True).set_index('rotten_tomatoes_id')

In [10]:
cast_members_by_movie_with_rating = rotten_tomato.join(cast_members_by_movie)

In [11]:
cast_members_by_movie_with_rating = cast_members_by_movie_with_rating.dropna()

In [12]:
cast_members_by_movie_with_rating

Unnamed: 0_level_0,good,cast_member
rotten_tomatoes_id,Unnamed: 1_level_1,Unnamed: 2_level_1
m/10,False,Q258064
m/10,False,Q1830242
m/10,False,Q2324150
m/10,False,Q234474
m/10,False,Q229477
m/10,False,Q3024455
m/10,False,Q161819
m/10,False,Q2928490
m/10,False,Q313545
m/10,False,Q1498498


In [13]:
# Using 26Gb of memory here - could convert to spark job
categorical_rep_of_cast_in_movies = pd.get_dummies(cast_members_by_movie_with_rating['cast_member'])
categorical_rep_of_cast_in_movies = categorical_rep_of_cast_in_movies.drop([col for col, val in categorical_rep_of_cast_in_movies.sum().iteritems() if val < 10], axis=1)

In [14]:
categorical_rep_of_cast_in_movies

Unnamed: 0_level_0,Q100937,Q101797,Q102124,Q102462,Q102551,Q102642,Q102711,Q103157,Q1033016,Q103343,...,Q978706,Q978857,Q979428,Q979778,Q980143,Q983020,Q983229,Q987741,Q9960,Q999332
rotten_tomatoes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
categorical_rep_of_cast_in_movies = categorical_rep_of_cast_in_movies.groupby('rotten_tomatoes_id').any().astype(int)

In [16]:
categorical_rep_of_cast_in_movies_with_rating = categorical_rep_of_cast_in_movies.join(rotten_tomato)

In [17]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(categorical_rep_of_cast_in_movies_with_rating.loc[:, categorical_rep_of_cast_in_movies_with_rating.columns != 'good'], categorical_rep_of_cast_in_movies_with_rating['good'])

In [19]:
X_train

Unnamed: 0_level_0,Q100937,Q101797,Q102124,Q102462,Q102551,Q102642,Q102711,Q103157,Q1033016,Q103343,...,Q978706,Q978857,Q979428,Q979778,Q980143,Q983020,Q983229,Q987741,Q9960,Q999332
rotten_tomatoes_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
m/1022191-twins,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/kingdom_of_the_spiders,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/pump_up_the_volume,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/770817812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/family_man,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/bend_it_like_beckham,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/women_on_the_verge_of_a_nervous_breakdown,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/crazy_as_hell,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/starsky_and_hutch,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
m/770676351,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
model = MLPClassifier(hidden_layer_sizes=(100,50,30,20,10),
                     activation='logistic', solver='lbfgs')
model.fit(X_train, y_train)
print(model.score(X_valid, y_valid))

y_predicted = model.predict(X_valid)
print(classification_report(y_valid, y_predicted))

0.5787060621497707
              precision    recall  f1-score   support

       False       0.58      1.00      0.73      1136
        True       0.00      0.00      0.00       827

   micro avg       0.58      0.58      0.58      1963
   macro avg       0.29      0.50      0.37      1963
weighted avg       0.33      0.58      0.42      1963



  'precision', 'predicted', average, warn_for)


In [21]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=50, min_samples_leaf=0.1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_valid, y_valid))

y_predicted = model.predict(X_valid)
print(classification_report(y_valid, y_predicted))

0.5797519959232207
0.5787060621497707
              precision    recall  f1-score   support

       False       0.58      1.00      0.73      1136
        True       0.00      0.00      0.00       827

   micro avg       0.58      0.58      0.58      1963
   macro avg       0.29      0.50      0.37      1963
weighted avg       0.33      0.58      0.42      1963



  'precision', 'predicted', average, warn_for)


In [22]:
from sklearn import svm
svm_model = svm.SVC(C=0.0000006)
svm_model.fit(X_train, y_train)
print(svm_model.score(X_train, y_train))
print(svm_model.score(X_valid, y_valid))

y_predicted = svm_model.predict(X_valid)
print(classification_report(y_valid, y_predicted))



0.5797519959232207
0.5787060621497707
              precision    recall  f1-score   support

       False       0.58      1.00      0.73      1136
        True       0.00      0.00      0.00       827

   micro avg       0.58      0.58      0.58      1963
   macro avg       0.29      0.50      0.37      1963
weighted avg       0.33      0.58      0.42      1963



  'precision', 'predicted', average, warn_for)


In [23]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100,
        max_depth=3, min_samples_leaf=10)
rf_model.fit(X_train, y_train)
print(rf_model.score(X_train, y_train))
print(rf_model.score(X_valid, y_valid))

y_predicted = rf_model.predict(X_valid)
print(classification_report(y_valid, y_predicted))

0.5804314591472737
0.5792154865002547
              precision    recall  f1-score   support

       False       0.58      1.00      0.73      1136
        True       1.00      0.00      0.00       827

   micro avg       0.58      0.58      0.58      1963
   macro avg       0.79      0.50      0.37      1963
weighted avg       0.76      0.58      0.43      1963



In [24]:
# model.predict(categorical_rep_of_cast_in_movies_with_rating.loZc[['m/10011699-leslie_my_name_is_evil'], categorical_rep_of_cast_in_movies_with_rating.columns != 'good'])