In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

wikidata = pd.read_json('wikidata-movies.json.gz', orient='record', lines=True, encoding="utf8")
rotten_tomato = pd.read_json('rotten-tomatoes.json.gz', orient='record', lines=True)


In [2]:
# Here we will consider a movie good if it has a critic rating > 80% on rotten tomatoes.
# Drop all records with sub 30% critic score
rotten_tomato = rotten_tomato[rotten_tomato['critic_percent'] >= 30]


rotten_tomato = rotten_tomato[['rotten_tomatoes_id', 'critic_percent']]
rotten_tomato['good'] = (rotten_tomato['critic_percent'] >= 80)
rotten_tomato = rotten_tomato.drop(columns=['critic_percent'])
rotten_tomato = rotten_tomato.set_index('rotten_tomatoes_id')


In [4]:
wikidata_with_cast = wikidata[wikidata.cast_member.notna()]
wikidata_with_cast = wikidata_with_cast[wikidata_with_cast.cast_member.map(len)>3]

In [5]:
cast_members_by_movie = wikidata_with_cast[['cast_member', 'rotten_tomatoes_id']]

In [7]:
# https://www.mikulskibartosz.name/how-to-split-a-list-inside-a-dataframe-cell-into-rows-in-pandas/
cast_members_by_movie = cast_members_by_movie.cast_member.apply(pd.Series) \
    .merge(cast_members_by_movie, left_index = True, right_index = True) \
    .drop(["cast_member"], axis = 1) \
    .melt(id_vars = ['rotten_tomatoes_id'], value_name = "cast_member") \
    .drop('variable', axis = 1) \
    .dropna()

In [9]:
cast_members_by_movie = cast_members_by_movie.sort_values(['rotten_tomatoes_id'], ascending=True).set_index('rotten_tomatoes_id')

In [10]:
cast_members_by_movie_with_rating = rotten_tomato.join(cast_members_by_movie)

In [11]:
cast_members_by_movie_with_rating = cast_members_by_movie_with_rating.dropna()

In [13]:
categorical_rep_of_cast_in_movies = pd.get_dummies(cast_members_by_movie_with_rating['cast_member'])
categorical_rep_of_cast_in_movies = categorical_rep_of_cast_in_movies.drop([col for col, val in categorical_rep_of_cast_in_movies.sum().iteritems() if val < 10], axis=1)

In [15]:
categorical_rep_of_cast_in_movies = categorical_rep_of_cast_in_movies.groupby('rotten_tomatoes_id').any().astype(int)

In [16]:
categorical_rep_of_cast_in_movies_with_rating = categorical_rep_of_cast_in_movies.join(rotten_tomato)

In [17]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(categorical_rep_of_cast_in_movies_with_rating.loc[:, categorical_rep_of_cast_in_movies_with_rating.columns != 'good'], categorical_rep_of_cast_in_movies_with_rating['good'])

In [20]:
model = MLPClassifier(hidden_layer_sizes=(100,50,30,20,10),
                     activation='logistic', solver='lbfgs')
model.fit(X_train, y_train)
print(model.score(X_valid, y_valid))

y_predicted = model.predict(X_valid)
print(classification_report(y_valid, y_predicted))

0.5878757004584819
              precision    recall  f1-score   support

       False       0.59      1.00      0.74      1154
        True       0.00      0.00      0.00       809

   micro avg       0.59      0.59      0.59      1963
   macro avg       0.29      0.50      0.37      1963
weighted avg       0.35      0.59      0.44      1963



  'precision', 'predicted', average, warn_for)


In [21]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(n_estimators=50, min_samples_leaf=0.1)
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
print(model.score(X_valid, y_valid))

y_predicted = model.predict(X_valid)
print(classification_report(y_valid, y_predicted))

0.5766944114149821
0.5878757004584819
              precision    recall  f1-score   support

       False       0.59      1.00      0.74      1154
        True       0.00      0.00      0.00       809

   micro avg       0.59      0.59      0.59      1963
   macro avg       0.29      0.50      0.37      1963
weighted avg       0.35      0.59      0.44      1963



  'precision', 'predicted', average, warn_for)


In [22]:
from sklearn import svm
svm_model = svm.SVC(C=0.0000006)
svm_model.fit(X_train, y_train)
print(svm_model.score(X_train, y_train))
print(svm_model.score(X_valid, y_valid))

y_predicted = svm_model.predict(X_valid)
print(classification_report(y_valid, y_predicted))



0.5766944114149821
0.5878757004584819
              precision    recall  f1-score   support

       False       0.59      1.00      0.74      1154
        True       0.00      0.00      0.00       809

   micro avg       0.59      0.59      0.59      1963
   macro avg       0.29      0.50      0.37      1963
weighted avg       0.35      0.59      0.44      1963



  'precision', 'predicted', average, warn_for)


In [23]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100,
        max_depth=300, min_samples_leaf=10)
rf_model.fit(X_train, y_train)

print(rf_model.score(X_train, y_train))
print(rf_model.score(X_valid, y_valid))

y_predicted = rf_model.predict(X_valid)
print(classification_report(y_valid, y_predicted))

0.6252760319347715
0.6031584309730005
              precision    recall  f1-score   support

       False       0.60      0.96      0.74      1154
        True       0.62      0.10      0.17       809

   micro avg       0.60      0.60      0.60      1963
   macro avg       0.61      0.53      0.45      1963
weighted avg       0.61      0.60      0.50      1963

