In [1]:
# %load my_imports.py
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import csv

import datetime
import time

import json



In [2]:
# Dict writer n reader
def write_to_file(name, dict_object):
    with open(name, 'wb') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in dict_object.items():
           writer.writerow([key, value])
        
def load_from_file(file_name):
    dict_object = {}
    df = pd.read_csv(file_name, names = ['key', 'value'])
    for idx in df.index:
        key = df.key[idx]
        value = [int(value) for value in df.value[idx][1:-1].split(',')]
        dict_object.update({key: value})
    return dict_object

def load_lists(file_name):
    dict_object = {}
    df = pd.read_csv(file_name, names = ['value', 'key'])
    for idx in df.index:
        dict_object.update({df.key[idx]: df.value[idx]})
    return dict_object

def load_lists_reversed(file_name):
    dict_object = {}
    df = pd.read_csv(file_name, names = ['key', 'value'])
    for idx in df.index:
        dict_object.update({df.key[idx]: df.value[idx]})
    return dict_object

def load_pagerank(file_name):
    dict_object = {}
    df = pd.read_csv(file_name, names = ['key', 'value'])
    for idx in df.index:
        dict_object.update({int(df.key[idx]): df.value[idx]})
    return dict_object


In [3]:
# load both index lists
movie_list = load_lists('10 networks/movie_dict.csv')
actor_list = load_lists('10 networks/actor_dict.csv')
movie_actor_dict_int_int = load_from_file('10 networks/movie_actor_dict_int_int_trimmed.csv')
actor_movie_dict_int_int = load_from_file('10 networks/actor_movie_dict_int_int_trimmed.csv')
actor_list_reversed = load_lists_reversed('10 networks/actor_dict.csv')
movie_list_reversed = load_lists_reversed('10 networks/movie_dict.csv')

In [4]:
# Find indexes 
target_movies = ['Batman v Superman: Dawn of Justice (2016)', 'Mission: Impossible - Rogue Nation (2015)', 'Minions (2015)']
target_movie_index = [movie_list_reversed[movie] for movie in target_movies]
target_movie_index

[267804, 415523, 289502]

In [5]:
# load movie ratings
movie_ratings = {}
with open ('movies/movie_rating.txt') as f:
    contents = f.readlines()
for content in contents:
    line = filter(None, re.split(r"[~\t\n]+", content))
    movie_ratings.update({line[0]: float(line[-1])})
    
movie_int_ratings = {}
no_ratings = 0
for key in movie_actor_dict_int_int.keys():
    try:
        movie_int_ratings.update({key: [movie_ratings[movie_list[key]]]})
    except: 
        no_ratings +=1
print(no_ratings)
print(len(movie_actor_dict_int_int))
print(len(movie_int_ratings))

21111
105596
84485


In [6]:
# This is simple and can be partitioned into two steps
# Step 1: create a dict that assigns score to each actor/actress
# actor_rating_counts: number of movie ratings to be counted in actor
# actor list and actor_movie_dict mismatch since I did not generate a trimmed actor/movie list
def construct_actor_ratings(movie_rating_counts, actor_list=actor_list, actor_movie_dict_int_int=actor_movie_dict_int_int,
                           movie_int_ratings=movie_int_ratings):
    actor_ratings = {}
    for actor in actor_list.keys():
        ratings = []
        try:
            for movie in actor_movie_dict_int_int[actor]:
                ratings.append(movie_int_ratings[movie][0])
            if len(ratings) == 1:
                rating_score = ratings[0]
            elif len(ratings) == 0:
                rating_score = 0
            else: 
                ratings.sort()
                rating_score = np.mean(ratings[:movie_rating_counts])
            actor_ratings.update({actor: round(rating_score, 1)})
        except:
            pass
    return actor_ratings

In [7]:
# Step 2 
# movie_rating_counts: number of actors ratings to be counted in movie
def construct_movie_ratings(actor_rating_counts, movie_actor_dict_int_int=movie_actor_dict_int_int,
                           actor_list=actor_list, movie_list=movie_list):
    movie_rating_predicted = {}
    for movie in movie_list.keys():
        ratings = []
        rating_score = 0
        try:
            for actor in movie_actor_dict_int_int[movie]:
                ratings.append(actor_ratings[actor])
            if len(ratings) == 1:
                rating_score = ratings[0]
            elif len(ratings) == 0:
                rating_score = 0
            else: 
                ratings.sort()
                rating_score = np.mean(ratings[:actor_rating_counts])
            movie_rating_predicted.update({movie: round(rating_score, 1)})
        except: pass
    return movie_rating_predicted

In [8]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def simple_analysis(movie_rating_predicted, 
                    movie_int_ratings=movie_int_ratings):
    y_real = []
    y_pred = []
    for movie in movie_int_ratings.keys():
        try:
            y_pred.append(movie_rating_predicted[movie])
            y_real.append(movie_int_ratings[movie])
        except: pass
    mse = mean_squared_error(y_real, y_pred)
    me = np.mean(y_real)-np.mean(y_pred)
    armse = sqrt(mse)
    return mse, me, armse

In [10]:
# pipeline
movie_pipeline = [x + 1 for x in range(10)]
movie_pipeline.append(-1)
actor_pipeline = [x + 1 for x in range(10)]
actor_pipeline.append(-1)
mse_list = []
me_list = []
armse_list = []
min_me = 1000
min_mse = 1000
min_armse = 1000
for i in movie_pipeline:
    for j in actor_pipeline:
        print("movie: %d, actor: %d" %(i, j))
        actor_ratings = construct_actor_ratings(i)   
        movie_rating_predicted = construct_movie_ratings(j)
        mse, me, armse = simple_analysis(movie_rating_predicted)
        mse_list.append(mse)
        me_list.append(me)
        armse_list.append(armse)
        if mse < min_mse:
            mse_track = [i, j]
            min_mse = mse
        if me < min_me:
            me_track = [i, j]
            min_me = me
        if armse < min_armse:
            armse_track = [i, j]
            min_armse = armse

movie: 1, actor: 1
movie: 1, actor: 2
movie: 1, actor: 3
movie: 1, actor: 4
movie: 1, actor: 5
movie: 1, actor: 6
movie: 1, actor: 7
movie: 1, actor: 8
movie: 1, actor: 9
movie: 1, actor: 10
movie: 1, actor: -1
movie: 2, actor: 1
movie: 2, actor: 2
movie: 2, actor: 3
movie: 2, actor: 4
movie: 2, actor: 5
movie: 2, actor: 6
movie: 2, actor: 7
movie: 2, actor: 8
movie: 2, actor: 9
movie: 2, actor: 10
movie: 2, actor: -1
movie: 3, actor: 1
movie: 3, actor: 2
movie: 3, actor: 3
movie: 3, actor: 4
movie: 3, actor: 5
movie: 3, actor: 6
movie: 3, actor: 7
movie: 3, actor: 8
movie: 3, actor: 9
movie: 3, actor: 10
movie: 3, actor: -1
movie: 4, actor: 1
movie: 4, actor: 2
movie: 4, actor: 3
movie: 4, actor: 4
movie: 4, actor: 5
movie: 4, actor: 6
movie: 4, actor: 7
movie: 4, actor: 8
movie: 4, actor: 9
movie: 4, actor: 10
movie: 4, actor: -1
movie: 5, actor: 1
movie: 5, actor: 2
movie: 5, actor: 3
movie: 5, actor: 4
movie: 5, actor: 5
movie: 5, actor: 6
movie: 5, actor: 7
movie: 5, actor: 8
movi

In [18]:
# select 1, 5, 10, all
sample = [0, 4, 9, 10]
x = np.arange(1, 12)
plt.plot()
for i in sample:
    plt.plot(x, mse_list[11*i:11*(i+1)])
plt.legend(['1', '5', '10', 'all'], loc='upper right', title='Number of movie used \nin computing actor score')
plt.xlabel("Number of Actors used in Perdicting Movie Ratings(11 means all)")
plt.ylabel("Mean Square Error")
plt.grid("on")
# plt.show()
plt.savefig('bi01', dpi = 800)
plt.gcf().clear()

In [19]:
# select 1, 5, 10, all
sample = [0, 4, 9, 10]
x = np.arange(1, 12)
plt.plot()
for i in sample:
    plt.plot(x, me_list[11*i:11*(i+1)])
plt.legend(['1', '5', '10', 'all'], loc='upper right', title='Number of movie used \nin computing actor score')
plt.xlabel("Number of Actors used in Perdicting Movie Ratings(11 means all)")
plt.ylabel("Mean Error")
plt.grid("on")
# plt.show()
plt.savefig('bi02', dpi = 800)
plt.gcf().clear()

In [20]:
# select 1, 5, 10, all
sample = [0, 4, 9, 10]
x = np.arange(1, 12)
plt.plot()
for i in sample:
    plt.plot(x, armse_list[11*i:11*(i+1)])
plt.legend(['1', '5', '10', 'all'], loc='upper right', title='Number of movie used \nin computing actor score')
plt.xlabel("Number of Actors used in Perdicting Movie Ratings(11 means all)")
plt.ylabel("Average Root Mean Square Error")
plt.grid("on")
# plt.show()
plt.savefig('bi03', dpi = 800)
plt.gcf().clear()

In [28]:
# length of actors of movie:
for movie in target_movie_index:
    print len(movie_actor_dict_int_int[movie])

66
62
21


In [54]:
# list of target actors:
target_actor = list(set([actor for movie in target_movie_index for actor in movie_actor_dict_int_int[movie]]))

In [80]:
# target_actor_ratings
target_actor_ratings = {}
for actor in target_actor:
    ratings = []
    for movie in actor_movie_dict_int_int[actor]:
        try:
            ratings.append(movie_int_ratings[movie])
        except:
            pass
    if len(ratings) > 0:
        target_actor_ratings.update({actor:np.mean(ratings)})

In [84]:
ratings = []
for movie in target_movie_index:
    rating = []
    for actor in movie_actor_dict_int_int[movie]:
        try: rating.append(target_actor_ratings[actor])
        except: pass
    ratings.append(rating)
    print ('%s' %movie_list[movie])
    print ('length: %d' %len(rating))
    print ('rating: %f' %round(np.mean(rating), 1))

Batman v Superman: Dawn of Justice (2016)
length: 63
rating: 6.300000
Mission: Impossible - Rogue Nation (2015)
length: 62
rating: 6.500000
Minions (2015)
length: 21
rating: 6.800000


In [99]:
# histogram of actor ratings
plt.hist(ratings[0], 63)
plt.hist(ratings[1], 62)
plt.hist(ratings[2], 21)
plt.legend(['Batman v Superman: Dawn of Justice (2016)', 'Mission: Impossible - Rogue Nation (2015)',
            'Minions (2015)'], loc='upper left', fontsize = 7)
plt.xlabel("Number of Actor/Actress")
plt.ylabel("Actor/Actress Ratings")
plt.grid("on")
# plt.show()
plt.savefig('hist', dpi = 800)
plt.gcf().clear()

In [100]:
mse_list[-1]

0.54029411764705881

In [101]:
me_list[-1]

0.19369747899159595

In [103]:
armse_list[-1]

0.7350470173036953