In [1]:
# %load my_imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import datetime
import time

import json

In [2]:
# Dict writer n reader
import csv
def write_to_file(name, dict_object):
    with open(name, 'wb') as csv_file:
        writer = csv.writer(csv_file)
        for key, value in dict_object.items():
           writer.writerow([key, value])
        
def load_from_file(file_name):
    dict_object = {}
    df = pd.read_csv(file_name, names = ['key', 'value'])
    for idx in df.index:
        key = df.key[idx]
        value = [int(value) for value in df.value[idx][1:-1].split(',')]
        dict_object.update({key: value})
    return dict_object

In [3]:
movie_actor_dict_int_int = load_from_file('10 networks/movie_actor_dict_int_int.csv')
actor_movie_dict_int_int = load_from_file('10 networks/actor_movie_dict_int_int.csv')

In [None]:
# Constructing actor network edgelist
# Problem 1 recount? Count i only
# Problem 2 recount every encounter
import collections

def build_actor_edgelist(test_dict, actor_dict, movie_dict):
    actor_edge_list = pd.DataFrame()
    for actor_i in test_dict.keys():
        if actor_i%1000 == 0:
            print actor_i
        # i_starred: number of movies actor_i starred in
        i_starred = len(test_dict[actor_i])
        # co_starring: list of actors i has costarred with
        co_starring = [actor_j for movie in test_dict[actor_i] for actor_j in movie_dict[movie]] 
        # remove all actor_i elements
        while actor_i in co_starring: co_starring.remove(actor_i)
        # sort in increasing order (better behavior control)
        co_starring.sort()
        # co_starring_instances: dict_object, count number of each instances
        co_starring_instances = collections.Counter(co_starring)
        # co_starring_set: tuple_object, no duplicates
        co_starring_set = set(co_starring)
        weight = [co_starring_instances[key] / float(i_starred) for key in co_starring_instances.keys()]
        edge_list_i = pd.DataFrame([[actor_i for i in range(len(co_starring_set))], list(co_starring_set), weight]).transpose()
        actor_edge_list = pd.concat([actor_edge_list, edge_list_i])
    return actor_edge_list

In [None]:
# A subset of the dictionary for test purposes
i = 2
spread = 3000
dict_test = dict((key, actor_movie_dict_int_int[key]) for key in range(i, i + spread))

In [None]:
# write in once (much slower than write to multiple files)
str_time= time.time()
actor_edge_list = build_actor_edgelist(actor_movie_dict_int_int, actor_movie_dict_int_int, movie_actor_dict_int_int)
time_spend = float(time.time() - str_time)
print('Time used: %ds' %time_spend)
print('Number of actor involved: %d' %len(actor_movie_dict_int_int))
print('Edge_list_length: %d' %len(actor_edge_list))
print('Average Connection: %d' %(len(actor_edge_list)/len(actor_movie_dict_int_int)))

In [None]:
actor_edge_list.to_csv('actor_network.csv', header=0, index=0)

In [4]:
movie_actor_dict_int_int_trimmed = load_from_file('trimmed_special/movie_actor_dict_int_int_trimmed.csv')
actor_movie_dict_int_int_trimmed = load_from_file('trimmed_special/actor_movie_dict_int_int_trimmed.csv')

In [7]:
# Constructing movie_edge_network
# key: for movies that are deleted yet 
# they still exists in actor files
# how to delete them easily
# ignore such factor, since it counts 
# for only 20%, and they are legitimate
# points, just not in the trimmed dict

def build_movie_edgelist(test_dict, actor_dict, movie_dict):
    movie_edge_list = pd.DataFrame()
    # sort keys to keep it tracked
    key_iter = test_dict.keys()
    key_iter.sort()
    for movie_i in key_iter:
        if movie_i%1000 == 0:
            print movie_i
        # i_population: number of actors in movie_i
        i_population = len(test_dict[movie_i])
        # share_actor: a list of movies that shared actors with movie_i
        share_actor = [movie_j for actor in test_dict[movie_i] for movie_j in actor_dict[actor]]
        # remove_duplicates
        share_actor = set(share_actor)
        # remove movie_i
        share_actor.remove(movie_i)
        jaccard = [len(set(test_dict[movie_i]).intersection(movie_dict[key])) for key in share_actor]
        weight = [value/ float(len(movie_dict[key]) + i_population + value) for key, value in zip(share_actor, jaccard)]
        edge_list_i = pd.DataFrame([[movie_i for i in range(len(share_actor))], list(share_actor), weight]).transpose()
        movie_edge_list = pd.concat([movie_edge_list, edge_list_i])
    return movie_edge_list

In [8]:
import random
rand_key =[movie_actor_dict_int_int_trimmed.keys()[i] for i in sorted(random.sample(xrange(len(movie_actor_dict_int_int_trimmed.keys())), 2000))]
dict_test = dict((key, movie_actor_dict_int_int_trimmed[key]) for key in rand_key)

In [9]:
str_time= time.time()
movie_edge_list = build_movie_edgelist(movie_actor_dict_int_int_trimmed, actor_movie_dict_int_int_trimmed, movie_actor_dict_int_int_trimmed)
time_spend = float(time.time() - str_time)
print('Time used: %ds' %time_spend)

4000
13000
14000
24000
36000
40000
44000
64000
65000
66000
81000
83000
92000
96000
125000
135000
157000
162000
170000
174000
177000
209000
210000
225000
230000
235000
237000
241000
248000
250000
259000
270000
283000
290000
291000
292000
295000
298000
314000
340000
341000
365000
381000
385000
396000
399000
405000
419000
421000
424000
431000
445000
451000
463000
466000
Time used: 2883s


In [25]:
len(movie_actor_dict_int_int_trimmed.keys())

105596

In [11]:
movie_edge_list.to_csv('movie_network_trimmed.csv', header = 0, index = 0)

In [10]:
movie_edge_list.head()

Unnamed: 0,0,1,2
0,2.0,321024.0,0.038462
1,2.0,230403.0,0.041667
2,2.0,37303.0,0.043478
3,2.0,376326.0,0.08
4,2.0,17153.0,0.04
