In [6]:
import pandas as pd
import numpy as np
from itertools import combinations 
import networkx as nx
import time

In [7]:
title_akas = pd.read_csv('title.akas.tsv', sep='\t', index_col = 'titleId', usecols=['titleId', 'language','title', 'isOriginalTitle'], low_memory=False)
req = (set(title_akas[(title_akas['language']=='en')].index))
title_basics = pd.read_csv('title.basics.tsv', sep='\t', low_memory=False, 
                           usecols=['tconst','titleType', 'primaryTitle', 'genres'], index_col= 'tconst')



In [8]:
#Featurize genres
node_data = title_basics.reindex(req)
node_data['genres'] = node_data['genres'].str.split(',')   #Convert genres to lists
node_data.dropna(inplace=True)   #Remove the one row with no data
genres_edge = pd.Series(node_data['genres']).apply(frozenset).to_frame(name='genre')
for genre in frozenset.union(*genres_edge.genre):
    genres_edge[genre] = genres_edge.apply(lambda _: int(genre in _.genre), axis=1)


In [None]:
genres_for_edges = ['Mystery',
 'Thriller',
 'Biography',
 'Talk-Show',
 'Film-Noir',
 '\\N',
 'Music',
 'Musical',
 'Adventure',
 'Game-Show',
 'News',
 'Western',
 'Sci-Fi',
 'Sport',
 'History',
 'Fantasy',
 'Reality-TV',
 'War',
 'Family',
 'Crime',
 'Adult',
 'Horror',
 'Animation']
num=0
save=0
edges=set()
for a in genres_for_edges:
    t0 = time.time()
    for x,y in combinations(genres_edge[genres_edge[a]==1].index, 2):
        num += 1
        if ((x,y) not in edges):
            edges.add((x,y))
        else:
            save += 1
    t1 = time.time()
    print (a, ":" ,sum(genres_edge[a]), ":", t1-t0)
    
num_combinations = num #Number of possible combinations explored
num_unique_edges = len(edges) #Number of unique edges
print (num_combinations - num_unique_edges)
print (num_unique_edges)
print ("if (y,x) save ",save)
a = len(node_data.index)
print (a * (a - 1)) #Max number of edges possible in the graph

Mystery : 3424 : 4.6885621547698975
Thriller : 5631 : 13.672004699707031
Biography : 2392 : 4.525630235671997
Talk-Show : 247 : 0.02549290657043457
Film-Noir : 330 : 0.04189801216125488
\N : 9600 : 43.965028047561646
Music : 1764 : 1.5451488494873047
Musical : 1539 : 0.9251368045806885
Adventure : 7326 : 56.161131858825684
Game-Show : 130 : 0.14443397521972656
News : 382 : 0.3527989387512207
Western : 1036 : 1.710231065750122
Sci-Fi : 2401 : 10.55589509010315
Sport : 1087 : 2.410902738571167
History : 3032 : 15.853185176849365
Fantasy : 4117 : 42.37049126625061
Reality-TV : 282 : 0.4117770195007324
War : 2667 : 15.459173917770386
Family : 4258 : 48.782723903656006
Crime : 7925 : 194.31944394111633
Adult : 1768 : 20.21605920791626
Horror : 4533 : 139.26352381706238


In [5]:
node_data.head()  #Node data

Unnamed: 0_level_0,titleType,primaryTitle,genres
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0195234,movie,Saving Grace,"[Comedy, Crime]"
tt2215289,videoGame,Major Havoc,"[Action, Sci-Fi]"
tt0168690,movie,A Free Woman,[Drama]
tt0366997,movie,Mens' Companion,[\N]
tt1068962,movie,The Two Deaths of Quincas Wateryell,"[Comedy, Drama]"


In [6]:
list(edges)[:5] #Edge data

[('tt0040823', 'tt4108044'),
 ('tt1837492', 'tt0852963'),
 ('tt0184167', 'tt1415872'),
 ('tt4247628', 'tt6819596'),
 ('tt0391502', 'tt1667150')]

In [7]:
#Begin Graph

G = nx.Graph()
G.clear()
for index,row in node_data.iterrows():
    G.add_node(index, data=row['primaryTitle'])
for x in list(edges):
    G.add_edge (x[0],x[1])

In [39]:
genre_count = {}
for x in genres_edge.columns.values:
    genre_count[x] = sum(genres_edge[x])
genre_count

{'Mystery': 3424,
 'Thriller': 5631,
 'Biography': 2392,
 'Action': 12226,
 'Talk-Show': 247,
 'Film-Noir': 330,
 '\\N': 9600,
 'Music': 1764,
 'Musical': 1539,
 'Adventure': 7326,
 'Drama': 46180,
 'Game-Show': 130,
 'Documentary': 15096,
 'News': 382,
 'Short': 18917,
 'Western': 1036,
 'Sci-Fi': 2401,
 'Sport': 1087,
 'Romance': 10265,
 'History': 3032,
 'Fantasy': 4117,
 'Reality-TV': 282,
 'War': 2667,
 'Family': 4258,
 'Crime': 7925,
 'Comedy': 20218,
 'Adult': 1768,
 'Horror': 4533,
 'Animation': 7075}

In [27]:
sum(genres_edge.loc['tt0056014'])

2