In [2]:
import pandas as pd
import numpy as np
from itertools import combinations 
import networkx as nx
import time

In [3]:
# Alternate-title table, indexed by title id (ids may repeat, one row per alternate title).
title_akas = pd.read_csv(
    'title.akas.tsv',
    sep='\t',
    usecols=['titleId', 'language', 'title', 'isOriginalTitle'],
    index_col='titleId',
    low_memory=False,
)

# Ids of every title that has at least one alternate title tagged with language 'en'.
is_english = title_akas['language'] == 'en'
req = set(title_akas[is_english].index)

# Per-title table (type, primary title, comma-separated genres), indexed by tconst.
title_basics = pd.read_csv(
    'title.basics.tsv',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'genres'],
    index_col='tconst',
    low_memory=False,
)



In [4]:
#Featurize genres
# Restrict title_basics to the ids in `req`. `req` is a set, so it is sorted first to give
# the rows a reproducible order (the order later decides how each edge tuple is oriented).
node_data = title_basics.reindex(sorted(req))
node_data['genres'] = node_data['genres'].str.split(',')   #Convert genres to lists
# reindex() inserts an all-NaN row for every id in `req` that is absent from title_basics;
# drop those, together with any row that has another missing field.
node_data = node_data.dropna()

# One frozenset of genre names per title, then one 0/1 indicator column per genre.
genres_edge = node_data['genres'].apply(frozenset).to_frame(name='genre')
# frozenset().union(...) also works when the frame is empty; sorted() fixes the column order.
for genre in sorted(frozenset().union(*genres_edge['genre'])):
    # Column-wise map: same values as a row-wise DataFrame.apply(axis=1), without building
    # a Series object for every row on every genre.
    genres_edge[genre] = genres_edge['genre'].map(lambda members: int(genre in members))


In [5]:
#Create Graph and add nodes
# One node per row of node_data, keyed by the row's index label; the primary title is
# stored on the node under the 'data' attribute.
G = nx.Graph()
G.add_nodes_from(
    (title_id, {'data': primary_title})
    for title_id, primary_title in node_data['primaryTitle'].items()
)

In [6]:
# Genres handled in this first pass: every pair of titles that shares one of them becomes
# an edge candidate.
# NOTE(review): '\\N' looks like the dataset's missing-value marker, which would link all
# titles without genre information to each other -- confirm that this is intended.
genres_for_edges = [
    'Game-Show',
    'Talk-Show',
    'Reality-TV',
    'Film-Noir',
    'News',
    'Western',
    'Sport',
    'Musical',
    'Music',
    'Adult',
    'Biography',
    'Sci-Fi',
    'War',
    'History',
    'Mystery',
    'Fantasy',
    'Family',
    'Horror',
    'Thriller',
    'Animation',
    'Adventure',
    'Crime',
    '\\N',
    'Romance',
]

num = 0        # pairs generated over all genres (a pair sharing k listed genres counts k times)
edges = set()  # unique (title_id, title_id) pairs
for genre_name in genres_for_edges:
    t0 = time.time()
    members = genres_edge[genres_edge[genre_name] == 1].index
    # set.update() consumes the pair iterator directly instead of a per-pair Python loop.
    edges.update(combinations(members, 2))
    num += len(members) * (len(members) - 1) // 2   # C(n, 2) pairs were just generated
    t1 = time.time()
    # Prints: genre : number of titles : seconds spent on this genre : wall-clock timestamp.
    # Series.sum() is vectorised; the builtin sum() iterates the column element by element.
    print(genre_name, ":", genres_edge[genre_name].sum(), ":", t1 - t0, ':', t1)

# Report the size of the collected edge set. In the visible cells, G only receives its
# edges in a later cell, so len(G.edges) would not reflect the pairs gathered here.
print(len(edges))

# G is an undirected nx.Graph, so n nodes allow at most n * (n - 1) / 2 distinct edges.
num_nodes = len(node_data.index)
print(num_nodes * (num_nodes - 1) // 2) #Max number of edges possible in the graph

Game-Show : 130 : 0.03185725212097168 : 1549070901.194689
Talk-Show : 247 : 0.017859220504760742 : 1549070901.21523
Reality-TV : 282 : 0.0237729549407959 : 1549070901.241808
Film-Noir : 330 : 0.034416913986206055 : 1549070901.279688
News : 382 : 0.0540318489074707 : 1549070901.336573
Western : 1036 : 0.3412618637084961 : 1549070901.680456
Sport : 1087 : 0.3640720844268799 : 1549070902.04706
Musical : 1539 : 0.8087611198425293 : 1549070902.858763
Music : 1764 : 1.0808608531951904 : 1549070903.9424179
Adult : 1768 : 1.420867681503296 : 1549070905.36596
Biography : 2392 : 1.9522721767425537 : 1549070907.320854
Sci-Fi : 2401 : 2.931020975112915 : 1549070910.254596
War : 2667 : 2.295253038406372 : 1549070912.552657
History : 3032 : 3.429047107696533 : 1549070915.98417
Mystery : 3424 : 6.6676061153411865 : 1549070922.6547441
Fantasy : 4117 : 6.073134899139404 : 1549070928.7346718
Family : 4258 : 6.918212175369263 : 1549070935.6558
Horror : 4533 : 13.90179705619812 : 1549070949.560142
Thrille

In [7]:
# Number of unique title pairs collected in `edges` so far.
len(edges)
    

245374025

In [8]:
# Second pass over the remaining genres; pairs are added to the existing `edges` set.
genres_for_edges = ['Action', 'Documentary', 'Short']

num = 0   # restarted here: counts only the pairs generated in this pass
for genre_name in genres_for_edges:
    t0 = time.time()
    has_genre = genres_edge[genre_name] == 1
    for pair in combinations(genres_edge[has_genre].index, 2):
        edges.add(pair)
        num += 1
    t1 = time.time()
    # Prints: genre : number of titles : seconds spent on this genre : wall-clock timestamp.
    print(genre_name, ":", sum(genres_edge[genre_name]), ":", t1 - t0, ':', t1)


KeyboardInterrupt: 

In [7]:
#Begin Graph

# Add every collected (title_id, title_id) pair as an undirected edge. add_edges_from()
# iterates the set directly, so no list copy of the whole edge set is created first.
G.add_edges_from(edges)

In [10]:
# Small check of how nx.Graph treats a repeated edge: (1, 2) is added twice on purpose.
T = nx.Graph()   # a freshly created graph is already empty
T.add_nodes_from([1, 2, 3])
T.add_edges_from([(1, 2), (1, 2), (1, 3)])
len(T.edges)

2