In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import itertools

In [2]:
#__________________
# Column renames (TMDB name -> IMDB-style name) applied to the movies frame by
# convert_to_original_format.
TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES = dict(
    budget='budget',
    genres='genres',
    revenue='gross',
    title='movie_title',
    runtime='duration',
    # it's possible that spoken_languages would be a better match
    original_language='language',
    keywords='plot_keywords',
    vote_count='num_voted_users',
)
def load_tmdb_movies(path):
    """Read the TMDB movies CSV at ``path`` and decode its packed columns.

    ``release_date`` is parsed and reduced to a plain date, the ``homepage``
    column is removed, and every JSON-encoded column is replaced by the
    decoded Python object.

    Returns the resulting DataFrame.
    """
    movies_frame = pd.read_csv(path)

    parsed_dates = pd.to_datetime(movies_frame['release_date'])
    movies_frame['release_date'] = parsed_dates.apply(lambda stamp: stamp.date())

    movies_frame = movies_frame.drop(columns="homepage")

    for name in ('genres', 'keywords', 'production_countries',
                 'production_companies', 'spoken_languages'):
        movies_frame[name] = movies_frame[name].apply(json.loads)
    return movies_frame
#____________________________
def load_tmdb_credits(path):
    """Read the TMDB credits CSV at ``path`` and JSON-decode ``cast`` and ``crew``.

    Returns the resulting DataFrame.
    """
    credits_frame = pd.read_csv(path)
    credits_frame['cast'] = credits_frame['cast'].apply(json.loads)
    credits_frame['crew'] = credits_frame['crew'].apply(json.loads)
    return credits_frame
#_______________________________________
def safe_access(container, index_values):
    """Follow a chain of indexes/keys into a nested container.

    Parameters
    ----------
    container : the outermost list/dict to index into.
    index_values : iterable of indexes or keys, applied left to right.

    Returns
    -------
    The value reached after applying every index, or ``np.nan`` when any step
    hits a missing list position (``IndexError``) or a missing dict key
    (``KeyError``).
    """
    result = container
    try:
        for idx in index_values:
            result = result[idx]
        return result
    # Both exception types must be listed as a tuple: `IndexError or KeyError`
    # evaluates to just `IndexError`, so missing keys used to propagate.
    except (IndexError, KeyError):
        # `pd.np` is no longer available in current pandas; use numpy directly.
        return np.nan
#_______________________________________     
# Rename map from the IMDB column name to its TMDB counterpart.
# NOTE(review): not referenced by any code visible in this notebook.
IMDB_COLUMNS_TO_REMAP = {'imdb_score': 'vote_average'}
#_______________________________________
def get_director(crew_data):
    """Return the name of the first crew entry whose ``job`` is ``'Director'``.

    ``crew_data`` is a list of dicts with ``name`` and ``job`` keys. When no
    director is listed, the result is whatever ``safe_access`` returns for a
    missing position.
    """
    director_names = []
    for member in crew_data:
        if member['job'] == 'Director':
            director_names.append(member['name'])
    return safe_access(director_names, [0])
#_______________________________________
def get_actors(cast_data):
    """Return the ``name`` of every entry in ``cast_data``, in list order.

    Each name is fetched through ``safe_access`` with ``[position, 'name']``.
    """
    return [
        safe_access(cast_data, [position, 'name'])
        for position in range(len(cast_data))
    ]
#_______________________________________
def pipe_flatten_names(keywords):
    """Return the ``name`` of every dict in ``keywords`` as a list of strings.

    Despite the function name, nothing is pipe-joined: the result is a plain
    list. The names are collected directly instead of being joined with ','
    and split again, because that round trip turned an empty input into
    ``['']`` and broke apart any name that itself contains a comma.
    """
    return [entry['name'] for entry in keywords]
#_______________________________________
def convert_to_original_format(movies, credits):
    """Reshape the TMDB movies/credits frames into the IMDB-style column layout.

    Parameters
    ----------
    movies : DataFrame with the columns produced by ``load_tmdb_movies``
        (JSON columns already decoded).
    credits : DataFrame with decoded ``cast`` and ``crew`` columns, as produced
        by ``load_tmdb_credits``.

    Returns
    -------
    A new DataFrame; ``movies`` itself is not modified.

    Notes
    -----
    * Columns derived from ``credits`` are assigned as Series, so pandas
      matches them to ``movies`` rows by index label. This assumes both frames
      list the same movies in the same order -- TODO confirm, no id is checked.
    * ``language`` is first created by renaming ``original_language`` and is
      then overwritten with the first spoken language.
    """
    tmdb_movies = movies.rename(columns=TMDB_TO_IMDB_SIMPLE_EQUIVALENCIES)
    tmdb_movies['title_year'] = pd.to_datetime(tmdb_movies['release_date']).apply(lambda x: x.year)
    # I'm assuming that the first production country is equivalent, but have not been able to validate this
    tmdb_movies['country'] = tmdb_movies['production_countries'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['language'] = tmdb_movies['spoken_languages'].apply(lambda x: safe_access(x, [0, 'name']))
    tmdb_movies['director_name'] = credits['crew'].apply(get_director)
    tmdb_movies['actors'] = credits['cast'].apply(get_actors)
    # Lists are zero-indexed: actor_1 is cast position 0, not 1. The previous
    # positions 1..3 skipped the first cast entry entirely.
    for rank in (1, 2, 3):
        tmdb_movies['actor_{}_name'.format(rank)] = credits['cast'].apply(
            lambda cast, position=rank - 1: safe_access(cast, [position, 'name']))
    tmdb_movies['genres'] = tmdb_movies['genres'].apply(pipe_flatten_names)
    tmdb_movies['plot_keywords'] = tmdb_movies['plot_keywords'].apply(pipe_flatten_names)
    return tmdb_movies

In [3]:
# Load both TMDB CSV exports; the paths are relative to the notebook's working directory.
credits=load_tmdb_credits("tmdb_5000_credits.csv")
movies=load_tmdb_movies("tmdb_5000_movies.csv")

In [4]:
# Combined movie table used by every actor-network cell below.
df = convert_to_original_format(movies, credits)



In [25]:
# Preview a subset of columns from the converted frame (displays the whole frame, truncated by pandas).
df[["budget","genres","plot_keywords","original_title","overview","popularity","production_companies","actors"]]

Unnamed: 0,budget,genres,plot_keywords,original_title,overview,popularity,production_companies,actors
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...",Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{'name': 'Ingenious Film Partners', 'id': 289...","[Sam Worthington, Zoe Saldana, Sigourney Weave..."
1,300000000,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{'name': 'Walt Disney Pictures', 'id': 2}, {'...","[Johnny Depp, Orlando Bloom, Keira Knightley, ..."
2,245000000,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...",Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R..."
3,250000000,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...",The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{'name': 'Legendary Pictures', 'id': 923}, {'...","[Christian Bale, Michael Caine, Gary Oldman, A..."
4,260000000,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...",John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[Taylor Kitsch, Lynn Collins, Samantha Morton,..."
...,...,...,...,...,...,...,...,...
4798,220000,"[Action, Crime, Thriller]","[united states–mexico barrier, legs, arms, pap...",El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{'name': 'Columbia Pictures', 'id': 5}]","[Carlos Gallardo, Jaime de Hoyos, Peter Marqua..."
4799,9000,"[Comedy, Romance]",[],Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],"[Edward Burns, Kerry Bishé, Marsha Dietlein, C..."
4800,0,"[Comedy, Drama, Romance, TV Movie]","[date, love at first sight, narration, investi...","Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{'name': 'Front Street Pictures', 'id': 3958}...","[Eric Mabius, Kristin Booth, Crystal Lowe, Geo..."
4801,0,[],[],Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],"[Daniel Henney, Eliza Coupe, Bill Paxton, Alan..."


In [6]:
# Release-year range and row counts.
# Series.min()/.max() skip missing years. The builtin min()/max() compare
# element by element, and every comparison against NaN is False, so their
# answer depends on where a NaN row happens to sit in the column.
# (title_year prints as a float, which suggests at least one missing date.)
print("Oldest movie:", df.title_year.min())
print("Newest movie:", df.title_year.max())
print(len(df.title_year))
print(len(df.title_year[df.title_year < 2015]))

Oldest movie: 1916.0
Newest movie: 2017.0
4803
4481


In [7]:
def actor_list(actors):
    """Return the distinct actor names found in any of the cast lists.

    Parameters
    ----------
    actors : iterable of cast lists (a pandas Series or a plain list); each
        element is itself an iterable of actor names.

    Returns
    -------
    list of unique names, in no particular order.

    The casts are iterated directly instead of being looked up as
    ``actors[0] .. actors[n-1]``; on a Series that lookup is by index label and
    fails once the index is no longer 0..n-1 (for example after filtering).
    """
    unique_names = set()
    for cast in actors:
        unique_names.update(cast)
    return list(unique_names)
# NOTE(review): this rebinds the name `actor_list` from the function to its result,
# so this line cannot be executed a second time without re-running the def above.
actor_list=actor_list(df.actors)

In [8]:
# Display every unique actor name (the full list is echoed as output).
actor_list

['Reece Thompson',
 'Ellen Greene',
 'Mark Blankfield',
 'Joseph Carberry',
 'Rory Mallinson',
 'Greg Tiernan',
 'Eva Marie Saint',
 'Jerod Mixon',
 'Jake Short',
 'John Leonhardt',
 'Cassandra Scerbo',
 'Keenan Wynn',
 'John David Bland',
 'Luke Adams',
 'Steve Fletcher',
 'Laura Orrico',
 'Jennifer Rubin',
 'Tom Coleman',
 'Steve Furst',
 'Matthew Ferguson',
 'Kenneth More',
 'Matteo Spears Satriano',
 'David Fine',
 'Brenda Vaccaro',
 'Buddy Bolton',
 'Joanna Merlin',
 'Dick Durock',
 'Oscar Pearce',
 'Alison Chand',
 'Paul Benedict',
 'Ellen Jacoby',
 'Eve Macklin',
 'Prestin Persson',
 'Shelley Hennig',
 'Claire Adams',
 'Hope Alexander-Willis',
 'Anna Mucha',
 'Micah Berry',
 'Carl M. Leviness',
 'Holly Robinson Peete',
 'Florence Henderson',
 'Selton Mello',
 'Nate Dushku',
 'Tony Leung Ka-Fai',
 'Giuseppe Andrews',
 'Conor Romero',
 'Corey Page',
 'Shannon Maliff',
 'Michael Jonsson',
 'J.P. Luebsen',
 'Manoela Scarpa Saldanha',
 'Dwight Sora',
 'Gwyn LaRee',
 'Julie Lauren',
 

In [9]:
def movie_list(actors, al=None):
    """Map every actor name to the positions of the movies they appear in.

    Parameters
    ----------
    actors : iterable of cast lists (pandas Series or list); position ``k`` in
        the iteration is the movie number recorded for that cast.
    al : unused; kept (now optional) so existing two-argument calls still work.

    Returns
    -------
    dict of ``name -> [movie positions]``, positions in ascending order.
    """
    movies_by_actor = {}
    for position, cast in enumerate(actors):
        for name in cast:
            # setdefault replaces the old bare `except:`, which caught every
            # exception type just to detect a name seen for the first time.
            movies_by_actor.setdefault(name, []).append(position)
    return movies_by_actor
# NOTE(review): rebinds `movie_list` from the function to the resulting dict,
# so this line only works once per definition of the function.
movie_list=movie_list(df.actors,actor_list)

In [10]:
# Display the full actor -> movie-row mapping.
movie_list

{'Sam Worthington': [0, 43, 132, 206, 599, 671, 787, 1146, 1448, 2147],
 'Zoe Saldana': [0,
  47,
  56,
  94,
  158,
  199,
  521,
  942,
  1178,
  1245,
  1372,
  1503,
  1873,
  1892,
  2060,
  2106,
  2350,
  2827,
  3044,
  3474],
 'Sigourney Weaver': [0,
  57,
  148,
  157,
  562,
  734,
  740,
  838,
  1008,
  1029,
  1053,
  1178,
  1217,
  1309,
  1425,
  1574,
  1605,
  1804,
  2103,
  2138,
  2153,
  2210,
  2244,
  2361,
  2391,
  2403,
  2500,
  2778,
  3056,
  3105,
  3158,
  3739,
  4703],
 'Stephen Lang': [0, 280, 855, 863, 1099, 1580, 1836, 1922, 1972, 3204, 4783],
 'Michelle Rodriguez': [0,
  44,
  99,
  177,
  204,
  574,
  582,
  657,
  1395,
  1602,
  1939,
  2229,
  2803],
 'Giovanni Ribisi': [0,
  273,
  280,
  448,
  484,
  544,
  607,
  616,
  628,
  747,
  903,
  973,
  1229,
  1853,
  2115,
  3173,
  3293,
  3321,
  3479,
  3738,
  3869,
  4010,
  4212,
  4619],
 'Joel David Moore': [0, 725, 1548, 1911, 3620, 4237],
 'CCH Pounder': [0, 296, 409, 442, 777, 2046

In [11]:
def actor_link(actors):
    """Return every unordered pair of co-stars, one pair per shared movie.

    Parameters
    ----------
    actors : iterable of cast lists (pandas Series or list).

    Returns
    -------
    list of 2-tuples ``(name_a, name_b)`` in the order
    ``itertools.combinations`` yields them, movie after movie. A pair that
    shares several movies appears once per movie.

    An entry that is not iterable is reported with ``print("error")`` and
    skipped. Only ``TypeError`` is caught now; the previous bare ``except:``
    also hid index-lookup failures, so a Series with a non-default index
    silently produced no links at all.
    """
    links = []
    for cast in actors:
        try:
            links.extend(itertools.combinations(cast, 2))
        except TypeError:
            print("error")
    return links
# Co-star pairs across all movies in df.
links=actor_link(df.actors)

In [12]:
# Display the full list of co-star pairs.
links

[('Sam Worthington', 'Zoe Saldana'),
 ('Sam Worthington', 'Sigourney Weaver'),
 ('Sam Worthington', 'Stephen Lang'),
 ('Sam Worthington', 'Michelle Rodriguez'),
 ('Sam Worthington', 'Giovanni Ribisi'),
 ('Sam Worthington', 'Joel David Moore'),
 ('Sam Worthington', 'CCH Pounder'),
 ('Sam Worthington', 'Wes Studi'),
 ('Sam Worthington', 'Laz Alonso'),
 ('Sam Worthington', 'Dileep Rao'),
 ('Sam Worthington', 'Matt Gerald'),
 ('Sam Worthington', 'Sean Anthony Moran'),
 ('Sam Worthington', 'Jason Whyte'),
 ('Sam Worthington', 'Scott Lawrence'),
 ('Sam Worthington', 'Kelly Kilgour'),
 ('Sam Worthington', 'James Patrick Pitt'),
 ('Sam Worthington', 'Sean Patrick Murphy'),
 ('Sam Worthington', 'Peter Dillon'),
 ('Sam Worthington', 'Kevin Dorman'),
 ('Sam Worthington', 'Kelson Henderson'),
 ('Sam Worthington', 'David Van Horn'),
 ('Sam Worthington', 'Jacob Tomuri'),
 ('Sam Worthington', 'Michael Blain-Rozgay'),
 ('Sam Worthington', 'Jon Curry'),
 ('Sam Worthington', 'Luke Hawker'),
 ('Sam Worth

In [13]:
# NOTE(review): networkx is first imported here rather than in the import cell at the top.
import networkx as nx
# Graph over all actors: nodes come from actor_list, edges from the co-star pairs in links.
G = nx.Graph()
G.add_nodes_from(actor_list)
G.add_edges_from(links)

In [None]:
import matplotlib.pyplot as plt
#nx.draw(G, with_labels=True, font_weight='bold')
#plt.show()

In [15]:
# Write the actor graph to a GEXF file in the working directory.
nx.write_gexf(G, "actors.gexf")

In [16]:
# Number of rows whose release year is before 2000.
print(len(df.actors[df.title_year<2000]))

1308


In [27]:
# Cast lists of movies released before 1985; the index is reset to 0..n-1.
actors_year=df.actors[df.title_year<1985].reset_index(drop=True)
def make_actor_link_small(actors):
    """Return every unordered pair of co-stars, one pair per shared movie.

    Parameters
    ----------
    actors : iterable of cast lists (pandas Series or list).

    Returns
    -------
    list of 2-tuples ``(name_a, name_b)``, in the order
    ``itertools.combinations`` yields them for each cast in turn.

    A non-iterable entry is reported with ``print("error")`` and skipped. The
    casts are iterated directly, so the input does not need a 0..n-1 index,
    and only ``TypeError`` is caught instead of every exception.
    """
    links = []
    for cast in actors:
        try:
            links.extend(itertools.combinations(cast, 2))
        except TypeError:
            print("error")
    return links
# Co-star pairs for the pre-1985 subset, followed by how many pairs were produced.
links_small=make_actor_link_small(actors_year)
print(len(links_small))

def make_actor_list_small(actors):
    """Return the distinct actor names found in any of the cast lists.

    Parameters
    ----------
    actors : iterable of cast lists (pandas Series or list).

    Returns
    -------
    list of unique names, in no particular order.

    The casts are iterated directly rather than looked up by ``actors[i]``,
    so the input does not need a 0..n-1 index.
    """
    unique_names = set()
    for cast in actors:
        unique_names.update(cast)
    return list(unique_names)
# Unique actor names for the pre-1985 subset.
actor_list_small=make_actor_list_small(actors_year)

# NOTE(review): rebinds the notebook-global G; the all-actors graph built earlier
# is no longer reachable through this name.
G = nx.Graph()
G.add_nodes_from(actor_list_small)
G.add_edges_from(links_small)
nx.write_gexf(G, "actors_tiny_1985.gexf")

215985


In [32]:
def make_timestamped_actor_links(actors, timestamp):
    """Return co-star pairs tagged with a time step.

    Parameters
    ----------
    actors : iterable of cast lists (pandas Series or list).
    timestamp : value rendered into the tag ``"timestamp=<timestamp>"``.

    Returns
    -------
    list of 3-tuples ``(name_a, name_b, "timestamp=<timestamp>")``, one per
    pair per movie. A non-iterable cast entry is reported with
    ``print("error")`` and skipped.
    """
    edge_tag = "timestamp=" + str(timestamp)
    links = []
    for cast in actors:
        try:
            links.extend((a, b, edge_tag) for a, b in itertools.combinations(cast, 2))
        except TypeError:
            print("error")
    return links


G = nx.MultiGraph()
for i in range(1, 5):
    # Movies with 2000 < year < 2000 + i. The window is empty for i == 1 and
    # grows by one year per step, so earlier years are included again.
    # NOTE(review): confirm the cumulative window is intended.
    in_window = (df.title_year > 2000) & (df.title_year < 2000 + i)
    actors_year = df.actors[in_window].reset_index(drop=True)

    # The tag carries the loop step `i`. Before, the helper was redefined on
    # every iteration and its own `for i in ...` shadowed the step, so the tag
    # held the movie's row number instead.
    # NOTE(review): networkx treats a non-dict third tuple element as the
    # multigraph edge key -- confirm that is the intended encoding.
    links_small = make_timestamped_actor_links(actors_year, i)
    print(len(links_small))

    actor_list_small = list(set(itertools.chain.from_iterable(actors_year)))

    G.add_nodes_from(actor_list_small)
    G.add_edges_from(links_small)

nx.write_gexf(G, "actors_2000.gexf")

0
75460
136849
195594


In [18]:
# Toy student table: one row per student, with the names of the students they
# cooperated with stored as a list in 'Cooperation'.
n = ["Max M.", "Joseph L.", "Katy L.", "Marco R.", "Susan S."]
c = [
    ["Joseph L."],
    ["Max M.", "Katy L.", "Susan S."],
    ["Joseph L."],
    [],
    ["Joseph L."],
]
# NOTE(review): "Maastircht" looks like a typo for "Maastricht"; left unchanged.
u = ["Leiden", "Leiden", "Maastircht", "Cambridge", "Amsterdam"]
g = [0, 1, 1, 0, 1]
l = ["english", "dutch", "english", "english", "dutch"]
a = [23, 22, 21, 20, 22]

# Column header -> values, in display order.
data = dict([
    ('Name', n),
    ('Cooperation', c),
    ('Bachelor', u),
    ('Grade above 8', g),
    ('Language', l),
    ('Age', a),
])

df_students = pd.DataFrame(data, columns=list(data))
df_students

Unnamed: 0,Name,Cooperation,Bachelor,Grade above 8,Language,Age
0,Max M.,[Joseph L.],Leiden,0,english,23
1,Joseph L.,"[Max M., Katy L., Susan S.]",Leiden,1,dutch,22
2,Katy L.,[Joseph L.],Maastircht,1,english,21
3,Marco R.,[],Cambridge,0,english,20
4,Susan S.,[Joseph L.],Amsterdam,1,dutch,22


In [19]:
# Tab-separated contact list without a header row; the column names are supplied here.
# `sep` is passed by keyword: pandas 2.x accepts only the path positionally.
students = pd.read_csv("full_school.csv", sep="\t", names=["t", "ci", "cj", "Ci", "Cj"], header=None)

In [20]:
# Display the contact table (columns t, ci, cj, Ci, Cj).
students

Unnamed: 0,t,ci,cj,Ci,Cj
0,31220,1558,1567,3B,3B
1,31220,1560,1570,3B,3B
2,31220,1567,1574,3B,3B
3,31220,1632,1818,4B,4B
4,31220,1632,1866,4B,4B
...,...,...,...,...,...
106712,137100,1579,1709,3B,Teachers
106713,137100,1579,1719,3B,3A
106714,137100,1592,1835,5B,4B
106715,137100,1594,1700,3B,3B


In [21]:
def make_edge_list(df):
    """Return one ``(ci, cj)`` tuple per row of ``df``, in row order.

    Parameters
    ----------
    df : DataFrame with columns ``ci`` and ``cj``.

    The two columns are zipped instead of being read one cell at a time with
    ``df.ci[i]``. That lookup is by index label, so it failed on frames whose
    index is not 0..n-1, and it is slow for large frames.
    """
    return list(zip(df["ci"], df["cj"]))
def make_node_list(df):
    """Return the distinct values found in the ``ci`` and ``cj`` columns.

    Parameters
    ----------
    df : DataFrame with columns ``ci`` and ``cj``.

    Returns
    -------
    list of unique values, in no particular order.

    The columns are iterated directly rather than read with ``df.ci[i]``, so
    the frame does not need a 0..n-1 index.
    """
    students = set()
    for first, second in zip(df["ci"], df["cj"]):
        students.add(first)
        students.add(second)
    return list(students)
# N: unique ids from the ci/cj columns; V: one (ci, cj) pair per contact row.
# NOTE(review): V holds edges here, although V conventionally names the vertex set.
N=make_node_list(students)
V=make_edge_list(students)

In [22]:
# Number of distinct ids, then number of contact rows.
print(len(N))
print(len(V))

242
106717


In [23]:
# Contact graph over the ids in N with the pairs in V, written to GEXF.
# NOTE(review): rebinds the notebook-global G used for the actor graphs above.
G = nx.Graph()
G.add_nodes_from(N)
G.add_edges_from(V)
nx.write_gexf(G, "students.gexf")