# Imports

In [17]:
import numpy as np
import pandas as pd
import networkx as nx
import random
import matplotlib.pyplot as plt
import matplotlib as mpl
import ast

%matplotlib inline

In [18]:
# Replace matplotlib's stock look with a cleaner house style.
# mpl.rc accepts a tuple of group names, so settings shared by both axes are set once.
mpl.rc(('xtick', 'ytick'), labelsize=14, color="#222222")
mpl.rc(('xtick.major', 'ytick.major'), size=6, width=1)
mpl.rc(('xtick.minor', 'ytick.minor'), size=3, width=1)
# 'sans-serif' is not a valid keyword name, hence the dict unpacking
mpl.rc('font', family='sans-serif', size=16, **{'sans-serif': ['Arial']})
mpl.rc('axes', linewidth=1, edgecolor="#222222", labelcolor="#222222")
mpl.rc('text', usetex=False, color="#222222")

In [19]:
# Function Definitions

# Function to plot a degree distribution graph (P(k) vs k graph)
def plot_degree_dist(G):
    """Plot the degree distribution P(k) of ``G`` on log-log axes.

    Node degrees are histogrammed into 19 logarithmically spaced bins
    (20 bin edges), normalised to a density, and drawn as unconnected
    markers at the geometric centre of each bin on a new 6x4 inch figure.

    Parameters
    ----------
    G : graph-like
        Object exposing ``nodes()`` and ``degree(n)``, e.g. ``nx.Graph``.

    Returns
    -------
    None
        The figure is left as the current matplotlib figure.

    Raises
    ------
    ValueError
        If ``G`` has no nodes, or if no node has a positive degree; in
        both cases there is nothing that can be placed on a log axis.
    """
    degrees = [G.degree(n) for n in G.nodes()]
    if not degrees:
        raise ValueError(
            "plot_degree_dist: graph has no nodes, nothing to plot"
        )

    kmin = min(degrees)
    kmax = max(degrees)
    if kmax <= 0:
        # log10(0) would give -inf and produce invalid histogram bins
        raise ValueError(
            "plot_degree_dist: no node has a positive degree, "
            "cannot build logarithmic bins"
        )

    # Degree-0 nodes cannot sit on a log axis: when they exist, start the
    # bins at k = 1 (10**0) so that they are simply left out of the histogram.
    log_kmin = np.log10(kmin) if kmin > 0 else 0.0
    # The upper edge is deliberately one decade above kmax (as in the
    # original code), so the trailing bins are empty and do not show up.
    bin_edges = np.logspace(log_kmin, np.log10(kmax) + 1, num=20)
    density, _ = np.histogram(degrees, bins=bin_edges, density=True)

    # Geometric mean of neighbouring edges == midpoint of the bin in log space
    bin_centres = np.sqrt(bin_edges[1:] * bin_edges[:-1])

    _, ax = plt.subplots(figsize=(6, 4))
    ax.loglog(bin_centres, density, marker='o', linestyle='none')
    ax.set_xlabel(r"degree $k$", fontsize=16)
    ax.set_ylabel(r"$P(k)$", fontsize=16)

    # Keep only the left and bottom axes lines for a cleaner look
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

# 1. Testing the Kevin Bacon Theory

### 1.1 Reading the TMDB 5000 Movie Credits Dataset

This dataset has been collected from [Kaggle](https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata).

In [20]:
# Load the credits table and decode the serialised `cast` column into Python objects
df = (
    pd.read_csv('tmdb_5000_credits.csv')
    .assign(cast=lambda credits: credits['cast'].map(ast.literal_eval))
)
df.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'cast_id': 242, 'character': 'Jake Sully', '...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{'cast_id': 4, 'character': 'Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{'cast_id': 2, 'character': 'Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{'cast_id': 5, 'character': 'John Carter', 'c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


### 1.2 Constructing the network

In [21]:
# Empty undirected graph that this section is meant to populate.
# TODO: add the nodes and edges derived from `df`; as written the graph stays
# empty (the next cell prints 0 nodes and 0 edges).
G = nx.Graph()


In [22]:
# Report the size of the network: node count, then edge count (one per line)
print(G.number_of_nodes(), G.number_of_edges(), sep="\n")

# Is the actors' network a connected graph?
# (nx.is_connected raises NetworkXPointlessConcept on a graph without nodes,
#  so enable this only once G has been populated)
# print(nx.is_connected(G))

0
0


In [23]:
# Let's plot the degree distribution

# plot_degree_dist(G)

### 1.3 Randomly select some actors

### 1.4 Test the theory

### 1.5 Plotting the example above

In [29]:
# Try plotting the part of the network tested above

# 2. How does the Actor’s Network compare to Random Networks?

Hint: Since the graph here is not connected, you can find its connected components and use the largest one to compute the following properties:

* Degree Distribution
* Average Path Length
* Clustering Coefficient

Note: An obvious question is why it is relevant to analyze only the largest component. We'll go into more detail on this in the next class on *Scale-Free Networks*.

In [24]:
def get_largest_subgraph(G, copy=False):
    """Return the largest connected component of ``G`` as a subgraph.

    Parameters
    ----------
    G : networkx graph
        Undirected graph to analyse.
    copy : bool, optional
        If False (default, the original behaviour) the result is the
        subgraph *view* produced by ``G.subgraph``: it shares data with
        ``G`` and its structure cannot be modified. Pass True to get an
        independent, mutable copy, which is required before in-place
        rewiring such as ``nx.double_edge_swap``.

    Returns
    -------
    networkx graph
        Subgraph induced by the component with the most nodes; when several
        components tie, the first one yielded by
        ``nx.connected_components`` wins. If ``G`` has no nodes, an empty
        ``nx.Graph`` is returned.
    """
    # A connected component is connected by definition, so neither a full
    # sort of all components nor a per-component connectivity check is needed.
    largest = max(nx.connected_components(G), key=len, default=None)
    if largest is None:
        # Null graph as fallback when G has no components at all
        return nx.Graph()

    G_sub = G.subgraph(largest)
    return G_sub.copy() if copy else G_sub

In [25]:
# Get the largest subgraph

# Print the number of nodes and edges

In [26]:
# Plot the degree distribution

In [27]:
# Calculate the clustering coefficient and average path length



### 2.1 Compare the subgraph above to an ER network of same size

### 2.2 Now compare it to a degree preserving random network

Hint: Use degree-preserving randomisation (e.g., double edge swaps), similar to the exercises of the *Real World Network* class.