In [None]:
!pip install scikit-network
import pandas as pd
import numpy as np
import sknetwork.clustering
import sknetwork.utils
from scipy.sparse import csr_matrix

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-network
  Downloading scikit_network-0.26.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.1 MB)
[K     |████████████████████████████████| 8.1 MB 4.3 MB/s 
Collecting scipy>=1.6.3
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 1.2 MB/s 
[?25hInstalling collected packages: scipy, scikit-network
  Attempting uninstall: scipy
    Found existing installation: scipy 1.4.1
    Uninstalling scipy-1.4.1:
      Successfully uninstalled scipy-1.4.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
Successfully installed scikit-network-0.

In [None]:
# Download the data
# Remove any .tsv.gz files already present in the working directory before downloading.
!rm -f *.tsv.gz
# Fetch the four dataset dumps from datasets.imdbws.com (-q silences wget's progress output).
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
# List the working directory so the downloaded files and their sizes are visible.
!ls -la

total 1038848
drwxr-xr-x 1 root root      4096 Jun 22 18:21 .
drwxr-xr-x 1 root root      4096 Jun 22 18:19 ..
drwxr-xr-x 4 root root      4096 Jun 15 13:41 .config
-rw-r--r-- 1 root root 229323596 Jun 21 13:21 name.basics.tsv.gz
drwxr-xr-x 1 root root      4096 Jun 15 13:42 sample_data
-rw-r--r-- 1 root root 274787534 Jun 21 13:21 title.akas.tsv.gz
-rw-r--r-- 1 root root 157548376 Jun 21 13:21 title.basics.tsv.gz
-rw-r--r-- 1 root root 402098571 Jun 21 13:21 title.principals.tsv.gz


In [None]:
# Load the titles, indexed by tconst.
# usecols restricts parsing to the columns that are actually kept, so the
# unused columns of this large file are never materialised in memory.
title = pd.read_csv(
    'title.basics.tsv.gz',
    sep='\t',
    usecols=['tconst', 'titleType', 'primaryTitle', 'startYear'],
    low_memory=False,
).set_index('tconst')[['titleType', 'primaryTitle', 'startYear']]
title.head()

Unnamed: 0_level_0,titleType,primaryTitle,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0000001,short,Carmencita,1894
tt0000002,short,Le clown et ses chiens,1892
tt0000003,short,Pauvre Pierrot,1892
tt0000004,short,Un bon bock,1892
tt0000005,short,Blacksmith Scene,1893


In [None]:
# Load the cast of each film.
# usecols restricts parsing to the three columns that are kept, instead of
# reading every column of the file and discarding the rest afterwards.
cast = pd.read_csv(
    'title.principals.tsv.gz',
    sep='\t',
    usecols=['tconst', 'nconst', 'category'],
)[['tconst', 'nconst', 'category']]
# Only consider actors, not directors, composers, etc. Shrinks data to about 40%
cast = cast[cast.category.isin({'actor', 'actress'})]
cast.head()

Unnamed: 0,tconst,nconst,category
11,tt0000005,nm0443482,actor
12,tt0000005,nm0653042,actor
16,tt0000007,nm0179163,actor
17,tt0000007,nm0183947,actor
21,tt0000008,nm0653028,actor


In [None]:
# Keep feature films only (drops TV series, shorts, etc.). Shrinks data to ~5%
movies = title.loc[title['titleType'].eq('movie')]
# Restrict the cast rows to titles that survived the movie filter
cast = cast.loc[cast['tconst'].isin(movies.index)]
# Preview of the (title, actor) edges that make up the network
cast.head()

Unnamed: 0,tconst,nconst,category
850,tt0000502,nm0215752,actor
851,tt0000502,nm0252720,actor
1042,tt0000574,nm0846887,actress
1043,tt0000574,nm0846894,actor
1044,tt0000574,nm1431224,actor


In [None]:
# Explore the regions we have data for (e.g. IN, US, etc)
# - usecols: only titleId and region are needed, so the other columns of this
#   large file are not parsed.
# - na_values='\\N': treat the placeholder as missing, matching how the
#   name.basics file is loaded in this notebook.
# - keep_default_na=False: pandas would otherwise read the literal region code
#   'NA' as a missing value and silently drop that region.
region = pd.read_csv(
    'title.akas.tsv.gz',
    sep='\t',
    usecols=['titleId', 'region'],
    na_values='\\N',
    keep_default_na=False,
    low_memory=False,
).set_index('titleId')['region']
region.value_counts().head(10)

In [None]:
import pandas as pd

In [None]:
# Load the name data along with birth year, indexed by nconst.
# usecols restricts parsing to the columns that are kept.
name = pd.read_csv(
    'name.basics.tsv.gz',
    sep='\t',
    usecols=['nconst', 'primaryName', 'birthYear'],
    na_values='\\N',
    dtype={'birthYear': float},
).set_index('nconst')[['primaryName', 'birthYear']]
# Number of rows each person has in `cast` (i.e. how many titles they appear in)
name_freq = cast['nconst'].value_counts()

In [None]:
def get_pairs(lang=None, min_acted=25, min_pairings=4):
    '''
    Returns a co-acting edge list and the matching actor table where:
    - Each actor has acted in at least min_acted films (counted in the
      notebook-level `name_freq`, i.e. across all titles in `cast`)
    - The two actors have acted together in at least min_pairings films
    - And (optionally), only titles listed under region `lang` (IN, US, etc)
      in the notebook-level `region` series are considered

    Returns a tuple (pairs, actors):
    - pairs: DataFrame with columns `row`, `col` (positions into `actors`)
      and `n` (number of shared films). Every pair appears in both
      directions; an actor is never paired with themselves.
    - actors: the rows of the notebook-level `name` table for the selected
      actors, ordered so that position i corresponds to code i in `pairs`.
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    top_names = name_freq[name_freq >= min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    # One row per (title, actor): csr_matrix sums duplicate coordinates, so a
    # person listed twice for the same title would otherwise be counted twice.
    appearances = top_actors[['tconst', 'nconst']].drop_duplicates()
    titles = appearances['tconst'].astype('category')
    actors = appearances['nconst'].astype('category')

    row = titles.cat.codes.to_numpy()
    col = actors.cat.codes.to_numpy()
    data = np.ones(len(appearances), dtype='int')
    # Explicit shape: without it, an empty selection cannot be turned into a
    # matrix because the dimensions cannot be inferred from empty indices.
    shape = (len(titles.cat.categories), len(actors.cat.categories))

    # title x actor incidence matrix; (actor x title) @ (title x actor) gives
    # the number of shared titles for every actor pair.
    matrix = csr_matrix((data, (row, col)), shape=shape)
    square = (matrix.T @ matrix).tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    # Drop the diagonal (an actor with themselves) explicitly so it is
    # excluded whatever value min_pairings has.
    keep = (pairs['row'] != pairs['col']) & (pairs['n'] >= min_pairings)
    pairs = pairs[keep].reset_index(drop=True)
    return pairs, name.reindex(actors.cat.categories)

def lookup(pairs, cat):
    '''
    Turns an edge list of actor positions into a readable table.

    - pairs: DataFrame with at least the columns `row`, `col` (positions into
      `cat`) and `n` (shared film count). Any other columns are ignored, and
      the frame is not modified.
    - cat: actor table whose first column is the name and second column the
      birth year, in the order the positions in `pairs` refer to.

    Returns a DataFrame with columns count, name1, year1, name2, year2,
    carrying the index of `pairs` and sorted by count, highest first.
    '''
    # Positional lookups converted to plain arrays: this avoids relying on
    # `pairs` having a default 0..n-1 index for the columns to line up.
    first = cat.iloc[pairs['row'].to_numpy()]
    second = cat.iloc[pairs['col'].to_numpy()]
    named = pd.DataFrame({
        'count': pairs['n'].to_numpy(),
        'name1': first.iloc[:, 0].to_numpy(),
        'year1': first.iloc[:, 1].to_numpy(),
        'name2': second.iloc[:, 0].to_numpy(),
        'year2': second.iloc[:, 1].to_numpy(),
    }, index=pairs.index)
    return named.sort_values('count', ascending=False)

In [None]:
# Co-acting pairs for region IN: actors with 3+ films, any shared film counts
pairs_in, cat_in = get_pairs('IN', min_acted=3, min_pairings=1)
lookup(pairs=pairs_in, cat=cat_in)

In [None]:
# Co-acting pairs for region US: actors with 3+ films, any shared film counts
pairs_us, cat_us = get_pairs('US', min_acted=3, min_pairings=1)
lookup(pairs=pairs_us, cat=cat_us)

In [None]:
# Raw edge list for IN as returned by get_pairs: `row`/`col` are positions
# into cat_in and `n` is the number of films the two actors share.
pairs_in

In [None]:
# Cluster the IN co-acting network with Louvain community detection.
algo = sknetwork.clustering.Louvain()
# Build the weighted adjacency matrix from the three edge columns only, with
# an explicit shape: every actor in cat_in then receives a label (even one at
# the end of the table without any pair), and extra columns on pairs_in
# cannot leak into the edge list.
adjacency = csr_matrix(
    (
        pairs_in['n'].to_numpy(),
        (pairs_in['row'].to_numpy(), pairs_in['col'].to_numpy()),
    ),
    shape=(len(cat_in), len(cat_in)),
)
# Read the labels from the fitted estimator instead of relying on the return
# value of fit_transform.
# NOTE(review): fit_transform's return value appears to differ between
# scikit-network releases; confirm against the installed version.
algo.fit(adjacency)
labels = algo.labels_

# One row per actor: id (`index`), name, birth year, cluster label and the
# number of films they appear in (`freq`, looked up in name_freq).
clusters_in = cat_in.reset_index().assign(cluster=labels)
clusters_in['freq'] = clusters_in['index'].map(name_freq)
clusters_in


In [None]:
# Members of cluster 0 with the highest film counts (top 20 by `freq`)
clusters_in.loc[clusters_in['cluster'].eq(0)].sort_values(by='freq', ascending=False).head(20)

In [None]:
# Members of cluster 1 with the highest film counts (top 20 by `freq`)
clusters_in.loc[clusters_in['cluster'].eq(1)].sort_values(by='freq', ascending=False).head(20)

In [None]:
# Members of cluster 2 with the highest film counts (top 20 by `freq`)
clusters_in.loc[clusters_in['cluster'].eq(2)].sort_values(by='freq', ascending=False).head(20)

In [None]:
# Members of cluster 3 with the highest film counts (top 20 by `freq`)
clusters_in.loc[clusters_in['cluster'].eq(3)].sort_values(by='freq', ascending=False).head(20)

In [None]:
def connectedness(clusters, pairs, cat, title_counts=None):
    '''
    Measures how strongly each actor's pairings stay inside their own cluster.

    - clusters: DataFrame with a `cluster` column, one row per actor, in the
      same positional order as `cat`
    - pairs: edge list with columns `row`, `col` (positions into `cat`) and
      `n` (shared film count). It is not modified.
    - cat: actor table indexed by actor id, with a `primaryName` column
    - title_counts: optional mapping/Series from actor id to number of films.
      Defaults to the notebook-level `name_freq`.

    Returns a tuple (connectedness, coclusters):
    - connectedness: one row per actor that has at least one pair, with the
      columns primaryName, cluster, nclusters, titles, pairings, incluster,
      sorted by incluster ascending
    - coclusters: one row per actor and one column per cluster, holding the
      summed `n` of that actor's pairings with members of that cluster
    '''
    if title_counts is None:
        # Fall back to the per-actor film counts computed earlier in the notebook.
        title_counts = name_freq

    # Attach the cluster of both endpoints by position, on a new frame: the
    # caller's `pairs` keeps its original columns, so cells that consume the
    # edge list can still be re-run after this function.
    cluster_of = clusters['cluster'].to_numpy()
    pairs = pairs.assign(
        rowcluster=cluster_of[pairs['row'].to_numpy()],
        colcluster=cluster_of[pairs['col'].to_numpy()],
    )

    stats, coclusters = {}, {}
    for index, costars in pairs.groupby('row'):
        actor_id = cat.index[index]
        # Total pairings of this actor with each cluster
        coclusters[actor_id] = clusterdist = costars.groupby('colcluster')['n'].sum()
        selfcluster = costars.rowcluster.iloc[0]
        stats[actor_id] = {
            'primaryName': cat.primaryName.iloc[index],
            # Which cluster do they belong to
            'cluster': selfcluster,
            # No of clusters they've acted at least 5 times with
            'nclusters': (clusterdist >= 5).sum(),
            # No of films they've acted in
            'titles': title_counts[actor_id],
            # No of pairings they've had with other stars
            'pairings': clusterdist.sum(),
            # Share of pairings that are with members of their own cluster
            'incluster': clusterdist.get(selfcluster, 0) / clusterdist.sum()
        }
    coclusters = pd.DataFrame(coclusters).T
    stats = pd.DataFrame(stats).T
    return stats.sort_values('incluster'), coclusters

In [None]:
# Per-actor cluster connectivity for the IN network
connected_in, coclusters_in = connectedness(clusters=clusters_in, pairs=pairs_in, cat=cat_in)

In [None]:
# Big crossover actors in IN: more than 50 titles, lowest share of
# pairings inside their own cluster first
connected_in.loc[connected_in['titles'] > 50].sort_values(by='incluster').head(20)