In [1]:
!pip install scikit-network==0.24.0
import pandas as pd
import numpy as np
import sknetwork.clustering
import sknetwork.utils
from scipy.sparse import csr_matrix

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-network==0.24.0
  Downloading scikit_network-0.24.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 32.8 MB/s 
Installing collected packages: scikit-network
Successfully installed scikit-network-0.24.0


In [2]:
# Download the data
!rm -f *.tsv.gz
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
!ls -la

total 1059772
drwxr-xr-x 1 root root      4096 Oct 18 15:13 .
drwxr-xr-x 1 root root      4096 Oct 18 15:12 ..
drwxr-xr-x 4 root root      4096 Oct 14 19:04 .config
-rw-r--r-- 1 root root 235082849 Oct 18 13:23 name.basics.tsv.gz
drwxr-xr-x 1 root root      4096 Oct 14 19:05 sample_data
-rw-r--r-- 1 root root 284773528 Oct 18 13:23 title.akas.tsv.gz
-rw-r--r-- 1 root root 151270330 Oct 17 13:24 title.basics.tsv.gz
-rw-r--r-- 1 root root 414049088 Oct 17 13:24 title.principals.tsv.gz


In [3]:
# Load the titles
title = pd.read_csv('title.basics.tsv.gz', sep='\t',usecols=['tconst','titleType', 'startYear'], dtype={
    'tconst':'str',
    'titleType':'str',
    'primaryTitle':'str',
    'startYear': 'Int64'
}, na_values='\\N').set_index('tconst')

title.head()

Unnamed: 0_level_0,titleType,startYear
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0000001,short,1894
tt0000002,short,1892
tt0000003,short,1892
tt0000004,short,1892
tt0000005,short,1893


In [4]:
# Load the cast of each film
cast = pd.read_csv('title.principals.tsv.gz',usecols=['tconst', 'nconst', 'category'], sep='\t')
# Only consider actors; shrinks data to about 40%
cast = cast[cast.category.isin({'actor', 'actress'})]
cast.head()

Unnamed: 0,tconst,nconst,category
11,tt0000005,nm0443482,actor
12,tt0000005,nm0653042,actor
16,tt0000007,nm0179163,actor
17,tt0000007,nm0183947,actor
21,tt0000008,nm0653028,actor


In [5]:
# Only consider movies
movies = title[(title['titleType'] == 'movie') & (title.startYear >2004)]
cast = cast[cast['tconst'].isin(movies.index)]

cast.head()

Unnamed: 0,tconst,nconst,category
80688,tt0011801,nm0459029,actor
80689,tt0011801,nm0681726,actor
80690,tt0011801,nm0692612,actress
80691,tt0011801,nm0726256,actor
80692,tt0011801,nm0776458,actor


In [6]:
region = pd.read_csv('title.akas.tsv.gz', sep='\t', usecols=['titleId',"region"], dtype={"region":'str'}).set_index('titleId')
region = region[region.region.isin({'IN'})]
region.value_counts().head(10)

region
IN        3933771
dtype: int64

In [7]:
import pandas as pd

In [10]:
name = pd.read_csv('name.basics.tsv.gz', sep='\t', na_values='\\N', dtype={'birthYear': float}).set_index('nconst')[['primaryName', 'birthYear']]
name_freq = cast['nconst'].value_counts()

In [11]:
def get_pairs(lang=None, min_acted=25, min_pairings=4):
    '''
    Returns an adjacency matrix and actor mapping of actor pairs where:
    - Each actor has acted in at least min_acted films
    - The two actors have acted together in at least min_pairings films
    - And (optionally), belong to a region `lang` (IN, UN, etc)
    '''
    graph = cast
    if lang is not None:
        graph = graph[graph['tconst'].isin(region[region == lang].index)]
    top_names = name_freq[name_freq >= min_acted]
    top_actors = graph[graph['nconst'].isin(top_names.index)]

    p = top_actors.copy()
    p['title'] = p['tconst'].astype('category')
    p['name'] = p['nconst'].astype('category')

    row = p['title'].cat.codes.values
    col = p['name'].cat.codes.values
    data = np.ones(len(p), dtype='int')

    matrix = csr_matrix((data, (row, col)))
    square = matrix.T * matrix
    square.setdiag(0)
    square = square.tocoo()

    pairs = pd.DataFrame({
        'row': square.row,
        'col': square.col,
        'n': square.data
    })
    pairs = pairs[pairs.n >= min_pairings].reset_index(drop=True)
    return pairs, name.reindex(p['name'].cat.categories)

def lookup(pairs, cat):
    pairs = pd.concat([
        pairs,
        cat.iloc[pairs.row].reset_index(drop=True),
        cat.iloc[pairs.col].reset_index(drop=True),
    ], axis=1)
    pairs = pairs.drop(columns=['row', 'col'])
    pairs.columns = ['count', 'name1', 'year1', 'name2', 'year2']
    return pairs.sort_values('count', ascending=False)

In [12]:
pairs_in, cat_in = get_pairs(lang='IN', min_acted=3, min_pairings=1)
lookup(pairs_in, cat_in)

Unnamed: 0,count,name1,year1,name2,year2
17981,32,Brahmanandam,1956.0,Mohammad Ali,1968.0
23993,32,Mohammad Ali,1968.0,Brahmanandam,1956.0
83521,20,Brahmanandam,1956.0,Krishna Bhagavan,
88942,20,Mohammad Ali,1968.0,Raghu Babu,
17957,20,Raghu Babu,,Mohammad Ali,1968.0
...,...,...,...,...,...
70191,1,Kishori Ballal,,Amin Hajee,
70192,1,Sachin Deshpande,,Amin Hajee,
70193,1,Celina Jaitly,1981.0,Amin Hajee,
70194,1,Amrita Arora,1981.0,Amin Hajee,


In [13]:
pairs_us, cat_us = get_pairs(lang='US', min_acted=3, min_pairings=1)
lookup(pairs_us, cat_us)

Unnamed: 0,count,name1,year1,name2,year2
17981,32,Brahmanandam,1956.0,Mohammad Ali,1968.0
23993,32,Mohammad Ali,1968.0,Brahmanandam,1956.0
83521,20,Brahmanandam,1956.0,Krishna Bhagavan,
88942,20,Mohammad Ali,1968.0,Raghu Babu,
17957,20,Raghu Babu,,Mohammad Ali,1968.0
...,...,...,...,...,...
70191,1,Kishori Ballal,,Amin Hajee,
70192,1,Sachin Deshpande,,Amin Hajee,
70193,1,Celina Jaitly,1981.0,Amin Hajee,
70194,1,Amrita Arora,1981.0,Amin Hajee,


In [15]:
pairs_in.head()

Unnamed: 0,row,col,n
0,7748,0,1
1,1685,0,1
2,290,0,1
3,5130,0,1
4,2676,0,1


In [16]:
algo = sknetwork.clustering.Louvain()
adjacency = sknetwork.utils.edgelist2adjacency(pairs_in)
labels = algo.fit_transform(adjacency)
clusters_in = pd.concat([
    cat_in.reset_index(),
    pd.Series(labels, name='cluster')], axis=1)

clusters_in = pd.concat([
    cat_in.reset_index(),
    pd.Series(labels, name='cluster'),
    pd.Series(clusters_in['index'].map(name_freq), name='freq'),
], axis=1)
clusters_in


Unnamed: 0,index,primaryName,birthYear,cluster,freq
0,nm0000002,Lauren Bacall,1924.0,0,3
1,nm0000056,Paul Newman,1925.0,0,3
2,nm0000084,Gong Li,1965.0,11,10
3,nm0000090,Armin Mueller-Stahl,1930.0,0,7
4,nm0000092,John Cleese,1939.0,0,17
...,...,...,...,...,...
19677,nm9969854,Rekha,,2,8
19678,nm9972198,Caylin Turner,,4,3
19679,nm9986430,Sebastian Cabanas,,14,3
19680,nm9988815,Nat Kitcharit,,13,3


In [17]:
clusters_in[clusters_in['cluster']==0].sort_values('freq', ascending=False).head(20)

Unnamed: 0,index,primaryName,birthYear,cluster,freq
709,nm0001803,Danny Trejo,1944.0,0,101
2687,nm0290556,James Franco,1978.0,0,60
23,nm0000115,Nicolas Cage,1964.0,0,60
125,nm0000246,Bruce Willis,1955.0,0,58
233,nm0000448,Lance Henriksen,1940.0,0,55
217,nm0000418,Danny Glover,1946.0,0,55
182,nm0000353,Willem Dafoe,1955.0,0,55
2902,nm0332709,Olivier Gourmet,1963.0,0,54
1194,nm0023832,Mathieu Amalric,1965.0,0,52
63,nm0000168,Samuel L. Jackson,1948.0,0,50


In [18]:
clusters_in[clusters_in['cluster']==1].sort_values('freq', ascending=False).head(20)

Unnamed: 0,index,primaryName,birthYear,cluster,freq
3565,nm0457410,Ravi Kishan,1971.0,1,114
3530,nm0451600,Anupam Kher,1955.0,1,76
3649,nm0474774,Akshay Kumar,1967.0,1,72
398,nm0000821,Amitabh Bachchan,1942.0,1,70
1928,nm0149822,Mithun Chakraborty,1950.0,1,68
1068,nm0006763,Jackie Shroff,1957.0,1,66
1078,nm0007106,Shakti Kapoor,1952.0,1,66
5210,nm0792116,Jimmy Shergill,1970.0,1,62
797,nm0004109,Gulshan Grover,1955.0,1,59
2337,nm0222426,Ajay Devgn,1969.0,1,58


In [19]:
clusters_in[clusters_in['cluster']==2].sort_values('freq', ascending=False).head(20)

Unnamed: 0,index,primaryName,birthYear,cluster,freq
1696,nm0103977,Brahmanandam,1956.0,2,264
4742,nm0695177,Prakash Raj,1965.0,2,159
1170,nm0019382,Mohammad Ali,1968.0,2,128
4388,nm0621937,Nassar,1958.0,2,124
1541,nm0080238,Tanikella Bharani,1954.0,2,98
17391,nm6489058,Yogi Babu,1985.0,2,92
8071,nm1298052,Raghu Babu,,2,84
10697,nm2128968,'Ganja' Karuppu,,2,80
5682,nm0896573,Ashish Vidyarthi,1962.0,2,78
10708,nm2132667,M.S. Bhaskar,,2,74


In [20]:
clusters_in[clusters_in['cluster']==3].sort_values('freq', ascending=False).head(20)

Unnamed: 0,index,primaryName,birthYear,cluster,freq
1328,nm0043199,Avinash,,3,147
12060,nm2794335,Sadhu Kokila,,3,144
9433,nm1679254,Rangayana Raghu,,3,133
16667,nm5724719,Achyuth Kumar,1966.0,3,97
9490,nm1693209,Ramesh Bhat,,3,97
4349,nm0619047,Anant Nag,1948.0,3,81
9449,nm1682769,Bank Janardhan,,3,66
16184,nm5308603,Bullet Prakash,,3,64
3653,nm0474871,Sai Kumar,,3,62
16632,nm5701208,Sharath Lohitashwa,,3,52


In [21]:
def connectedness(clusters, pairs, cat):
    pairs['rowcluster'] = clusters.iloc[pairs.row].cluster.reset_index(drop=True)
    pairs['colcluster'] = clusters.iloc[pairs.col].cluster.reset_index(drop=True)

    connectedness, coclusters = {}, {}
    for index, costars in pairs.groupby('row'):
        coclusters[cat.index[index]] = clusterdist = costars.groupby('colcluster')['n'].sum()
        selfcluster = costars.rowcluster.iloc[0]
        connectedness[cat.index[index]] = {
            'primaryName': cat.primaryName.iloc[index],
            # Which cluster do they belong to
            'cluster': selfcluster,
            # No of clusters they've acted at least 5 times with
            'nclusters': (clusterdist >= 5).sum(),
            # No of films they've acted in
            'titles': name_freq[cat.index[index]],
            # No of pairings they've had with other stars
            'pairings': clusterdist.sum(),
            # % of films within cluster
            'incluster': clusterdist.get(selfcluster, 0) / clusterdist.sum()
        }
    coclusters = pd.DataFrame(coclusters).T
    connectedness = pd.DataFrame(connectedness).T
    return connectedness.sort_values('incluster'), coclusters

In [22]:
connected_in, coclusters_in = connectedness(clusters_in, pairs_in, cat_in)

In [23]:
# Who are the big crossover actors in IN?
connected_in[connected_in['titles'] > 50].sort_values('incluster').head(20)

Unnamed: 0,primaryName,cluster,nclusters,titles,pairings,incluster
nm0474609,Atul Kulkarni,7,5,51,120,0.233333
nm0000514,Michael Madsen,4,2,104,24,0.333333
nm0896573,Ashish Vidyarthi,2,5,78,233,0.377682
nm0222144,Rahul Dev,1,5,51,138,0.413043
nm0474871,Sai Kumar,3,4,62,187,0.44385
nm0348004,Milind Gunaji,7,5,85,285,0.491228
nm0149822,Mithun Chakraborty,1,3,68,201,0.507463
nm1418952,Bhavana,6,3,57,172,0.540698
nm3132784,Kishore Kumar G.,2,4,71,189,0.544974
nm0793851,Sayaji Shinde,2,4,71,222,0.563063
