# Hungarian mapping
If we have partitions over time, how do we map the clusters in consecutive time frames?

In [1]:
import pandas as pd
from nwtools import common, communities

## Prepare example dataset

In [52]:
unmatched_labels = pd.read_csv('unmatched_labels_per_year.csv', index_col='Id')

In [53]:
years = sorted(unmatched_labels.keys())

In [3]:
cluster_sets_per_year = {}
for year in unmatched_labels.columns:
    cluster_sets_per_year[year] = {}
    for cl in unmatched_labels[year].dropna().unique():
        cluster_sets_per_year[year][cl] = list(unmatched_labels[year][unmatched_labels[year]==cl].index)

In [58]:
cluster_list = []
for year in years:
    s = unmatched_labels[year].dropna().reset_index()
    cluster_list.append(s.groupby(year)['Id'].apply(set).to_dict())

In [None]:
## Algorithm

In [4]:
import itertools
import numpy as np

def jaccard(s1, s2):
    s1 = set(s1)
    s2 = set(s2)
    intersect = len(s1.intersection(s2))
    union = len(s1.union(s2))
    if union==0:
        return np.NaN
    return intersect/union

In [5]:
def jaccard_crosstab(part1, part2, keys1, keys2):
    sim = np.zeros((len(part1), len(part2)))
    for (i, c1), (j, c2) in itertools.product(enumerate(keys1), enumerate(keys2)):
        sim[i, j] = jaccard(part1[c1], part2[c2])
    return sim

In [6]:
from scipy.optimize import linear_sum_assignment
def get_mapping(ctab, row_labels, col_labels):
    if ctab.shape[0] > ctab.shape[1]:
        raise Exception('Rows should be fewer than columns')
        
    # Use the Hungarian algorithm
    row_ind, col_ind = linear_sum_assignment(- ctab)
    mapXtoY = {row_labels[r]: col_labels[c] for r, c in zip(row_ind, col_ind)}
    return(mapXtoY)

In [7]:
import string
def map_labels_over_time(labels_list, jaccard=True, min_overlap=0.1, character_labels=True):
    if character_labels:
        new_labels = list(string.ascii_lowercase + string.ascii_uppercase)
    else:
        max_nr_of_labels = np.sum([len(s) for s in labels_list])
        new_labels = list(range(max_nr_of_labels))
    max_new_label = 0
    
    mappings = []

    part1 = None
    for t in range(len(labels_list)):
        part2 = labels_list[t]
        if part1 is None:
            m = {l: new_labels[i] for (i,l) in enumerate(part2.keys())}
            max_new_label = len(part2)
        else:
            col_labels = sorted(part1.keys()) 
            row_labels = sorted(part2.keys()) 

            ctab = jaccard_crosstab(part2, part1, row_labels, col_labels)

            # Add dummy columns
            ctab2 = np.hstack((ctab, np.ones((len(part2), len(part2)))*min_overlap))
            col_labels = col_labels + new_labels[max_new_label:max_new_label+len(part2)]
            m = get_mapping(ctab2, row_labels, col_labels)
            max_new_label = new_labels.index(max(m.values()))+1
        mappings.append(m)
        part1 = {m[c]: part2[c] for c in part2}
    return mappings

In [15]:
years = sorted(cluster_sets_per_year.keys())
labels_list = [cluster_sets_per_year[y] for y in years]
mappings_list = communities.map_labels_over_time(labels_list, jaccard=True, min_overlap=0.1, character_labels=True)

In [59]:
mappings_list = communities.map_labels_over_time(cluster_list, jaccard=True, min_overlap=0.1, character_labels=True)
mappings_list

[{3.0: 'a', 4.0: 'b', 5.0: 'c', 6.0: 'd', 7.0: 'e', 8.0: 'f'},
 {1.0: 'b', 2.0: 'e', 3.0: 'g', 4.0: 'a', 5.0: 'f'},
 {0.0: 'b', 1.0: 'h', 2.0: 'a', 3.0: 'e', 4.0: 'g', 5.0: 'f'},
 {0.0: 'b', 1.0: 'h', 2.0: 'a', 3.0: 'e', 4.0: 'f'},
 {1.0: 'h', 2.0: 'b', 3.0: 'i', 4.0: 'a', 5.0: 'j', 6.0: 'e', 7.0: 'f'},
 {1.0: 'a', 2.0: 'k', 3.0: 'i', 4.0: 'j', 5.0: 'e', 6.0: 'b', 7.0: 'f'},
 {1.0: 'a', 2.0: 'k', 3.0: 'i', 4.0: 'e', 5.0: 'b', 6.0: 'j', 7.0: 'f'},
 {42.0: 'l',
  80.0: 'b',
  83.0: 'i',
  88.0: 'm',
  89.0: 'e',
  91.0: 'j',
  92.0: 'f',
  95.0: 'k'},
 {72.0: 'b', 73.0: 'i', 79.0: 'e', 80.0: 'j', 81.0: 'f'},
 {2.0: 'i', 3.0: 'e', 4.0: 'f', 5.0: 'j', 6.0: 'k'},
 {0.0: 'e', 1.0: 'i', 2.0: 'l', 3.0: 'f', 4.0: 'j', 5.0: 'k'},
 {1.0: 'e', 2.0: 'i', 3.0: 'l', 4.0: 'f', 5.0: 'j', 6.0: 'k'},
 {1.0: 'e', 2.0: 'i', 3.0: 'l', 4.0: 'j', 5.0: 'f', 6.0: 'k'},
 {1.0: 'm', 2.0: 'e', 3.0: 'f', 4.0: 'l', 6.0: 'i', 7.0: 'j', 8.0: 'k'},
 {1.0: 'e', 2.0: 'i', 3.0: 'l', 4.0: 'm', 5.0: 'j', 6.0: 'f', 7.0: 'k'}

In [17]:
from IPython.display import display
import string

min_overlap = 0.1
new_labels = list(string.ascii_lowercase + string.ascii_uppercase)
max_new_label = 0

years = sorted(cluster_sets_per_year.keys())
mappings = {}

part1 = None
for year in years:
    print(year)
    part2 = cluster_sets_per_year[year]
    if part1 is None:
        m = {l: new_labels[i] for (i,l) in enumerate(part2.keys())}
        max_new_label = len(part2)
    else:
        col_labels = sorted(part1.keys()) 
        row_labels = sorted(part2.keys()) 
        
        ctab = jaccard_crosstab(part2, part1, row_labels, col_labels)
        
        # Add dummy columns
        ctab2 = np.hstack((ctab, np.ones((len(part2), len(part2)))*min_overlap))
        col_labels = col_labels + new_labels[max_new_label:max_new_label+len(part2)]
        display(pd.DataFrame(ctab2, index=row_labels, columns=col_labels).style.format("{:.2}").background_gradient(low=0, high=1, axis=1))
        m = get_mapping(ctab2, row_labels, col_labels)
        max_new_label = new_labels.index(max(m.values()))+1
    mappings[year] = m
    print(m)
    part1 = {m[c]: part2[c] for c in part2}

1995
{3.0: 'a', 4.0: 'b', 5.0: 'c', 6.0: 'd', 7.0: 'e', 8.0: 'f'}
1996


Unnamed: 0,a,b,c,d,e,f,g,h,i,j,k
1.0,0.074,0.36,0.062,0.0,0.03,0.026,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.0,0.05,0.0,0.81,0.0,0.1,0.1,0.1,0.1,0.1
3.0,0.12,0.23,0.0,0.0,0.051,0.21,0.1,0.1,0.1,0.1,0.1
4.0,0.28,0.031,0.0,0.12,0.0,0.024,0.1,0.1,0.1,0.1,0.1
5.0,0.031,0.0,0.05,0.0,0.0,0.54,0.1,0.1,0.1,0.1,0.1


{1.0: 'b', 2.0: 'e', 3.0: 'g', 4.0: 'a', 5.0: 'f'}
1997


Unnamed: 0,a,b,e,f,g,h,i,j,k,l,m
0.0,0.0,0.74,0.0,0.0,0.14,0.1,0.1,0.1,0.1,0.1,0.1
1.0,0.05,0.0,0.048,0.048,0.0,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.6,0.0,0.031,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.029,0.0,0.8,0.0,0.026,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.0,0.0,0.032,0.59,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.029,0.0,0.0,0.76,0.053,0.1,0.1,0.1,0.1,0.1,0.1


{0.0: 'b', 1.0: 'h', 2.0: 'a', 3.0: 'e', 4.0: 'g', 5.0: 'f'}
1998


Unnamed: 0,a,b,e,f,g,h,i,j,k,l,m
0.0,0.029,0.95,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1
1.0,0.0,0.0,0.0,0.0,0.059,0.6,0.1,0.1,0.1,0.1,0.1
2.0,0.58,0.0,0.031,0.0,0.0,0.056,0.1,0.1,0.1,0.1,0.1
3.0,0.029,0.0,0.7,0.0,0.13,0.0,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.0,0.022,0.66,0.26,0.0,0.1,0.1,0.1,0.1,0.1


{0.0: 'b', 1.0: 'h', 2.0: 'a', 3.0: 'e', 4.0: 'f'}
1999


Unnamed: 0,a,b,e,f,h,i,j,k,l,m,n,o
1.0,0.059,0.0,0.0,0.0,0.4,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.95,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.087,0.0,0.033,0.22,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.35,0.0,0.036,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.043,0.0,0.11,0.12,0.083,0.1,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.029,0.026,0.64,0.043,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
7.0,0.0,0.0,0.0,0.53,0.05,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{1.0: 'h', 2.0: 'b', 3.0: 'i', 4.0: 'a', 5.0: 'j', 6.0: 'e', 7.0: 'f'}
2000


Unnamed: 0,a,b,e,f,h,i,j,k,l,m,n,o,p,q
1.0,0.44,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.036,0.0,0.0,0.056,0.0,0.48,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.0,0.0,0.86,0.0,0.091,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.0,0.66,0.021,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
7.0,0.0,0.0,0.0,0.88,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{1.0: 'a', 2.0: 'k', 3.0: 'i', 4.0: 'j', 5.0: 'e', 6.0: 'b', 7.0: 'f'}
2001


Unnamed: 0,a,b,e,f,i,j,k,l,m,n,o,p,q,r
1.0,0.71,0.0,0.0,0.048,0.0,0.067,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.44,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.0,0.0,0.0,0.0,0.57,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.0,0.95,0.0,0.0,0.0,0.036,0.1,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.0,0.0,0.0,0.0,0.0,0.7,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
7.0,0.0,0.0,0.0,0.82,0.028,0.042,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{1.0: 'a', 2.0: 'k', 3.0: 'i', 4.0: 'e', 5.0: 'b', 6.0: 'j', 7.0: 'f'}
2002


Unnamed: 0,a,b,e,f,i,j,k,l,m,n,o,p,q,r,s
42.0,0.0,0.033,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
80.0,0.0,0.57,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
83.0,0.0,0.021,0.025,0.0,0.63,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
88.0,0.0,0.069,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
89.0,0.0,0.1,0.38,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
91.0,0.0,0.024,0.029,0.0,0.0,0.62,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
92.0,0.042,0.022,0.0,0.62,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
95.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{80.0: 'b', 83.0: 'i', 88.0: 'm', 89.0: 'e', 42.0: 'l', 91.0: 'j', 92.0: 'f', 95.0: 'k'}
2003


Unnamed: 0,b,e,f,i,j,k,l,m,n,o,p,q,r
72.0,0.89,0.0,0.0,0.0,0.0,0.0,0.053,0.0,0.1,0.1,0.1,0.1,0.1
73.0,0.0,0.0,0.0,0.9,0.032,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1
79.0,0.0,0.87,0.0,0.031,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1
80.0,0.0,0.0,0.0,0.0,0.77,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1
81.0,0.017,0.0,0.4,0.0,0.019,0.0,0.024,0.0,0.1,0.1,0.1,0.1,0.1


{72.0: 'b', 73.0: 'i', 81.0: 'f', 80.0: 'j', 79.0: 'e'}
2004


Unnamed: 0,b,e,f,i,j,k,l,m,n,o
2.0,0.0,0.0,0.0,0.85,0.037,0.1,0.1,0.1,0.1,0.1
3.0,0.45,0.46,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.0,0.53,0.0,0.0,0.1,0.1,0.1,0.1,0.1
5.0,0.0,0.019,0.14,0.035,0.22,0.1,0.1,0.1,0.1,0.1
6.0,0.048,0.0,0.068,0.0,0.0,0.1,0.1,0.1,0.1,0.1


{2.0: 'i', 3.0: 'e', 4.0: 'f', 5.0: 'j', 6.0: 'k'}
2005


Unnamed: 0,e,f,i,j,k,l,m,n,o,p,q
0.0,0.62,0.0,0.0,0.017,0.0,0.1,0.1,0.1,0.1,0.1,0.1
1.0,0.0,0.048,0.47,0.099,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.32,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.0,0.73,0.0,0.032,0.0,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.018,0.0,0.52,0.053,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.0,0.023,0.0,0.062,0.7,0.1,0.1,0.1,0.1,0.1,0.1


{0.0: 'e', 1.0: 'i', 2.0: 'l', 3.0: 'f', 4.0: 'j', 5.0: 'k'}
2006


Unnamed: 0,e,f,i,j,k,l,m,n,o,p,q,r
1.0,0.86,0.0,0.017,0.0,0.012,0.0,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.016,0.79,0.015,0.031,0.0,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.7,0.0,0.02,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.017,0.032,0.026,0.63,0.085,0.0,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.0,0.013,0.011,0.012,0.77,0.0,0.1,0.1,0.1,0.1,0.1,0.1


{1.0: 'e', 2.0: 'i', 3.0: 'l', 4.0: 'f', 5.0: 'j', 6.0: 'k'}
2007


Unnamed: 0,e,f,i,j,k,l,m,n,o,p,q,r
1.0,0.75,0.0,0.031,0.062,0.013,0.0,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.0,0.83,0.0,0.023,0.0,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.018,0.028,0.63,0.035,0.0,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.0,0.87,0.017,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.0,0.013,0.0,0.08,0.77,0.0,0.1,0.1,0.1,0.1,0.1,0.1


{1.0: 'e', 2.0: 'i', 3.0: 'l', 4.0: 'j', 5.0: 'f', 6.0: 'k'}
2008


Unnamed: 0,e,f,i,j,k,l,m,n,o,p,q,r,s
1.0,0.0,0.036,0.0,0.049,0.086,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.96,0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.0,0.57,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.0,0.0,0.0,0.0,0.9,0.1,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.015,0.069,0.79,0.0,0.011,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
7.0,0.0,0.0,0.029,0.64,0.06,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
8.0,0.0,0.057,0.0,0.06,0.66,0.016,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{1.0: 'm', 2.0: 'e', 3.0: 'f', 4.0: 'l', 6.0: 'i', 7.0: 'j', 8.0: 'k'}
2009


Unnamed: 0,e,f,i,j,k,l,m,n,o,p,q,r,s,t
1.0,0.86,0.0,0.015,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.0,0.74,0.0,0.012,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.027,0.0,0.0,0.0,0.016,0.82,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.0,0.0,0.023,0.033,0.0,0.38,0.1,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.016,0.02,0.013,0.67,0.046,0.0,0.045,0.1,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.0,0.69,0.057,0.0,0.015,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
7.0,0.013,0.0,0.044,0.048,0.7,0.0,0.016,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{1.0: 'e', 2.0: 'i', 3.0: 'l', 4.0: 'm', 5.0: 'j', 6.0: 'f', 7.0: 'k'}
2010


Unnamed: 0,e,f,i,j,k,l,m,n,o,p,q,r,s,t
0.0,0.88,0.0,0.0,0.0,0.0,0.029,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1.0,0.0,0.0,0.91,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.0,0.0,0.02,0.0,0.77,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.0,0.0,0.025,0.0,0.0,0.0,0.73,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.023,0.67,0.0,0.036,0.042,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.0,0.0,0.014,0.75,0.045,0.0,0.021,0.1,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.013,0.015,0.012,0.022,0.78,0.0,0.016,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{0.0: 'e', 1.0: 'i', 2.0: 'l', 3.0: 'm', 4.0: 'f', 5.0: 'j', 6.0: 'k'}
2011


Unnamed: 0,e,f,i,j,k,l,m,n,o,p,q,r,s,t
0.0,0.0,0.0,0.0,0.021,0.0,0.0,0.64,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1.0,0.92,0.022,0.0,0.0,0.013,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.0,0.0,0.0,0.0,0.92,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.0,0.0,0.82,0.014,0.024,0.0,0.026,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.0,0.86,0.0,0.0,0.028,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.0,0.0,0.014,0.78,0.056,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.0,0.0,0.013,0.024,0.75,0.017,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{0.0: 'm', 1.0: 'e', 2.0: 'l', 3.0: 'i', 4.0: 'f', 5.0: 'j', 6.0: 'k'}
2012


Unnamed: 0,e,f,i,j,k,l,m,n,o,p,q,r,s,t
1.0,0.89,0.0,0.0,0.0,0.0,0.0,0.03,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2.0,0.0,0.68,0.022,0.0,0.0,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3.0,0.0,0.0,0.0,0.02,0.037,0.0,0.5,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4.0,0.027,0.0,0.0,0.0,0.0,0.92,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1
5.0,0.0,0.094,0.7,0.013,0.024,0.0,0.022,0.1,0.1,0.1,0.1,0.1,0.1,0.1
6.0,0.0,0.016,0.014,0.73,0.047,0.0,0.02,0.1,0.1,0.1,0.1,0.1,0.1,0.1
7.0,0.0,0.0,0.014,0.049,0.75,0.0,0.0,0.1,0.1,0.1,0.1,0.1,0.1,0.1


{1.0: 'e', 2.0: 'f', 3.0: 'm', 4.0: 'l', 5.0: 'i', 6.0: 'j', 7.0: 'k'}


## Inspect results

In [11]:
for year in sorted(years):
    m = mappings[year]
    unmatched_labels['mapped_{}'.format(year)] = [m.get(c, None) for c in unmatched_labels[year]]

In [12]:
mapped_columns = ['mapped_{}'.format(year) for year in years]
community_labels = set(unmatched_labels[mapped_columns].values.flatten())
community_labels.remove(None)
community_labels = sorted(community_labels)
print(community_labels)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm']


In [13]:
unmatched_labels.loc['DE',]

1995            8
1996            5
1997            5
1998            4
1999            7
2000            7
2001            7
2002           92
2003           81
2004            4
2005            3
2006            4
2007            5
2008            3
2009            6
2010            4
2011            4
2012            2
mapped_1995     f
mapped_1996     f
mapped_1997     f
mapped_1998     f
mapped_1999     f
mapped_2000     f
mapped_2001     f
mapped_2002     f
mapped_2003     f
mapped_2004     f
mapped_2005     f
mapped_2006     f
mapped_2007     f
mapped_2008     f
mapped_2009     f
mapped_2010     f
mapped_2011     f
mapped_2012     f
Name: DE, dtype: object

In [14]:
community_labels

for comm in sorted(community_labels):
    print('Community {}'.format(comm))
    for i, year in enumerate(years):
        df_sub = unmatched_labels[unmatched_labels[mapped_columns[i]]==comm]
        if len(df_sub) > 0:
            print(year, list(df_sub.index))

Community a
1995 ['AE', 'BE', 'CH', 'DK', 'EG', 'FR', 'IL', 'LU', 'MA', 'MT', 'NL', 'NO', 'SA', 'SE', 'ZW']
1996 ['AR', 'BE', 'BI', 'CM', 'CY', 'DK', 'DZ', 'ES', 'FR', 'IE', 'KE', 'LU', 'MA', 'NL', 'RW', 'SE', 'TN']
1997 ['AR', 'BE', 'CM', 'DK', 'ES', 'IE', 'KE', 'LB', 'LU', 'NL', 'PE', 'RW', 'SE', 'SN', 'TN']
1998 ['AR', 'BE', 'DK', 'ES', 'FR', 'IE', 'KE', 'LB', 'LU', 'MA', 'MU', 'NL', 'SE', 'TG', 'TN']
1999 ['BE', 'CI', 'DZ', 'FR', 'LB', 'LU', 'MA', 'TN']
2000 ['DZ', 'FR', 'LY', 'MA', 'TN']
2001 ['CH', 'DZ', 'FR', 'LY', 'MA', 'TN', 'UY']
Community b
1995 ['AU', 'BN', 'CL', 'FJ', 'HK', 'ID', 'IE', 'KR', 'MY', 'NZ', 'PG', 'PH', 'SG', 'TH', 'TW', 'ZA']
1996 ['AE', 'CN', 'HK', 'ID', 'IR', 'JP', 'KR', 'MY', 'PH', 'SA', 'SG', 'TH', 'TW', 'VN']
1997 ['AE', 'AU', 'BN', 'CN', 'FJ', 'HK', 'ID', 'IR', 'JP', 'KR', 'MY', 'NZ', 'PG', 'PH', 'SA', 'SG', 'TH', 'TW', 'VN']
1998 ['AE', 'AU', 'BN', 'CN', 'FJ', 'HK', 'ID', 'IR', 'JP', 'KR', 'MY', 'NZ', 'PE', 'PG', 'PH', 'SA', 'SG', 'TH', 'TW', 'VN']
1999