# Results section 2: connectivity
## Meso-level connectivity

### Preparations

In [1]:
%run fix_notebook_imports.py

In [2]:
import os
import textwrap

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

from analysis.statics import COUNTRIES, YEARS

In [3]:
# Notebook-wide plot styling: seaborn's "whitegrid" theme and a default font size of 30.
sns.set_style('whitegrid')
plt.rcParams.update({'font.size': 30})

In [4]:
def get_growth_fractions(largest_family_volumes, labels, start_year=1998, end_year=2019):
    """Break each family's growth between two years down by document type.

    Parameters
    ----------
    largest_family_volumes : pandas.DataFrame
        Volumes indexed by ``(family, year)``. Must provide ``statute`` and
        ``regulation`` columns; the total growth is computed as the sum over
        *all* columns of the frame.
    labels : pandas.DataFrame
        Indexed by family and providing a ``label`` column. Its index
        determines which families are reported, and in which order.
    start_year, end_year : optional
        Index values of the first and last year to compare. They default to
        1998 and 2019, the values that used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        One row per family (indexed by family) with the columns ``label``,
        ``total``, ``statute``, ``statute_fraction``, ``regulation``,
        ``regulation_fraction``, ``category`` and ``majority``.

        * ``category`` is ``'S'`` if the (rounded) statute fraction is at
          least 0.8, ``'R'`` if the (rounded) regulation fraction is at least
          0.8, and ``'M'`` otherwise.
        * ``majority`` is ``'S'`` or ``'R'`` depending on which rounded
          fraction is larger, and ``'M'`` if they are equal.

    Notes
    -----
    A family whose total growth is zero is not special-cased; the fractions
    are then the result of a division by zero.
    """
    growth_fractions = pd.DataFrame(
        columns=['label', 'total', 'statute', 'statute_fraction',
                 'regulation', 'regulation_fraction', 'category', 'majority']
    )
    for family in labels.index:
        # Look the family up once instead of once per quantity.
        family_volumes = largest_family_volumes.loc[family]
        first = family_volumes.loc[start_year]
        last = family_volumes.loc[end_year]

        total_growth = last.sum() - first.sum()
        sta_growth = last.statute - first.statute
        reg_growth = last.regulation - first.regulation

        # The classification below works on the fractions rounded to two decimals.
        sta_growth_rel = round(sta_growth / total_growth, 2)
        reg_growth_rel = round(reg_growth / total_growth, 2)

        if sta_growth_rel >= 0.8:
            category = 'S'
        elif reg_growth_rel >= 0.8:
            category = 'R'
        else:
            category = 'M'

        if sta_growth_rel > reg_growth_rel:
            majority = 'S'
        elif reg_growth_rel > sta_growth_rel:
            majority = 'R'
        else:
            majority = 'M'

        growth_fractions.loc[family] = [
            labels.at[family, 'label'], total_growth,
            sta_growth, sta_growth_rel,
            reg_growth, reg_growth_rel,
            category,
            majority,
        ]
    return growth_fractions

def get_average_composition(largest_family_volumes, labels):
    """Describe, per family, the distribution of the statute share over the years.

    For every family in ``labels.index`` the share
    ``statute / (regulation + statute)`` is computed for each year and
    summarised with ``Series.describe()`` (without the ``count`` entry); all
    statistics are rounded to two decimals.

    Returns a DataFrame with one row per family (in the order of
    ``labels.index``) and the columns ``label``, the summary statistics,
    ``category`` and ``majority``:

    * ``category``: ``'S'`` if the mean share is >= 0.8, ``'R'`` if it is
      <= 0.2, ``'M'`` otherwise.
    * ``majority``: ``'S'`` if the mean share is > 0.5, ``'R'`` if it is
      < 0.5, ``'M'`` otherwise.
    """
    def statute_share_summary(family):
        volumes = largest_family_volumes.loc[family]
        share = volumes.statute / (volumes.regulation + volumes.statute)
        # Drop the leading "count" entry of the summary.
        return share.describe()[1:]

    summary = pd.DataFrame([statute_share_summary(family) for family in labels.index])
    summary = summary.round(2)

    categories = []
    majorities = []
    for mean_share in summary['mean']:
        if mean_share >= 0.8:
            categories.append('S')
        elif mean_share <= 0.2:
            categories.append('R')
        else:
            categories.append('M')

        if mean_share > 0.5:
            majorities.append('S')
        elif mean_share < 0.5:
            majorities.append('R')
        else:
            majorities.append('M')

    summary['category'] = categories
    summary['majority'] = majorities
    # The label goes first, followed by the statistics and the two classifications.
    summary.insert(0, 'label', labels.label.values)
    return summary

def plot_family_composition_absolute(largest_families, largest_family_volumes, labels, save_path=None, country=None):
    """Plot, per family, stacked horizontal bars of statute/regulation volumes by year.

    The families are laid out on a fixed 2 x 5 grid of subplots with shared
    axes, so at most ten families can be drawn.

    Parameters
    ----------
    largest_families : iterable
        Family identifiers to plot, one subplot each, filled row by row.
    largest_family_volumes : pandas.DataFrame
        Volumes indexed by ``(family, year)`` with ``statute`` and
        ``regulation`` columns.
    labels : pandas.DataFrame
        Indexed by family with a ``label`` column used for the subplot
        titles. Families without a (non-missing) label are titled with their
        integer identifier instead.
    save_path : str, optional
        If given, the figure is saved there (transparent background) and
        closed afterwards; otherwise the figure is left open.
    country : str, optional
        Selects the x-axis scale: ``'de'`` uses ticks up to 1e6, any other
        value uses ticks up to 1.5e7. If omitted, the module-level variable
        ``country`` is used, which is how this function originally obtained
        the value.

    Raises
    ------
    NameError
        If ``country`` is neither passed nor defined at module level.
    """
    if country is None:
        # Backward compatibility: existing callers do not pass ``country``
        # and rely on the global loop variable of the same name.
        try:
            country = globals()["country"]
        except KeyError:
            raise NameError("name 'country' is not defined") from None

    # The x-axis configuration depends only on the country, so pick it once.
    if country == 'de':
        xticks = [0, 0.5e6, 1.0e6]
        xticklabels = [0, 0.5, str(1.0) + " e6"]
        xlim = (0, 1.1e6)
    else:
        xticks = [0, 0.5e7, 1.0e7, 1.5e7]
        xticklabels = [0, 0.5, 1.0, str(1.5) + " e7"]
        xlim = (0, 1.65e7)

    # Every third year is labelled; the two entries in between are empty strings.
    yticks = [tick for year in YEARS[::3] for tick in (str(year), "", "")]

    fig, ax = plt.subplots(2, 5, figsize=(5*5, 2*5.5), sharey=True, sharex=True)
    for idx, comm in enumerate(largest_families):
        x, y = divmod(idx, 5)
        panel = ax[x, y]

        single_community_df = largest_family_volumes.loc[comm].sort_index(ascending=False)
        # Fix that order of bars in plot is identical to order in DataFrame
        single_community_df.index = map(str, single_community_df.index)

        if comm in labels.index and not pd.isna(labels.at[comm, "label"]):
            title = textwrap.fill(labels.at[comm, "label"], 20)
        else:
            title = str(int(comm))

        single_community_df[["statute", "regulation"]].plot.barh(
            ax=panel, stacked=True, color=['k', 'b'], use_index=False,
            title=title,
            legend=False, width=1, linewidth=0.1, alpha=0.5)

        panel.yaxis.set_ticks(yticks)
        panel.yaxis.set_ticklabels(yticks)
        panel.title.set_size(30)
        panel.tick_params(axis='both', which='major', labelsize=30)
        panel.xaxis.set_ticks(xticks)
        panel.xaxis.set_ticklabels(xticklabels)
        panel.set_xlim(*xlim)
        panel.set_ylabel("Year", fontsize=30)
        panel.set_xlabel("Number of tokens", fontsize=30)
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, transparent=True)
        plt.close()
        
def plot_family_composition_relative(largest_families, largest_family_volumes, labels, save_path=None):
    """Plot, per family, stacked horizontal bars of the yearly statute/regulation shares.

    Each year's volumes are divided by that year's row sum, so the bars show
    fractions on an x-axis fixed to [0, 1]. The families are placed row by
    row on a fixed 2 x 5 grid of subplots with shared axes. Subplots are
    titled with the family's wrapped label from ``labels`` or, if there is no
    (non-missing) label, with ``str(family)``.

    If ``save_path`` is given, the figure is written there with a transparent
    background and closed; otherwise it is left open.
    """
    _fig, axes = plt.subplots(2, 5, figsize=(5*5, 2*5.5), sharey=True, sharex=True)
    for position, family in enumerate(largest_families):
        row, col = divmod(position, 5)
        panel = axes[row, col]

        volumes = largest_family_volumes.loc[family].sort_index(ascending=False)
        # String index labels keep the bar order identical to the DataFrame order.
        volumes.index = [str(entry) for entry in volumes.index]
        shares = volumes.div(volumes.sum(axis=1), axis=0)

        has_label = family in labels.index and not pd.isna(labels.at[family, "label"])
        if has_label:
            panel_title = textwrap.fill(labels.at[family, "label"], 20)
        else:
            panel_title = str(family)

        shares[["statute", "regulation"]].plot.barh(
            ax=panel,
            stacked=True,
            color=['k', 'b'],
            xlim=(0, 1),
            xticks=np.arange(0, 1.2, 0.2),
            use_index=False,
            title=panel_title,
            legend=False,
            width=1,
            linewidth=0.1,
            alpha=0.5,
        )

        # Every third year is labelled; the two entries in between are empty strings.
        year_ticks = [tick for year in YEARS[::3] for tick in (str(year), "", "")]
        panel.yaxis.set_ticks(year_ticks)
        panel.yaxis.set_ticklabels(year_ticks)
        panel.title.set_size(30)
        panel.tick_params(axis='both', which='major', labelsize=30)
        panel.set_ylabel("Year", fontsize=30)
        panel.set_xlabel("Fraction of tokens", fontsize=30)

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, transparent=True)
        plt.close()

### Figures

In [5]:
# For every country: load the family labels and volumes, compute the growth and
# composition tables, and save the absolute and relative composition figures.
#
# The commented-out lines below are disabled on purpose so that we don't overwrite
# tables that already received their final styling (rules instead of hlines).
#
# NOTE: the loop variable must stay named `country`:
# plot_family_composition_absolute looks this name up in the global namespace.
for country in COUNTRIES:
    for n_clusters in [100]:
        # Hand-filled family labels, indexed by family id.
        labels = pd.read_csv(f"../supplements/cluster-family-labels-new-n{n_clusters}-{country}-filled.csv").set_index('family')
        cluster_family_volumes = pd.read_csv(f"../results/cluster-family-volumes-{country}-n{n_clusters}.csv")
        # Only the labelled families are analysed.
        largest_families = list(labels.index)
        # Keep the labelled families, drop the "total" column and sum the remaining columns per (family, year).
        largest_family_volumes = cluster_family_volumes.query("family in @largest_families").drop("total",axis=1).groupby(["family","year"]).sum()
        growth_fractions = get_growth_fractions(largest_family_volumes, labels)
#         with open(f"../graphics/cluster-family-volumes-growth-{country}-n{n_clusters}.tex", "w") as f:
#             f.write(tabulate(growth_fractions, tablefmt='latex_raw', headers=['Family',r'$\Delta$',
#                                                           r'$\Delta_S$',r'$\Delta_S/\Delta$',
#                                                           r'$\Delta_R$',r'$\Delta_R/\Delta$','Cat.','Maj.'], showindex=False))
        #growth_fractions.to_csv(f"../results/cluster-family-volumes-growth-{country}-n{n_clusters}.csv", index=False)
        average_compositions = get_average_composition(largest_family_volumes, labels)
#         with open(f"../graphics/cluster-family-volumes-composition-{country}-n{n_clusters}.tex", "w") as f:
#             f.write(tabulate(average_compositions, tablefmt='latex_raw', headers=['Family', r'$\mu$', r'$\sigma$', r'$\min$', r'$25~\%$', r'$50~\%$', r'$75~\%$', r'$\max$',
#        'Cat.', 'Maj.'], showindex=False))
        #average_compositions.to_csv(f"../results/cluster-family-volumes-composition-{country}-n{n_clusters}.csv", index=False)
        # Both figures are written as PDFs into ../graphics (and closed by the plotting functions).
        plot_family_composition_absolute(largest_families, largest_family_volumes, labels, save_path=f"../graphics/family-composition-absolute-n{n_clusters}-{country}.pdf")
        plot_family_composition_relative(largest_families, largest_family_volumes, labels, save_path=f"../graphics/family-composition-relative-n{n_clusters}-{country}.pdf")

In [6]:
# For every country and year: joint histogram of the statute share against the
# min-max normalized size of the 100 largest families, saved as one PDF each.
path = '../graphics/family-compositions'
# exist_ok=True replaces the separate existence check (no check-then-create race).
os.makedirs(path, exist_ok=True)
for country in COUNTRIES:
    cluster_family_volumes = pd.read_csv(f"../results/cluster-family-volumes-{country}-n100.csv"
                                        ).sort_values('total', ascending=False)
    for year in YEARS:
        # The 100 rows with the largest total in this year; missing values count as 0.
        year_vols = pd.DataFrame(cluster_family_volumes.query("year == @year").sort_values('total', ascending=False)[:100].fillna(0))
        # Despite its name this is a fraction (statute / total), not a value in percent.
        year_vols['statute_percentage'] = year_vols.statute / year_vols.total
        # Min-max normalization of the totals within the selected rows.
        year_vols['normalized_total'] = (year_vols.total - year_vols.total.min()
                                                    ) / (year_vols.total.max() - year_vols.total.min())
        g = sns.jointplot(y='normalized_total', x='statute_percentage', data=year_vols, kind="hist", color='k')

        g.set_axis_labels('Percentage of statute tokens', 'Min-max normalized number of tokens', fontsize=17)
        g.ax_joint.set_xticks(np.arange(0,1.01,0.5))
        plt.tight_layout()
        plt.savefig(f"{path}/family-hist2d-{country}-{year}.pdf")
        # Close the figure so that figures do not pile up across the loop.
        plt.close()

### The end.