## TFIDF Cluster Family Inspection

In this notebook, we compute the Term Frequency-Inverse Document Frequency (TF-IDF) statistics
that we use to validate our cluster family names, as reported in the SI.

Executing this notebook requires access to the text data contained in the individual clusters,
which is not provided in the data accompanying the paper.
For the United States, the input data can be computed by running our preprocessing and clustering pipelines on the publicly available XML files
provided by the Office of the Law Revision Counsel.
For Germany, we cannot make the input data available due to licensing restrictions.

### Preparations

In [1]:
import gzip
import multiprocessing
import pickle

import networkx as nx
import pandas as pd
from gensim import corpora, models
from gensim.utils import simple_preprocess

from legal_data_clustering.utils.graph_api import cluster_families

### Computing the statistics

In [2]:
def load_cluster_families(base_path, threshold=.15, n_families=50):
    """Load the cluster evolution graph and extract its cluster families.

    Parameters
    ----------
    base_path : str
        Dataset directory, *including* the trailing path separator; the graph
        file name is appended to it by plain string concatenation.
    threshold : float, optional
        Forwarded unchanged to ``cluster_families`` (default ``.15``).
    n_families : int or None, optional
        Only the first ``n_families`` entries returned by ``cluster_families``
        are kept (default ``50``); ``None`` keeps all of them.

    Returns
    -------
    tuple
        ``(cluster_families_data, leading_clusters)`` where
        ``leading_clusters[i]`` is the first member of
        ``cluster_families_data[i]``.
    """
    graph_path = (
        base_path
        + '13_cluster_evolution_graph/all_0-0_1-0_-1_a-infomap_n100_m1-0_s0_c1000.gpickle.gz'
    )
    # ``nx.read_gpickle`` was removed in NetworkX 3.0. It was a thin wrapper
    # around ``pickle.load`` on a (gzip-)opened file, so we do that directly.
    # NOTE: unpickling can execute arbitrary code -- only load graph files that
    # were produced by a trusted pipeline run.
    with gzip.open(graph_path, 'rb') as f:
        G = pickle.load(f)
    cluster_families_data = cluster_families(G, threshold=threshold)[:n_families]
    leading_clusters = [c[0] for c in cluster_families_data]
    return cluster_families_data, leading_clusters

In [3]:
def read_cluster_texts(node, base_path, encoding='utf-8'):
    """Return the full text stored for one cluster node.

    Parameters
    ----------
    node : str
        Node identifier of the form ``'<year>_<cluster>'``; it must contain
        exactly one underscore, otherwise the unpacking raises ``ValueError``.
    base_path : str
        Dataset directory, including the trailing path separator.
    encoding : str, optional
        Text encoding of the cluster file. Defaults to UTF-8 so that reading
        does not depend on the platform's locale encoding.
        NOTE(review): assumes the clustering pipeline writes UTF-8 -- confirm.

    Returns
    -------
    str
        The complete contents of the cluster's text file.
    """
    year, cluster = node.split('_')
    text_path = (
        f'{base_path}12_cluster_texts/'
        f'{year}_0-0_1-0_-1_a-infomap_n100_m1-0_s0_c1000/'
        f'community_{cluster}.txt'
    )
    with open(text_path, encoding=encoding) as f:
        return f.read()

In [4]:
def process_cluster_familie(clusters, base_path):
    """Tokenise the combined text of all clusters belonging to one family.

    The text of every cluster in ``clusters`` is read via
    ``read_cluster_texts``, the texts are joined with single spaces, and the
    resulting string is passed through gensim's ``simple_preprocess``.

    Parameters
    ----------
    clusters : iterable of str
        Node identifiers of the family's member clusters.
    base_path : str
        Dataset directory handed on to ``read_cluster_texts``.

    Returns
    -------
    The token sequence produced by ``simple_preprocess`` for the family.
    """
    member_texts = [read_cluster_texts(member, base_path) for member in clusters]
    family_text = ' '.join(member_texts)
    return simple_preprocess(family_text)

In [5]:
def compute_tfidf_csv(dataset, n_terms=250):
    """Rank each cluster family's vocabulary by TF-IDF and write the top terms to CSV.

    Every cluster family is treated as one document. The output file
    ``../results/tfidf_cluster_family_inspection_{dataset}.csv`` has one column
    per family (labelled with the family's leading cluster) and one row per
    rank position, best-scoring term first.

    Parameters
    ----------
    dataset : str
        Name of the dataset directory below ``../../legal-networks-data/``.
    n_terms : int or None, optional
        Maximum number of terms kept per family (default ``250``); ``None``
        keeps every weighted term.

    Notes
    -----
    Families with fewer than ``n_terms`` weighted terms yield shorter columns,
    which are padded with missing values. (Building the frame from plain lists
    of unequal length would make pandas raise ``ValueError``.)
    Prints ``done <i>`` after each family has been tokenised.
    The ``../results`` directory must already exist.
    """
    base_path = f'../../legal-networks-data/{dataset}/'
    cluster_families_data, leading_clusters = load_cluster_families(base_path)

    # One bag-of-words document per family; allow_update=True lets the shared
    # dictionary grow as previously unseen tokens are encountered.
    dictionary = corpora.Dictionary()
    BoW_corpus = []
    for i, c in enumerate(cluster_families_data):
        doc = process_cluster_familie(c, base_path)
        BoW_corpus.append(dictionary.doc2bow(doc, allow_update=True))
        print('done', i)

    # SMART 'ntc' per the gensim docs: raw term frequency, idf weighting,
    # cosine normalisation.
    tfidf = models.TfidfModel(BoW_corpus, smartirs='ntc')

    top_terms = {}
    for leading, weighted_doc in zip(leading_clusters, tfidf[BoW_corpus]):
        # Stable sort: terms with equal weight keep their original order.
        ranked = sorted(weighted_doc, key=lambda pair: pair[1], reverse=True)
        words = [dictionary[key] for key, _ in ranked[:n_terms]]
        # Series (not list) so that columns of different length are aligned
        # on rank and padded instead of raising.
        top_terms[leading] = pd.Series(words, dtype=object)

    df = pd.DataFrame(top_terms)
    df.to_csv(f'../results/tfidf_cluster_family_inspection_{dataset}.csv')

In [6]:
# Run the TF-IDF pipeline on the 'us_reg' dataset: prints one "done <i>" line per
# cluster family and writes ../results/tfidf_cluster_family_inspection_us_reg.csv.
compute_tfidf_csv('us_reg')

done 0
done 1
done 2
done 3
done 4
done 5
done 6
done 7
done 8
done 9
done 10
done 11
done 12
done 13
done 14
done 15
done 16
done 17
done 18
done 19
done 20
done 21
done 22
done 23
done 24
done 25
done 26
done 27
done 28
done 29
done 30
done 31
done 32
done 33
done 34
done 35
done 36
done 37
done 38
done 39
done 40
done 41
done 42
done 43
done 44
done 45
done 46
done 47
done 48
done 49


In [7]:
# Run the TF-IDF pipeline on the 'de_reg' dataset: prints one "done <i>" line per
# cluster family and writes ../results/tfidf_cluster_family_inspection_de_reg.csv.
compute_tfidf_csv('de_reg')

done 0
done 1
done 2
done 3
done 4
done 5
done 6
done 7
done 8
done 9
done 10
done 11
done 12
done 13
done 14
done 15
done 16
done 17
done 18
done 19
done 20
done 21
done 22
done 23
done 24
done 25
done 26
done 27
done 28
done 29
done 30
done 31
done 32
done 33
done 34
done 35
done 36
done 37
done 38
done 39
done 40
done 41
done 42
done 43
done 44
done 45
done 46
done 47
done 48
done 49
