In [1]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

import ihop
import ihop.clustering as ic
import ihop.utils
from ihop.community2vec import GensimCommunity2Vec

sns.set_palette("Dark2")

In [2]:
# Directory layout: community2vec models are read from C2V_ROOT and the
# clustering experiments are written under EXPERIMENTS_ROOT
DATA_ROOT = Path("../data")
C2V_ROOT = DATA_ROOT / "community2vec"
EXPERIMENTS_ROOT = DATA_ROOT / "cluster_alignment"

# Agglomerative clustering settings used for every month
n_clusters = 100
link_type = "ward"

# One entry per month of data, kept in chronological (lexicographic) order
MONTHS = sorted(
    [
        "2021-04", "2021-05", "2021-06", "2021-07", "2021-08", "2021-09",
        "2021-10", "2021-11", "2021-12", "2022-01", "2022-02", "2022-03",
    ]
)

# Community2vec models, index-aligned with MONTHS
C2V_MODELS = [
    GensimCommunity2Vec.load(C2V_ROOT / f"RC_{month}" / "best_model")
    for month in MONTHS
]

In [3]:
# Train (or reload) one agglomerative clustering model per month and collect
# the metrics of every model in a single dataframe.
hac_metrics_collection = []
for month, c2v_model in zip(MONTHS, C2V_MODELS):
    print("Training agglomerative clustering models for month:", month)
    vectors = c2v_model.w2v_model.wv
    data = c2v_model.get_normed_vectors()
    index = dict(enumerate(vectors.index_to_key))
    # Month path
    agg_month_path = EXPERIMENTS_ROOT / f"{month}"
    agg_month_path.mkdir(parents=True, exist_ok=True)

    current_agg_model_path = agg_month_path / f"{month}_agglomerative_Linkage{link_type}_numClusters{n_clusters}"
    metrics_path = current_agg_model_path / "metrics.json"
    name = f"{month} Agglomerative Clustering {link_type} linkage {n_clusters} clusters"
    if not current_agg_model_path.exists():
        # Ward linkage is paired with euclidean distances, every other linkage with cosine
        affinity = "euclidean" if link_type == "ward" else "cosine"
        agg_model = ic.ClusteringModelFactory.init_clustering_model(
            model_choice="agglomerative",
            data=data,
            index=index,
            model_name=name,
            n_clusters=n_clusters,
            linkage=link_type,
            affinity=affinity,
        )
        agg_model.train()
        agg_model.save(current_agg_model_path)
        metrics = agg_model.get_metrics()
        with metrics_path.open(mode="w") as metrics_json:
            json.dump(metrics, metrics_json, cls=ihop.utils.NumpyFloatEncoder)
    else:
        # The model directory is used as a cache: reuse the metrics stored next to it
        print("\tModel", name, " appears to already exist")
        with metrics_path.open(mode="r") as metrics_json:
            metrics = json.load(metrics_json)

    # Record the metrics for both freshly trained and reloaded models. A new dict is
    # built so the object returned by get_metrics() is not modified.
    hac_metrics_collection.append(
        {
            **metrics,
            "linkage": link_type,
            "path": str(current_agg_model_path),
            "month": month,
            "n_clusters": n_clusters,
        }
    )

hac_metrics_df = pd.DataFrame.from_records(hac_metrics_collection)
display(hac_metrics_df)

Training agglomerative clustering models for month: 2021-04
Training agglomerative clustering models for month: 2021-05
Training agglomerative clustering models for month: 2021-06
Training agglomerative clustering models for month: 2021-07
Training agglomerative clustering models for month: 2021-08
Training agglomerative clustering models for month: 2021-09
Training agglomerative clustering models for month: 2021-10
Training agglomerative clustering models for month: 2021-11
Training agglomerative clustering models for month: 2021-12
Training agglomerative clustering models for month: 2022-01
Training agglomerative clustering models for month: 2022-02
Training agglomerative clustering models for month: 2022-03


# Aligning clusters
The previous experiments showed that the changes in the clusterings over a variety of experiments are "on average relatively low" compared to the full range of the stability metric. But that doesn't tell us how the changes around a particular subreddit or cluster would appear to a user. 

We use [Maximum Matching](https://www.dcs.bbk.ac.uk/~ale/dsta/2020-21/dsta-8/zaki-meira-ch17-excerpt.pdf) to find the majority-matching clusters between two subreddit clusterings, because we're interested in pairwise matches between clusters across months.


In [4]:
def pair_clusters_across_clusterings(clustering_a, clustering_b, sort_key=0):
    """Pair the clusters of two clusterings using maximum matching.

    :param clustering_a: dict, subreddit -> cluster assignment
    :param clustering_b: dict, subreddit -> cluster assignment
    :param sort_key: 0 or 1, sort by decreasing order of cluster ids of first or second cluster
    :return: list of [cluster id from clustering_a, cluster id from clustering_b] lists,
        sorted by decreasing value of the element at position sort_key
    """
    assignment_a, assignment_b, all_subreddits = ic.remap_clusters(clustering_a, clustering_b, use_union=True)

    # If the cluster id is -1, then the subreddit was missing from the clustering, so its count
    # is zero
    counts_a = [0 if cluster_id == -1 else 1 for cluster_id in assignment_a]
    counts_b = [0 if cluster_id == -1 else 1 for cluster_id in assignment_b]

    # Leave off the -1 "cluster id", which is the indicator that the subreddit is missing from
    # this clustering. It is filtered by value rather than by position because -1 is only present
    # when a subreddit is actually missing; slicing off the first element would otherwise drop a
    # real cluster.
    row_mapping = [cluster_id for cluster_id in sorted(set(assignment_a)) if cluster_id != -1]
    col_mapping = [cluster_id for cluster_id in sorted(set(assignment_b)) if cluster_id != -1]

    contingency_table = ic.get_contingency_table(assignment_a, assignment_b, counts_a, counts_b, row_mapping, col_mapping)
    pairs = ic.get_maximum_matching_pairs(contingency_table, np.array(row_mapping), np.array(col_mapping))
    result = [[id_a, id_b] for id_a, id_b in zip(pairs[0], pairs[1])]

    # Sort by decreasing cluster ids of the clustering selected by sort_key
    return sorted(result, key=lambda pair: pair[sort_key], reverse=True)

def get_cluster_assignments_with_subreddits(cluster_model):
    """Map each cluster id to a single string listing the subreddits in that cluster.

    The subreddit names of a cluster are de-duplicated, sorted alphabetically and joined
    with single spaces. The cluster id column of the results dataframe is named after
    the model (cluster_model.model_name).

    :param cluster_model: object exposing model_name and get_cluster_results_as_df()
    :return: dict, cluster id -> space separated subreddit names
    """
    #TODO Allow for ordering based on popularity
    cluster_column = cluster_model.model_name
    results_df = cluster_model.get_cluster_results_as_df()

    def join_unique_sorted(names):
        return " ".join(sorted(set(names)))

    per_cluster_df = (
        results_df.groupby(cluster_column)["subreddit"]
        .apply(join_unique_sorted)
        .reset_index()
    )
    return {
        row[cluster_column]: row["subreddit"]
        for row in per_cluster_df.to_dict("records")
    }


# The following functions are only useful for comparing KMeans experiments, when you have to pick from several models for each month
def find_min_max_comparison_metric_experiment(comparison_df, current_experiment_number, month1, month2, n_clusters, metric="intersection_uniform_probability_variation_of_information"):
    """Find the month2 experiments with the lowest and highest metric value relative to one month1 experiment.

    Only rows matching n_clusters, month1, month2 and a month1_experiment_id equal to
    current_experiment_number are considered. When several rows share the extreme value,
    the first one is used.

    :param comparison_df: dataframe with columns n_clusters, month1, month2,
        month1_experiment_id, month2_experiment_id and the metric column
    :param metric: name of the column to minimize / maximize
    :return: ((min experiment id, min value), (max experiment id, max value))
    """
    selector = (
        (comparison_df["n_clusters"] == n_clusters)
        & (comparison_df["month1"] == month1)
        & (comparison_df["month2"] == month2)
        & (comparison_df["month1_experiment_id"] == current_experiment_number)
    )
    candidates_df = comparison_df[selector]
    scores = candidates_df[metric]

    def first_row_with(target_value):
        matching_df = candidates_df[scores == target_value]
        return matching_df["month2_experiment_id"].values[0], matching_df[metric].values[0]

    return first_row_with(scores.min()), first_row_with(scores.max())


def get_cluster_stability_experiment_path(root, month, n_clusters, exp_id):
    """Build the path <root>/<month>/<month>_numClusters<n_clusters>/<exp_id>."""
    month_dir = root / month
    experiment_group = f"{month}_numClusters{n_clusters}"
    return month_dir / experiment_group / f"{exp_id}"


def load_min_max_cluster_models(comparison_df, current_experiment_number, month1, month2, n_clusters, month2_c2v_model):
    """Load the two month2 clustering models at the extremes of the comparison metric.

    The extremes are taken relative to the given month1 experiment, using the default
    metric of find_min_max_comparison_metric_experiment. The chosen experiment ids and
    metric values are printed before loading.

    :return: (model with the minimum metric value, model with the maximum metric value)
    """
    (min_id, min_value), (max_id, max_value) = find_min_max_comparison_metric_experiment(
        comparison_df, current_experiment_number, month1, month2, n_clusters
    )
    print("Minimum model experiment id: ", min_id, ", Metric value:", min_value)
    print("Maximum model experiment id: ", max_id, ", Metric value:", max_value)

    def load_experiment(exp_id):
        # Experiments live under EXPERIMENTS_ROOT; vectors and index come from the month2 community2vec model
        model_path = get_cluster_stability_experiment_path(EXPERIMENTS_ROOT, month2, n_clusters, exp_id)
        return ic.ClusteringModel.load(
            model_path,
            month2_c2v_model.get_normed_vectors(),
            month2_c2v_model.get_index_as_dict(),
        )

    return load_experiment(min_id), load_experiment(max_id)

In [5]:
# Settings identifying which saved agglomerative model is loaded for each month
current_n_clusters = 100
linkage = "ward"

# The alignment chain starts at the first month in MONTHS
month1, month1_c2v_model = MONTHS[0], C2V_MODELS[0]
month1_model = ic.ClusteringModel.load(
    EXPERIMENTS_ROOT / month1 / f"{month1}_agglomerative_Linkage{linkage}_numClusters{current_n_clusters}",
    month1_c2v_model.get_normed_vectors(),
    month1_c2v_model.get_index_as_dict(),
)
month1_assignments = month1_model.get_cluster_assignments_as_dict()

# Per-month lookup of cluster id -> subreddit string, plus the matching model names,
# kept so the CSV output can be generated more easily
cluster_id_to_subreddit_strings = [get_cluster_assignments_with_subreddits(month1_model)]
model_names = [month1_model.model_name]

# Walk forward through the months, extending every alignment chain with the cluster it
# is paired with in the next month. A chain is a list with one cluster id per month seen
# so far, where -1 marks a month in which the chain has no cluster.
for i in range(1, len(MONTHS)):
    month2 = MONTHS[i]
    month2_c2v_model = C2V_MODELS[i]
    month2_model = ic.ClusteringModel.load(
        EXPERIMENTS_ROOT / month2 / f"{month2}_agglomerative_Linkage{linkage}_numClusters{current_n_clusters}",
        month2_c2v_model.get_normed_vectors(),
        month2_c2v_model.get_index_as_dict(),
    )
    month2_assignments = month2_model.get_cluster_assignments_as_dict()

    if i == 1:
        # First comparison: the pairs themselves are the initial chains, sorted by the
        # latest month's cluster id so they line up with the next iteration's pairings
        all_pairings = pair_clusters_across_clusterings(month1_assignments, month2_assignments, sort_key=1)

    else:
        # Sorted by decreasing cluster id of the previous month, the same order as all_pairings
        current_pairings = pair_clusters_across_clusterings(month1_assignments, month2_assignments)

        # Interleave the current pairing with all assignments
        current_pairing_idx = 0
        for j, pairing_list in enumerate(all_pairings):
            cluster_id = pairing_list[-1]

            # This list doesn't have any further alignments, just fill with -1 to indicate missing cluster
            if cluster_id < 0:
                all_pairings[j].append(-1)
                continue

            # This cluster should be aligned with the current month comparison somehow.
            # Only look at current_pairings here, so that running out of pairings while
            # ended chains remain does not index past the end of the list.
            if current_pairing_idx >= len(current_pairings):
                raise ValueError(f"No pairing left for month {month2} to chain cluster id {cluster_id} to")
            current_pairing_cluster_id = current_pairings[current_pairing_idx][0]
            if cluster_id != current_pairing_cluster_id:
                # Fail loudly: skipping the entry would leave a short, misaligned chain
                raise ValueError(f"Trying to chain cluster id {cluster_id} to {current_pairing_cluster_id}, values must match and do not")

            # Add the latest month's aligned cluster to the chain
            all_pairings[j].append(current_pairings[current_pairing_idx][-1])
            current_pairing_idx += 1

        # Append any "new" clusters in chain (clusters weren't aligned to previous models)
        while current_pairing_idx < len(current_pairings):
            all_pairings.append([-1] * (i - 1) + current_pairings[current_pairing_idx])
            current_pairing_idx += 1

        # Re-sort all pairings by the latest month's cluster id
        all_pairings = sorted(all_pairings, key=lambda l: l[-1], reverse=True)

    # Advance to the next month
    month1 = month2
    month1_c2v_model = month2_c2v_model
    month1_model = month2_model
    month1_assignments = month2_assignments
    cluster_id_to_subreddit_strings.append(get_cluster_assignments_with_subreddits(month2_model))
    model_names.append(month2_model.model_name)


# Build up the human readable representation of these cluster chains:
# two columns per model (cluster id, subreddits in that cluster) plus the chain length
header_row = [
    column
    for model_name in model_names
    for column in (model_name, model_name + " subreddits")
]
header_row.append("Alignment chain length")

# Grab subreddit representations for each entry
output_rows = []
for i, cluster_chain in enumerate(all_pairings):
    # Every chain must hold exactly one cluster id (or -1) per model
    if len(cluster_chain) != len(model_names):
        raise ValueError(f"Model alginment chain at row {i} doesn't match number of models ({len(model_names)}): {cluster_chain}")

    this_row = []
    for subreddit_strings, cluster_id in zip(cluster_id_to_subreddit_strings, cluster_chain):
        # Ids without an entry (such as the -1 placeholder) get an empty subreddit string
        this_row.extend([cluster_id, subreddit_strings.get(cluster_id, "")])

    # The chain length counts the months in which the chain actually has a cluster
    this_row.append(sum(1 for cluster_id in cluster_chain if cluster_id != -1))
    output_rows.append(this_row)


comparison_df = pd.DataFrame(output_rows, columns=header_row)
comparison_df = comparison_df.sort_values(by="Alignment chain length", ascending=False)
display(comparison_df)
comparison_df.to_csv(EXPERIMENTS_ROOT / "agglomerative_clustering_monthly_alignments.csv", index=False)


Unnamed: 0,2021-04 Agglomerative Clustering ward linkage 100 clusters,2021-04 Agglomerative Clustering ward linkage 100 clusters subreddits,2021-05 Agglomerative Clustering ward linkage 100 clusters,2021-05 Agglomerative Clustering ward linkage 100 clusters subreddits,2021-06 Agglomerative Clustering ward linkage 100 clusters,2021-06 Agglomerative Clustering ward linkage 100 clusters subreddits,2021-07 Agglomerative Clustering ward linkage 100 clusters,2021-07 Agglomerative Clustering ward linkage 100 clusters subreddits,2021-08 Agglomerative Clustering ward linkage 100 clusters,2021-08 Agglomerative Clustering ward linkage 100 clusters subreddits,...,2021-11 Agglomerative Clustering ward linkage 100 clusters subreddits,2021-12 Agglomerative Clustering ward linkage 100 clusters,2021-12 Agglomerative Clustering ward linkage 100 clusters subreddits,2022-01 Agglomerative Clustering ward linkage 100 clusters,2022-01 Agglomerative Clustering ward linkage 100 clusters subreddits,2022-02 Agglomerative Clustering ward linkage 100 clusters,2022-02 Agglomerative Clustering ward linkage 100 clusters subreddits,2022-03 Agglomerative Clustering ward linkage 100 clusters,2022-03 Agglomerative Clustering ward linkage 100 clusters subreddits,Alignment chain length
73,6,2007scape 7DSGrandCross AQW AdmiralBulldog Ana...,36,2007scape 7daystodie AdmiralBulldog AshesofCre...,23,AdmiralBulldog AshesofCreation BobsTavern Comp...,60,100thieves 2007scape AQW AdmiralBulldog Androi...,21,AdmiralBulldog ArenaHS AshesofCreation Asmongo...,...,2007scape 7kglobal AlchemyStarsEN AnimeNYC Ano...,40,2007scape AlanWake AshesofCreation BaldursGate...,45,2007scape AnarchyChess AshesofCreation Asmongo...,46,2007scape AdmiralBulldog Archero AshesofCreati...,26,2007scape AnarchyChess Archero AxieInfinity BA...,12
41,92,240sx 350z 370z AMG Acura Acura_RSX AskMechani...,56,240sx 350z 370z 4Runner 4x4 AMG ATV Acura AskM...,88,240sx 350z 370z AMG Acura AndroidAuto AskMecha...,22,240sx 300zx 350z 370z 3rdGen4Runner 4Runner 4x...,25,240sx 350z 370z 3rdGen4Runner 4Runner 4x4 AMG ...,...,240sx 350z 370z 3rdGen4Runner 4Runner 4x4 AMG ...,85,240sx 350z 370z AMG Acura AskMechanics Audi Au...,36,240sx 350z 370z AMG Acura AskMechanics Audi Au...,78,240sx 350z 370z 3rdGen4Runner 4Runner 4x4 AMG ...,58,240sx 350z 370z AMG Acura Acura_RSX AskMechani...,12
60,23,ActionFigures Arrowverse AskScienceFiction Bat...,78,ActionFigures CloneWarsMemes EmpireDidNothingW...,28,ActionFigures Arrowverse BatmanArkham Batwoman...,61,ActionFigures AlignmentCharts Arrowverse AskSc...,56,Arrowverse AskScienceFiction Avengers BatmanAr...,...,ActionFigures AskScienceFiction Avengers Batma...,23,Acceleracers ActionFigures Arrowverse Avengers...,90,AEWOfficial ActionFigures Avengers BatmanArkha...,22,4kbluray A24 AMCsAList Avengers BatmanArkham B...,39,4kbluray AMCsAList ActionFigures Avengers Baku...,12
59,25,13ReasonsWhy 52book AMillionLittleThings ANGEL...,1,13ReasonsWhy 23andme 30ROCK 911FOX ACNHTrade A...,78,13ReasonsWhy 23andme 30ROCK ACPocketCamp AMill...,16,13ReasonsWhy 30ROCK ANGEL ANTM AllAmericanTV A...,58,13ReasonsWhy ANTM ANW AfterPrisonShow AllAmeri...,...,911FOX ANGEL ANTM AllAmericanTV AmericanHorror...,33,911FOX ANGEL AmericanHorrorStory BoJackHorsema...,77,AmericanHorrorStory BoJackHorseman ChicagoPD D...,87,1883Series 30ROCK ANGEL AllOfUsAreDead America...,40,1883Series 30ROCK 911FOX 911LoneStar A24 ANGEL...,12
58,22,13or30 2healthbars ATBGE AbandonedPorn AbruptC...,11,2healthbars ATBGE AbruptChaos AbsoluteUnits Ac...,48,13or30 ATBGE AbruptChaos AbsoluteUnits ActualF...,33,2healthbars 2meirl4meirl ANormalDayInRussia AT...,18,2020PoliceBrutality ATBGE AbruptChaos Absolute...,...,AbruptChaos AbsoluteUnits ActualFreakouts Actu...,28,AbruptChaos AbsoluteUnits ActualFreakouts Actu...,53,13or30 ATBGE AbandonedPorn AccidentalRacism Ac...,95,AbruptChaos AbsoluteUnits BeAmazed BetterEvery...,41,AbruptChaos AbsoluteUnits ActualPublicFreakout...,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,-1,,-1,,-1,,-1,,96,ActionFigures Funko GalaxysEdge Legomarket Mar...,...,,-1,,-1,,-1,,-1,,1
132,-1,,-1,,-1,,-1,,83,196 196x 197 2american4you 4chan 691 Apandah B...,...,,-1,,-1,,-1,,-1,,1
134,-1,,-1,,-1,,-1,,51,AlternativeHistory AntiVaxxers Bibleconspiracy...,...,,-1,,-1,,-1,,-1,,1
138,-1,,-1,,-1,,88,Albany AskNYC BostonBruins Brooklyn Buffalo Ca...,-1,,...,,-1,,-1,,-1,,-1,,1
