In [None]:
import sys 
sys.path.append('..')
from utils.utils import load_dataframes, load_dataframe, write_df_to_csv, write_dict_to_csv
from utils.paths import HASHTAG_FREQUENCY_DIR, HASHTAGS_PER_USER_DIR, GREEDY_MODULARITY_DIR, make_dirs, make_dir
import pandas as pd
import networkx as nx
import os
from networkx.algorithms.community import greedy_modularity_communities
from itertools import combinations

# Run the directory-setup helper imported from utils.paths
# (presumably creates the project's output folders — TODO confirm against utils/paths.py).
make_dirs()

# Minimum frequency a hashtag needs to be kept: generate_graph_communities drops
# every hashtag whose 'count' is below this value.
min_count = 10

In [None]:
def generate_graph_communities(df_frequencies, df_users, min_count, forbidden_hashtags=None):
    """Build a weighted, undirected hashtag co-occurrence graph.

    Nodes are the hashtags of ``df_frequencies`` whose ``count`` is not below
    ``min_count`` and that are not listed in ``forbidden_hashtags``; each node
    carries that count as its ``count`` attribute. For every row of
    ``df_users``, every pair of retained hashtags in the row's ``hashtags``
    value is connected by an edge. A row with ``k`` retained hashtags adds
    ``1/k`` to the ``weight`` of each of those edges.

    Neither input DataFrame is modified, so the function can be called
    repeatedly (or the cell re-run) on the same data.

    Args:
        df_frequencies: DataFrame with a ``hashtag`` and a ``count`` column.
        df_users: DataFrame with a ``hashtags`` column whose values are
            iterables of hashtags (presumably one row per user — TODO confirm
            against the loader).
        min_count: Hashtags with ``count`` below this value are ignored.
        forbidden_hashtags: Optional iterable of hashtags to exclude from the
            graph entirely. ``None`` (the default) excludes nothing.

    Returns:
        networkx.Graph: nodes with a ``count`` attribute, edges with a
        ``weight`` attribute.
    """
    forbidden = set(forbidden_hashtags) if forbidden_hashtags is not None else set()

    # Boolean-mask selection yields new frames, so the caller's data stays
    # untouched. `~(count < min_count)` keeps exactly the rows the threshold
    # does not reject.
    frequent = df_frequencies[~(df_frequencies['count'] < min_count)]
    kept = frequent[~frequent['hashtag'].isin(list(forbidden))]

    G = nx.Graph()

    # Add a node to the graph for every retained hashtag
    for hashtag, count in zip(kept['hashtag'], kept['count']):
        G.add_node(hashtag, count=count)

    # Set membership keeps the per-user filtering linear in the list length.
    allowed = set(kept['hashtag'])

    # For every user: add edges between all nodes corresponding to pairs of
    # hashtags that they mentioned
    for user_hashtags in df_users['hashtags']:
        mentioned = [hashtag for hashtag in user_hashtags if hashtag in allowed]
        if not mentioned:
            continue

        # Each user adds 1/(number of retained hashtags they mentioned) to the
        # weight of every edge they are involved in.
        # NOTE(review): combinations() pairs by position, so a hashtag repeated
        # in one user's list yields a self-loop pair and counts towards k.
        weight = 1 / len(mentioned)
        for first, second in combinations(mentioned, 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += weight
            else:
                G.add_edge(first, second, weight=weight)
    return G

In [None]:
def find_clusters(out_super_dir, min_count, quarter, resolution, df_frequency, df_user, forbidden_hashtags):
    """Detect hashtag communities for one quarter and write them out as CSV.

    Builds the co-occurrence graph via ``generate_graph_communities`` and, if
    it has at least one edge, partitions it with
    ``greedy_modularity_communities``. For every community one file is written
    (hashtag -> count, most frequent first), named after its up-to-three most
    frequent hashtags, each followed by ``'_'``. Finally an ``all_clusters``
    summary is written, mapping each cluster name to the sum of the edge
    weights inside that community, largest first.

    Args:
        out_super_dir: Parent directory; output goes to ``out_super_dir/quarter``.
        min_count: Minimum hashtag count, forwarded to the graph builder.
        quarter: Name of the quarter, used as the sub-directory name.
        resolution: Resolution passed to ``greedy_modularity_communities``.
        df_frequency: Hashtag frequency DataFrame for the quarter.
        df_user: Hashtags-per-user DataFrame for the quarter.
        forbidden_hashtags: Hashtags to exclude from the graph.

    Returns:
        None. When the graph has no edges, only the directory step runs.
    """
    quarter_dir = os.path.join(out_super_dir, quarter)
    make_dir(quarter_dir)
    graph = generate_graph_communities(df_frequency, df_user, min_count, forbidden_hashtags)

    # Without edges there are no communities to report.
    if len(graph.edges) == 0:
        return

    cluster_sizes = {}
    for members in greedy_modularity_communities(graph, weight='weight', resolution=resolution):
        subgraph = graph.subgraph(members)
        edge_weights = nx.get_edge_attributes(subgraph, 'weight')
        node_counts = nx.get_node_attributes(subgraph, 'count')

        # Hashtags of this community, most frequent first.
        ranked = dict(sorted(node_counts.items(), key=lambda entry: entry[1], reverse=True))

        # Cluster name: the (up to) three most frequent hashtags, each followed by '_'.
        cluster_name = ''.join(tag + '_' for tag in list(ranked)[:3])

        # The size is the total edge weight inside the community; it is the
        # value stored under the 'n_users' column of the summary file.
        cluster_sizes[cluster_name] = sum(edge_weights.values())
        write_dict_to_csv(ranked, cluster_name, quarter_dir, ['hashtag', 'count'])

    largest_first = dict(sorted(cluster_sizes.items(), key=lambda entry: entry[1], reverse=True))
    write_dict_to_csv(largest_first, 'all_clusters', quarter_dir, ['cluster', 'n_users'])
    
    
def make_greedy_modularity_sub_dir(min_count, resolution):
    """Return the output directory for one (min_count, resolution) setting.

    The directory lives under ``GREEDY_MODULARITY_DIR`` and is named
    ``min_count_<min_count>_res_<resolution>``. ``make_dir`` is called on the
    path before it is returned.

    Args:
        min_count: Minimum hashtag count used for the run.
        resolution: Resolution used for the community detection.

    Returns:
        The path of the sub-directory.
    """
    sub_dir_name = ''.join(['min_count_', str(min_count), '_res_', str(resolution)])
    target_dir = os.path.join(GREEDY_MODULARITY_DIR, sub_dir_name)
    make_dir(target_dir)
    return target_dir


## Find clusters of related hashtags
- Generate a graph from the hashtags used in each district
- Find communities of hashtags that are frequently mentioned together

In [None]:
# Inputs for the run; both collections are looked up by quarter name below.
hashtags_per_user_dict = load_dataframes(HASHTAGS_PER_USER_DIR, 2)
frequencies = load_dataframes(HASHTAG_FREQUENCY_DIR)

# Clustering parameters: minimum hashtag count and modularity resolution.
min_count, resolution = 10, 1.8

# All quarters of this run share one parent directory named after the parameters.
out_dir = make_greedy_modularity_sub_dir(min_count, resolution)

for quarter in hashtags_per_user_dict:
    # The quarter's own name is excluded from its hashtag graph.
    forbidden_hashtags = [quarter]
    df_frequency = frequencies[quarter]
    df_user = hashtags_per_user_dict[quarter]
    find_clusters(
        out_super_dir=out_dir,
        min_count=min_count,
        quarter=quarter,
        resolution=resolution,
        df_frequency=df_frequency,
        df_user=df_user,
        forbidden_hashtags=forbidden_hashtags,
    )