## Description
This Jupyter Notebook is used to generate thematic maps based on the extracted terms from publication titles.


# Libraries

In [8]:
import pandas as pd
import random
import igraph as ig

# Data
- df_terms = extracted keywords from the publications
- df_doc_terms = relationship between publications and terms

In [3]:
# Load the two tab-separated inputs from data/. Later cells read the `id` and
# `term` columns of df_terms and the `document id` and `term id` columns of
# df_doc_terms.
df_terms, df_doc_terms = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('data/terms_all.tsv', 'data/doc_terms_all.tsv')
)

# Co-occurrence network

From the terms and documents, a co-occurrence table of terms is generated. Binary counting is used (each pair of terms is counted at most once per document) because the documents are titles, which are short and should not contain many terms.

In [None]:
# Pair every term with every other term of the same document via a self-join
# on 'document id', then discard the pairs that match a term with itself.
doc_pairs = df_doc_terms.merge(df_doc_terms, how='inner', on='document id')
doc_pairs = doc_pairs[doc_pairs['term id_x'] != doc_pairs['term id_y']]

# The self-join yields both (a, b) and (b, a). Putting the larger id in
# keyword_1 and the smaller one in keyword_2 gives every unordered pair a
# single representation, i.e. an undirected edge list.
pair_ids = doc_pairs[['term id_x', 'term id_y']]
df_co = (
    doc_pairs
    # assign() builds a new frame instead of writing columns into a filtered
    # slice, which is what triggers pandas' SettingWithCopyWarning.
    .assign(keyword_1=pair_ids.max(axis=1), keyword_2=pair_ids.min(axis=1))
    [['document id', 'keyword_1', 'keyword_2']]
    .drop_duplicates()  # binary counting: one row per pair per document
    [['keyword_1', 'keyword_2']]
    .copy()
)
df_co

To normalize the edge weights, the number of co-occurrence relationships in which each term takes part is calculated first.

In [None]:
# Count how often each term id appears at either end of a row of df_co.
# reset_index(name='Freq') names the count column explicitly: its default
# label is 'count' on pandas >= 2.0 but 0 on older releases, where a
# rename({'count': 'Freq'}) silently does nothing and the later lookups of
# Freq_x / Freq_y fail.
df_key_freq = (
    pd.DataFrame({'keyword': df_co.keyword_1.tolist() + df_co.keyword_2.tolist()})
    .value_counts()
    .reset_index(name='Freq')
)
df_key_freq

In [15]:
# Collapse the pair rows into one row per (keyword_1, keyword_2); the number of
# rows in each group is stored in the 'weight' column.
# This overwrites df_co, so re-running the cell on its own output would count
# the already aggregated rows instead.
pair_groups = df_co.groupby(['keyword_1', 'keyword_2'])
df_co = pair_groups.size().reset_index(name='weight')

The normalized weight of a pair is twice its co-occurrence count divided by the sum of the frequencies of its two terms: `new_weight = 2 * weight / (Freq_x + Freq_y)`. The closer it is to 0, the weaker the relationship is.

In [None]:
# Attach the frequency of each endpoint and derive the normalised weight.
# After the two merges Freq_x belongs to keyword_1 and Freq_y to keyword_2
# (pandas adds the _x/_y suffixes because both merges bring in a 'Freq' column).
# The chain uses assign() rather than setting a column on a column-subset
# slice, which is what raises pandas' SettingWithCopyWarning.
df_co = (
    df_co
    .merge(df_key_freq, how='inner', left_on='keyword_1', right_on='keyword')
    .merge(df_key_freq, how='inner', left_on='keyword_2', right_on='keyword')
    [['keyword_1', 'keyword_2', 'weight', 'Freq_x', 'Freq_y']]
    # new_weight = 2 * weight / (Freq_x + Freq_y)
    .assign(new_weight=lambda pairs: 2 * pairs['weight'] / (pairs['Freq_x'] + pairs['Freq_y']))
)
df_co

Now I include the terms to better identify them when building the network.

In [None]:
# Look up the term row of both endpoints. Because df_terms is merged twice,
# its columns come out suffixed: *_x describes keyword_1, *_y keyword_2.
df_co_n = (
    df_co
    .merge(df_terms, how='inner', left_on='keyword_1', right_on='id')
    .merge(df_terms, how='inner', left_on='keyword_2', right_on='id')
)
df_co_n

Rows with a missing (NA) term label are removed.

In [None]:
# Keep only the pairs whose two term labels are both present.
# NOTE(review): read_csv's default NA parsing turns literal terms such as
# 'NA' or 'null' into missing values -- confirm that dropping them is intended.
df_co_n = df_co_n.dropna(subset=['term_x', 'term_y'])
df_co_n

The matrix is filtered to keep only the terms that co-occur at least twice.

In [None]:
# Drop the pairs seen only once: a weight above 1 means at least two rows were
# counted for the pair.
seen_more_than_once = df_co_n['weight'].gt(1)
df_co_n = df_co_n.loc[seen_more_than_once]
df_co_n

# Network and cluster analysis

Network creation including the weighted weights.

In [20]:
# Build the undirected term graph. The first two columns are the edge
# endpoints (term labels rather than vertex indices, hence use_vids=False);
# the remaining column, new_weight, travels along as an edge attribute.
edge_table = df_co_n[['term_x', 'term_y', 'new_weight']]
g = ig.Graph.DataFrame(edge_table, directed=False, use_vids=False)

Cluster detection based on the Leiden algorithm.

In [None]:
# The Leiden algorithm is stochastic. python-igraph takes its random numbers
# from Python's `random` module by default, so seeding it here makes the
# partition (and every table derived from it) repeatable across kernel restarts.
random.seed(42)

# Alternative kept from earlier experiments: objective_function='CPM', parameter=1e-4
cluster = g.community_leiden(
    objective_function='modularity',
    resolution=20,  # 15 works fine with all 2
    weights='new_weight',
    n_iterations=10,
)
cluster.modularity

Cluster sizes are skewed.

In [None]:
# Cluster id of each vertex, followed by the number of vertices per cluster
# (value_counts orders the result from the largest cluster down).
clusters = pd.Series(data=cluster.membership)
clusters_freq = clusters.value_counts(sort=True, ascending=False)
clusters_freq

In [None]:
# Histogram of the sizes of the clusters that have more than four members.
large_cluster_sizes = clusters_freq.loc[clusters_freq.gt(4)]
large_cluster_sizes.plot.hist(bins=20)

Transform the graph vertices into a dataframe.

In [65]:
# Store on every vertex the cluster it was assigned to and its degree.
g.vs['cluster'] = cluster.membership
g.vs['degree'] = g.degree()

In [None]:
# One column per vertex attribute, one row per vertex, highest degree first.
node_df = (
    pd.DataFrame({name: g.vs[name] for name in g.vertex_attributes()})
    .sort_values('degree', ascending=False)
)
node_df

The edges are aggregated by cluster (their weights are summed) to generate a new network whose nodes are the clusters.

In [69]:
# Contract each cluster into a single vertex; edges that end up merged get the
# sum of their new_weight values.
g_cluster = cluster.cluster_graph(combine_edges={'new_weight': 'sum'})

# Edge list of the cluster graph. get_edge_dataframe() indexes the rows by
# 'edge ID'; that index is discarded in favour of a plain 0..n-1 one.
g_cluster_df = g_cluster.get_edge_dataframe().reset_index(drop=True)

The cluster graph is filtered to keep only clusters with at least five terms, in order to remove isolated topics.

In [72]:
# Ids of the clusters with more than four members (i.e. at least five).
kept_clusters = clusters_freq[clusters_freq > 4].index

# Keep an edge only when both of its endpoints are among those clusters.
source_kept = g_cluster_df['source'].isin(kept_clusters)
target_kept = g_cluster_df['target'].isin(kept_clusters)
g_cluster_df = g_cluster_df[source_kept & target_kept]
g_cluster_df

In [74]:
# Export the filtered cluster edge list as tab-separated rows, without a
# header line and without the index column.
g_cluster_df.to_csv(
    'data/coocurrences_cluster.txt',
    sep='\t',
    header=False,
    index=False,
)

In [75]:
# Vertex table: one column per vertex attribute currently stored on the graph.
attribute_columns = {name: g.vs[name] for name in g.vertex_attributes()}
vertex = pd.DataFrame(attribute_columns)

In [82]:
# Keep only the vertices that belong to a cluster with more than four members.
kept_clusters = clusters_freq[clusters_freq.gt(4)].index
vertex = vertex.loc[vertex['cluster'].isin(kept_clusters)]

In [83]:
# Export the filtered vertex table as a tab-separated file with a header row
# and no index column.
vertex.to_csv(
    'data/vertex.txt',
    sep='\t',
    index=False,
)