# <font color="#49699E" size=40>Generative Network Analysis with Bayesian Stochastic Blockmodels</font>
# LEARNING OUTCOMES
# LEARNING MATERIALS
# INTRODUCTION


## Latent Network Structure: Connected Communities and Structural Positions


### Equivalence?


### Blockmodels


## Bayesian Hierarchical Stochastic Blockmodels


# BLOCKMODELLING WITH GRAPH-TOOL


## Installing graph-tool


## Understanding Property Maps


## Imports

In [None]:
from graph_tool.all import *
import pandas as pd
pd.set_option("display.notebook_repr_html", False)
import matplotlib
import numpy as np
import math
import pickle
from dcss.networks import label_radial_blockmodel, get_block_membership

## Data


In [None]:
# Load the Enron edge list; later cells use its 'source' and 'target' columns.
edges_df = pd.read_csv('../data/enron/enron_full_edge_list.csv')
# Preview the first rows of the edge list.
edges_df.head()

In [None]:
# Load the employee table; later cells use its 'id' and 'position' columns.
employee_df = pd.read_csv('../data/enron/enron_employees_updated.csv')
# Preview the first rows of the employee table.
employee_df.head()

In [None]:
# Number of employees recorded for each distinct 'position' value.
employee_df['position'].value_counts()

### Constructing the Communication Network


In [None]:
# Collapse repeated (source, target) rows into a single row per pair, with the
# number of repetitions stored in a 'count' column.
# NOTE: this rebinds edges_df, so re-running the cell without reloading the
# CSV counts the already-aggregated rows instead of the raw ones.
edges_df = (
    edges_df
    .value_counts(['source', 'target'])
    .reset_index(name='count')
    .copy()
)

# Ids of the employees listed in the employee table.
core_employees = set(employee_df['id'].tolist())

# Keep only the pairs whose two endpoints are both listed employees.
core_edges_df = edges_df[
    edges_df['source'].isin(core_employees)
    & edges_df['target'].isin(core_employees)
]

In [None]:
# Start from an empty directed graph; vertices and edges are added in later cells.
eG = Graph(directed = True)

In [None]:
# Parallel lists taken from the same rows: employee ids and their positions.
employee_list = employee_df['id'].tolist()
title_list = employee_df['position'].tolist()

In [None]:
# Maps an employee id to the integer index of its vertex in eG.
vertex_lookup = {}

# Property maps filled in while the graph is built:
# string-valued 'label' and 'title' per vertex, integer 'edge_weight' per edge.
label = eG.new_vertex_property('string')
title = eG.new_vertex_property('string')
edge_weight = eG.new_edge_property('int')

In [None]:
# Add one vertex per employee and record its attributes.
for emp_id, emp_title in zip(employee_list, title_list):
    v = eG.add_vertex()

    # Property maps are indexed by the vertex itself.
    label[v] = emp_id
    title[v] = emp_title

    # Remember the vertex's integer index so edges can be attached by id later.
    vertex_lookup[emp_id] = int(v)

In [None]:
# Column-wise views of the core edge table, consumed row by row below.
source_list = core_edges_df['source'].tolist()
target_list = core_edges_df['target'].tolist()
weight_list = core_edges_df['count'].tolist()

for src_id, dst_id, pair_count in zip(source_list, target_list, weight_list):
    # Translate employee ids into vertex indices.
    from_idx = vertex_lookup[src_id]
    to_idx = vertex_lookup[dst_id]

    # Self-loops are left out of the graph.
    if from_idx == to_idx:
        continue

    edge = eG.add_edge(from_idx, to_idx)
    edge_weight[edge] = pair_count

In [None]:
# Register the property maps on the graph under string keys so that later
# cells can fetch them through eG.vertex_properties / eG.edge_properties.
eG.vertex_properties['label'] = label
eG.vertex_properties['title'] = title
eG.edge_properties['edge_weight'] = edge_weight

# Store the id -> vertex index dictionary on the graph as an 'object'-typed
# graph property named 'vertex_lookup'.
lookup = eG.new_graph_property('object')
lookup[eG] = vertex_lookup
eG.graph_properties['vertex_lookup'] = lookup

## Developing Stochastic Blockmodels


In [None]:
# Fit a nested blockmodel to the unweighted graph with deg_corr enabled.
# NOTE(review): recent graph-tool releases appear to expect deg_corr inside
# state_args rather than as a direct keyword — confirm against the installed version.
state = minimize_nested_blockmodel_dl(eG, deg_corr = True)

In [None]:
# Print a summary of the fitted nested state.
state.print_summary()

In [None]:
# Draw the unweighted nested blockmodel with the "sfdp" layout, labelling each
# vertex with its title, and write the figure to a PNG file.
state.draw(
    layout="sfdp",
    vertex_text=eG.vertex_properties['title'],
    eorder=eG.edge_properties['edge_weight'],
    # NOTE(review): graph-tool documents positive text positions as an angle
    # in radians; 315 looks like degrees — confirm the intended placement.
    vertex_text_position=315,
    # Opaque white background. RGBA components belong in the 0-1 range; the
    # previous [255, 255, 255, 1] only rendered white because of clamping.
    bg_color=[1, 1, 1, 1],
    output_size=[4024, 4024],
    output='../figures/core_enron_blockmodel_sfdp.png',
)

In [None]:
# Fit a second nested blockmodel that also models the integer edge weights,
# passed as an edge covariate of type "discrete-binomial".
state_w = minimize_nested_blockmodel_dl(
    eG,
    deg_corr=True,
    state_args={
        'recs': [eG.edge_properties['edge_weight']],
        'rec_types': ["discrete-binomial"],
    },
)

In [None]:
# Print a summary of the weighted nested state.
state_w.print_summary()

In [None]:
# Same weighted model as before, but with B_min and B_max both set to 12 so
# the number of blocks is pinned to that value.
state_w2 = minimize_nested_blockmodel_dl(
    eG,
    deg_corr=True,
    B_min=12,
    B_max=12,
    state_args={
        'recs': [eG.edge_properties['edge_weight']],
        'rec_types': ["discrete-binomial"],
    },
)

In [None]:
# Print a summary of the weighted state fitted with B_min = B_max = 12.
state_w2.print_summary()

In [None]:
# Draw the weighted nested blockmodel with the "sfdp" layout, labelling each
# vertex with its title, and write the figure to a PNG file.
state_w2.draw(
    layout="sfdp",
    vertex_text=eG.vertex_properties['title'],
    eorder=eG.edge_properties['edge_weight'],
    # NOTE(review): graph-tool documents positive text positions as an angle
    # in radians; 315 looks like degrees — confirm the intended placement.
    vertex_text_position=315,
    # Opaque white background. RGBA components belong in the 0-1 range; the
    # previous [255, 255, 255, 1] only rendered white because of clamping.
    bg_color=[1, 1, 1, 1],
    output_size=[4024, 4024],
    output='../figures/core_enron_blockmodel_sfdpw.png',
)

In [None]:
# Attach block assignments from the unweighted model to the employee table.
# Later cells read the result through a 'model_uw_1_block_id' column.
employee_blocks_df = get_block_membership(state, eG, employee_df,
                                         'model_uw_1')
# Feed the result back in to add the weighted model's assignments as well
# (read later through 'model_w_2_block_id').
employee_blocks_df = get_block_membership(state_w2, eG, employee_blocks_df,
                                         'model_w_2')

In [None]:
# For every position, gather each remaining column's values into a list.
df_by_position = employee_blocks_df.groupby('position').agg(list)
# Show only the rows for three positions of interest.
df_by_position[df_by_position.index.isin(['CEO','President', 'In House Lawyer'])].head()

In [None]:
# Per position: number of distinct unweighted-model blocks divided by the
# number of non-null block ids in that position.
employee_blocks_df.groupby(['position'])['model_uw_1_block_id'].agg(lambda x: x.nunique()/x.count())

In [None]:
# For each model, print the sum over positions of
# (distinct block ids / non-null block ids) — unweighted model first.
for block_col in ('model_uw_1_block_id', 'model_w_2_block_id'):
    by_position = employee_blocks_df.groupby(['position'])[block_col]
    print(by_position.agg(lambda x: x.nunique()/x.count()).sum())

In [None]:
# Per unweighted-model block: number of distinct positions divided by the
# number of non-null positions in that block.
employee_blocks_df.groupby(['model_uw_1_block_id'])['position'].agg(lambda x: x.nunique()/x.count())

In [None]:
# For each model, print the sum over blocks of
# (distinct positions / non-null positions) — unweighted model first.
for block_col in ('model_uw_1_block_id', 'model_w_2_block_id'):
    by_block = employee_blocks_df.groupby([block_col])['position']
    print(by_block.agg(lambda x: x.nunique()/x.count()).sum())

In [None]:
from sklearn.metrics import homogeneity_score, completeness_score, adjusted_mutual_info_score

In [None]:
# Homogeneity of the unweighted model's blocks, with 'position' passed as the
# reference labelling (first argument).
homogeneity_score(employee_blocks_df['position'], employee_blocks_df['model_uw_1_block_id'])

In [None]:
# Homogeneity of the weighted model's blocks, with 'position' passed as the
# reference labelling (first argument).
homogeneity_score(employee_blocks_df['position'], employee_blocks_df['model_w_2_block_id'])

In [None]:
# Completeness of the unweighted model's blocks, with 'position' passed as the
# reference labelling (first argument).
completeness_score(employee_blocks_df['position'], employee_blocks_df['model_uw_1_block_id'])

In [None]:
# Completeness of the weighted model's blocks, with 'position' passed as the
# reference labelling (first argument).
completeness_score(employee_blocks_df['position'], employee_blocks_df['model_w_2_block_id'])

In [None]:
# Adjusted mutual information between positions and the unweighted model's blocks.
adjusted_mutual_info_score(employee_blocks_df['position'], employee_blocks_df['model_uw_1_block_id'])

In [None]:
# Adjusted mutual information between positions and the weighted model's blocks.
adjusted_mutual_info_score(employee_blocks_df['position'], employee_blocks_df['model_w_2_block_id'])

In [None]:
# Adjusted mutual information between the two models' block assignments.
adjusted_mutual_info_score(employee_blocks_df['model_w_2_block_id'], employee_blocks_df['model_uw_1_block_id'])

##  Model Selection and Optimization


In [None]:
# Fit the unweighted nested blockmodel ten separate times.
states = [
    minimize_nested_blockmodel_dl(eG, deg_corr=True)
    for _ in range(10)
]

# Print the entropy of each fit so the runs can be compared.
for s in states:
    print(s.entropy())

In [None]:
# Keep the fit with the smallest entropy (the first one, if several tie).
state = states[np.argmin([s.entropy() for s in states])]

## More MCMC

In [None]:
# Record the entropy of the selected state before running more MCMC, then display it.
S1 = state.entropy()
S1

In [None]:
# Copy the state with four extra hierarchy levels appended (each a
# one-element array of zeros) and with sampling=True set for the MCMC run below.
state = state.copy(bs=state.get_bs() + [np.zeros(1)] * 4,sampling = True)

In [None]:
# Nested partitions gathered during sampling, one entry per callback call.
bs = []


def collect_partitions(s):
    """Append the nested partition of state ``s`` to the module-level ``bs`` list.

    Intended for use as a callback: ``s`` only needs a ``get_bs()`` method.
    Nothing is returned; the list is updated in place.
    """
    bs.append(s.get_bs())
        
# Run the sampler with force_niter=10000 and niter=10 passed to each MCMC call;
# collect_partitions is invoked as the callback to record the partitions.
mcmc_equilibrate(state, force_niter=10000, mcmc_args=dict(niter=10), callback=collect_partitions)

In [None]:
# Change in entropy relative to the value recorded in S1 before sampling.
state.entropy() - S1

In [None]:
# Build a partition mode from the collected nested partitions.
pmode = PartitionModeState(bs, nested=True, converge=True)

# Per-vertex marginals from the mode, stored on the graph under the key 'pv'.
pv = pmode.get_marginal(eG)
eG.vertex_properties['pv'] = pv

In [None]:
# NOTE: this rebinds bs, replacing the list of collected partitions with the
# single nested partition returned by get_max_nested().
bs = pmode.get_max_nested()
# Rebuild the state from that partition and display its entropy.
state = state.copy(bs=bs)
state.entropy()

In [None]:
# Add the post-MCMC block assignments (read below via 'model_uw_mcmc_block_id').
employee_blocks_df = get_block_membership(state, eG, employee_blocks_df, 'model_uw_mcmc')
# Homogeneity of those blocks, with 'position' as the reference labelling.
homogeneity_score(employee_blocks_df['position'], employee_blocks_df['model_uw_mcmc_block_id'])

In [None]:
# Completeness of the post-MCMC blocks, with 'position' as the reference labelling.
completeness_score(employee_blocks_df['position'], employee_blocks_df['model_uw_mcmc_block_id'])

In [None]:
# Adjusted mutual information between positions and the post-MCMC blocks.
adjusted_mutual_info_score(employee_blocks_df['position'], employee_blocks_df['model_uw_mcmc_block_id'])

## Visualizing Block Connections as a Radial Tree


In [None]:
# Draw the current state with all drawing options left at their defaults.
state.draw()

In [None]:
# Replace eG with the graph returned by the dcss helper.
# NOTE(review): presumably this adds the 'pos', 'cts' and 'text_rot' properties
# that the radial drawing cell reads — confirm in dcss.networks.
eG = label_radial_blockmodel(eG, state)

In [None]:
# Draw the state using the stored vertex positions, edge control points and
# text rotations, with each vertex shown as a pie of its 'pv' marginals, and
# write the figure to a PDF file.
state.draw(
    vertex_text=eG.vertex_properties['title'],
    eorder=eG.edge_properties['edge_weight'],
    vertex_shape='pie',
    vertex_pie_fractions=eG.vertex_properties['pv'],
    edge_control_points=eG.edge_properties['cts'],
    pos=eG.vertex_properties['pos'],
    vertex_size=10,
    edge_pen_width=0.2,
    # Opaque white background. RGBA components belong in the 0-1 range; the
    # previous [255, 255, 255, 1] only rendered white because of clamping.
    bg_color=[1, 1, 1, 1],
    vertex_text_rotation=eG.vertex_properties['text_rot'],
    vertex_text_position=0,
    output='../figures/core_state_radial_tree_labels.pdf',
)

## TopSBM: A Unified Bayesian Approach to Latent Variable Modelling for Text and Networks


In [None]:
# Load the pre-fitted TopSBM model from disk. The context manager closes the
# file handle once loading finishes (the bare open() call never closed it).
# NOTE: unpickling can execute arbitrary code — only load pickles from a
# trusted source.
with open('../data/pickles/can_hansard_100k_sample_topSBM.pkl', 'rb') as pickle_file:
    topSBM_model = pickle.load(pickle_file)

In [None]:
# Ask the model for its topics with l=1 and n=20; the result is indexed by
# topic number in the next cell.
# NOTE(review): presumably l is the hierarchy level and n the number of words
# per topic — confirm against the TopSBM implementation.
topic_dict = topSBM_model.topics(l=1,n=20)

In [None]:
# Build one two-column frame (words, scores) per selected topic, then place
# the frames side by side in a single table.
df_list = []
for topic in [76, 91, 200, 228, 104, 126]:
    df = pd.DataFrame.from_records(
        topic_dict[topic],
        columns=[f'words_{topic}', f'scores_{topic}'],
    )
    df_list.append(df)

topic_df = pd.concat(df_list, axis=1)
# Show up to the first 20 rows.
topic_df.head(20)

# CONCLUSION
## Key Points 
