# Import stuff here

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.preprocessing import Normalizer
from gensim import corpora, models, matutils
from gensim.models.ldamulticore import LdaMulticore
from scipy.spatial.distance import cdist
from pprint import pprint
import itertools
from copy import deepcopy
from itertools import chain
import scipy
import os

In [2]:
from input_values import (
    TV_SHOW, 
    PRE_PROCESSED_FILE_NAME, 
    LDA_FILE_NAME, 
    OUT_DIR, 
    GAMMA, 
    TOLERANCE, 
    ITERATIONS, 
    NUM_TOPICS,
    GRAPHML_FILE,
    GRAPH_NODE_FILE,
    TOPIC_VEC_FILE,
    TWITTER_RANK_FILE
)

In [3]:
TV_SHOW, PRE_PROCESSED_FILE_NAME,GRAPHML_FILE, GRAPH_NODE_FILE, TOPIC_VEC_FILE, TWITTER_RANK_FILE, GRAPHML_FILE

('YouNetflix_new',
 'YouNetflix_new',
 '../tvshows/output/YouNetflix_new.graphml',
 'YouNetflix_new_graph_nodes_with_no_isolated_nodes.csv',
 'YouNetflix_new_topic_frame.csv',
 'YouNetflix_new_topic_rank_frame.csv',
 '../tvshows/output/YouNetflix_new.graphml')

In [4]:
NUM_OF_INFLUENTIAL_NODES = 100
threshold_percentile_for_merge = 25
threshold_percentile_for_split = 25

# Import the graph

In [5]:
def remove_isolated_nodes(G):
    print(nx.info(G))
    isolated_nodes = list(nx.isolates(G))
    print('\nIsolated nodes: {}\n'.format(len(isolated_nodes)))
    print('removing isolated nodes...\n')
    G.remove_nodes_from(isolated_nodes)
    #G = G.remove_edges_from(nx.selfloop_edges(G))
    return G

def load_graph(graph_file = GRAPHML_FILE):

    graph = nx.read_graphml(GRAPHML_FILE)
    print(nx.info(graph))
    
    return graph

def write_graph(G, graph_file):
    nx.write_graphml(G, graph_file)
    
def add_weights(graph):
    degree_list = ['retweet_count', 'mention_count', 'reply_count', 'quote_count']
    attrs = {}
    for (node1,node2,*data) in graph.edges(data=True):
        weight = sum([value for key, value in data[0].items() if key in degree_list])
        attrs[(node1, node2)] = {'weight': weight}
    nx.set_edge_attributes(graph, attrs)
    return graph

# Import dataframes :-
## user_id_df, graph_nodes_df, twitter_rank_df, topic_vec_df

In [6]:
def get_topic_frame(file, graph_node_file=GRAPH_NODE_FILE):
    graph = pd.read_csv(graph_node_file)
    graph = graph.rename(columns = {'Unnamed: 0':'userid'})
    topic = pd.read_csv(file)
    topic = topic.rename(columns = {'Unnamed: 0':'userid'})
    columns = topic.columns
    new = pd.merge(graph,topic,on = 'userid',how = 'left')
    dic = {'0_x':'0_y','1_x':'1_y','2_x':'2_y','3_x':'3_y','4_x':'4_y','5_x':'5_y',
           '6_x':'6_y','6_x':'6_y','7_x':'7_y','8_x':'8_y','9_x':'9_y'}
    for i in dic:
        new[i] = new[i] + new[dic[i]]
    new = new.drop(columns = dic.values())
    new.fillna(0.1, inplace=True)
    new.set_index('userid', inplace=True)
    return new

In [7]:
pd.read_csv(GRAPH_NODE_FILE).index.dtype

dtype('int64')

# Community Detection algorithm below

In [8]:
def get_influential_nodes(graph, df, twitter_rank_df, num_topics=NUM_TOPICS, 
                          num_of_influential_nodes=NUM_OF_INFLUENTIAL_NODES):
    topic_rank = twitter_rank_df.values
    topic_rank_sum = np.sum(topic_rank/num_topics, axis=1)
    average_twitter_rank = np.array(topic_rank_sum[:])

    df['avg_twitter_rank'] = average_twitter_rank
    influential_nodes_index = np.argsort(average_twitter_rank, axis=0).reshape(
        len(average_twitter_rank),1)[:num_of_influential_nodes, :]

    influential_nodes_index = [int(i) for i in influential_nodes_index]
    influential_nodes = df.iloc[influential_nodes_index].index.tolist()
    
    return influential_nodes

In [9]:
# Community detection algorithm here

def step1_assign_initial_communities(graph, df, influential_nodes):
    # Initial community assignment
    influential_nodes = [int(i) for i in influential_nodes]
    communities = {k: [k] for k in influential_nodes}

    # Remaining nodes
    remaining_nodes = set(df.index) - set(influential_nodes)

    # Get the weighted adjacency matrix
    adjacency_matrix = nx.adjacency_matrix(graph, weight='weight').todense()
    adjacency_matrix_df = pd.DataFrame(data=adjacency_matrix, index=df.index, 
                                       columns=df.index)
    #print(adjacency_matrix_df)

    # Get nodes weight with influential users
    nodes_weight_with_influential_nodes_df = adjacency_matrix_df[influential_nodes]
    nodes_weight_with_influential_nodes_df['max_weight_with_influencer'] = nodes_weight_with_influential_nodes_df[
        nodes_weight_with_influential_nodes_df > 0].idxmax(axis=1)
    nodes_weight_with_influential_nodes_df['max_weight_with_influencer'].fillna(False, inplace=True)
    # Main algorithm here


    for node in remaining_nodes:
      # add this node to an influencer’s community if this influencer 
      # and this node have the highest edge weight
      influencer_with_max_weight_with_node = nodes_weight_with_influential_nodes_df[
          'max_weight_with_influencer'].loc[node]
      if isinstance(influencer_with_max_weight_with_node, np.int64):
        communities[influencer_with_max_weight_with_node].append(node)
    
    return communities

In [10]:
def get_cosine_sim_df(df):
    topic_vectors = df.values
    #print(df.values.shape)
    norm_topic_vectors = topic_vectors / np.linalg.norm(topic_vectors, axis=-1)[:, np.newaxis]
    cosine_sim = np.dot(norm_topic_vectors, norm_topic_vectors.T)
    cosine_sim_df = pd.DataFrame(data = cosine_sim, index=df.index, columns=df.index)
    return cosine_sim_df

def step2_split_community(communities, threshold_percentile_for_split, cosine_sim_df):

    # Split these initial communities based on topic vectors.  
    # Given a community of m nodes, we can compute the pairwise cosine-distance of 
    # the topical vectors. This will give us m(m-2)/2 distances.  
    # We then remove a node if its cosine distances from all its neighbors 
    # are below a threshold, say, the first quartile of all the m(m-2)/2 distances.


    #topic_vectors = df.values
    #cosine_sim = np.array([]).reshape(len(df),0)
    #if os.path.isfile(TV_SHOW + '_cosine_sim_frame.csv'+ str(len(communities)) +
    #                  str(threshold_percentile_for_split)+ '.csv'):
    #    cosine_sim_df = 
    #for k, node in enumerate(df.index.tolist()):
    #  #cosine_sim = np.c_[cosine_sim, 1 - cdist(topic_vectors, np.matrix(df.loc[node])[:,:NUM_TOPICS], 
    #  #                                         metric='cosine')]
    #  b = np.matrix(df.loc[node])[:,:NUM_TOPICS]
    #  cosine_sim = np.c_[cosine_sim , np.dot(topic_vectors, b.T) / np.outer(np.linalg.norm(topic_vectors, axis=1), 
    #                                                                        np.linalg.norm(b, axis=1))]
    #  if k % 5000 == 0:
    #        print('cosine_sim with {} nodes done'.format(k))
    #cosine_sim_df = pd.DataFrame(data=cosine_sim, index=df.index, columns=df.index)
    
    #topic_vectors = df.values
    #print(df.values.shape)
    #norm_topic_vectors = topic_vectors / np.linalg.norm(topic_vectors, axis=-1)[:, np.newaxis]
    #cosine_sim = np.dot(norm_topic_vectors, norm_topic_vectors.T)
    #cosine_sim_df = pd.DataFrame(data = cosine_sim, index=df.index, columns=df.index)
    #cosine_sim_df.to_csv(TV_SHOW + '_cosine_sim_frame.csv'+ str(len(communities)) + 
    #                     str(threshold_percentile_for_split)+ '.csv')
    mapped_nodes_in_communities = list(itertools.chain(*communities.values()))
    community_cosine_sim_df = cosine_sim_df.loc[mapped_nodes_in_communities][mapped_nodes_in_communities]
    community_cosine_sim_df.to_csv(TV_SHOW + '_community_cosine_sim_frame'+ str(len(communities)) + 
                                   str(threshold_percentile_for_split)+ '.csv')
    split_threshold = np.percentile(community_cosine_sim_df.values, threshold_percentile_for_split)
    #splitting here
    updated_communities = deepcopy(communities)
    for seed_node, community in communities.items():
      if len(community) == 1:
        #print('Cannot split for community since it has community: {} since it has only one node'.format(community))
        continue
      else:
        # split the community based on topic vectors within a community
        for count_nodes, community_node in enumerate(community):
          if community_node == seed_node:
            continue
          is_cos_dist_bigger_than_threshold = list(community_cosine_sim_df.loc[community_node] > split_threshold)
          if False in is_cos_dist_bigger_than_threshold:
            #print('Splitting node: {} from community: {}'.format(community_node, updated_communities[seed_node]))
            updated_communities[community_node] = [community_node]
            updated_communities[seed_node].remove(community_node)
    
    return updated_communities, community_cosine_sim_df, mapped_nodes_in_communities


In [11]:
def get_min_cosine_distance(comm1, community_cosine_sim_df, merge_threshold, mapped_nodes_in_communities):
  if not isinstance(comm1, set):
    comm1 = set(comm1)
  remaining_list = list(set(mapped_nodes_in_communities)-comm1)
  #print('*****',comm1)
  min_cosine_dist_from_comm1 = community_cosine_sim_df[community_cosine_sim_df.index.isin(
      list(comm1))][remaining_list].idxmax(axis=1).values
  #print('*****',min_cosine_dist_from_comm1)
  min_distance_list = []
  for i in zip(comm1, min_cosine_dist_from_comm1):
    min_distance_list.append(community_cosine_sim_df[i[0]][i[1]])
  #print('*****',min_distance_list)
  community_to_merge = None
  if min(min_distance_list) > merge_threshold:
    community_to_merge = min_cosine_dist_from_comm1[min_distance_list.index(min(min_distance_list))]
  return community_to_merge

def step3_merge_communities(updated_communities, community_cosine_sim_df, threshold_percentile_for_merge, 
                            mapped_nodes_in_communities):
    
    merge_threshold = np.percentile(community_cosine_sim_df.values, threshold_percentile_for_split)
    test_communities = deepcopy(updated_communities)
    #print(test_communities)
    for seed_node, community in updated_communities.items():
      if seed_node in test_communities:
        community_to_merge_with_seed = get_min_cosine_distance(community, community_cosine_sim_df, 
                                                               merge_threshold, mapped_nodes_in_communities)
        #print(seed_node, community_to_merge_with_seed)
        if community_to_merge_with_seed:
          seed_node_to_merge = [key for key, value in test_communities.items() 
                                if community_to_merge_with_seed in value][0]
          #print(seed_node, test_communities)
          merging_communities = test_communities.pop(seed_node_to_merge, None)
          #print(seed_node, merging_communities)
          if merging_communities:
            test_communities[seed_node].extend(merging_communities)
    
    return test_communities

In [12]:
def make_partitions(df, communities):
    partitions = dict()

    for k, v in communities.items():
      for i in v:
        partitions[int(i)] = int(k) 
        
    for i in set(df.index)-set(partitions.keys()):
        partitions[i] = i
    
    return partitions

## Evaluation

In [13]:
def get_conductance(graph, partitions):
  conductances_list = []
  conductances_keys = []
  for key, coms in partitions.items():
    try:
        conductances_list.append(nx.conductance(graph, coms))
    except ZeroDivisionError:
        pass
    else:
        conductances_keys.append(key)
  conductance_measures = dict(zip(conductances_keys, conductances_list))
  if not conductance_measures:
        return {}
  return {'all_conductances': conductance_measures, 'min_conductance': min(conductance_measures.values()),
          'max_conductance': max(conductance_measures.values()), 
          'avg_conductance': sum(conductance_measures.values())/len(partitions)}

def get_triangle_participation_ratio(graph, partitions):
  if nx.is_directed(graph):
    graph = nx.to_undirected(graph)
  tpr_measures = dict(zip(partitions.keys(), [triangle_participation_ratio(graph, coms) 
                                              for coms in partitions.values()]))
  return {'all_tprs': tpr_measures, 'min_tpr': min(tpr_measures.values()),
          'max_tpr': max(tpr_measures.values()), 'avg_tpr': sum(tpr_measures.values())/len(partitions)}


def triangle_participation_ratio(graph, coms):
  cls = nx.triangles(graph, coms)
  #print(cls)
  nc = [n for n in cls if cls[n] > 0]
  #print(nc)
  return float(len(nc))/len(coms)


def get_community_modularity(graph, partitions):
  modularities = {}
  try:
      modularities = {'modularity': nx.algorithms.community.modularity(graph, [set(com) 
                                                                               for com in partitions.values()])}
  except:
    return {}
  return modularities

def get_surprise(graph, partitions):
  m = graph.number_of_edges()
  n = graph.number_of_nodes()

  q = 0
  qa = 0
  sp = 0

  for community in partitions.values():
      c = nx.subgraph(graph, community)
      mc = c.number_of_edges()
      nc = c.number_of_nodes()

      q += mc
      qa += scipy.special.comb(nc, 2, exact=True)
  try:
      q = q / m
      qa = qa / scipy.special.comb(n, 2, exact=True)

      sp = m * (q * np.log(q / qa) + (1 - q) * np.log2((1 - q) / (1 - qa)))
  except ZeroDivisionError:
      pass
  return {'asymptotic_surprise': sp}

def get_significance(graph, partitions):

  m = graph.number_of_edges()

  binom = scipy.special.comb(m, 2, exact=True)
  p = m / binom

  q = 0
  #print(binom, p, q)

  for community in partitions.values():
      try:
          c = nx.subgraph(graph, community)
          nc = c.number_of_nodes()
          mc = c.number_of_edges()

          binom_c = scipy.special.comb(nc, 2, exact=True)
          
          pc = mc / binom_c
          if pc < 1:
              #print(community, nc, binom_c, pc, np.log((1 - pc) / (1 - p)))
              q += binom_c * (pc * np.log(pc / p) + (1 - pc) * np.log((1 - pc) / (1 - p)))

      except ZeroDivisionError:
          pass
  return {'significance': q}

def get_number_communities(partitions):
    if isinstance(partitions, dict):
        return {'num_communities': len(partitions)}
    else:
        return {'num_communities': None }

def get_communities(partitions):
    if isinstance(partitions, dict):
        return {'communities': partitions }
    else:
        return {'communities': None }
    
def get_evaluation_metrics(graph, partitions):
  return dict(chain(get_communities(partitions).items(), 
                    get_number_communities(partitions).items(),
                    get_conductance(graph, partitions).items(), 
                    get_triangle_participation_ratio(graph, partitions).items(), 
                    get_community_modularity(graph, partitions).items(), 
                    get_surprise(graph, partitions).items(),
                    get_significance(graph, partitions).items())
             )
  

## YouNetflix

In [30]:
NUM_OF_INFLUENTIAL_NODES = 100
threshold_percentile_for_merge = 25
threshold_percentile_for_split = 25

you_graph = load_graph(GRAPHML_FILE)
you_graph = remove_isolated_nodes(you_graph)
you_graph = add_weights(you_graph)
you_graph = nx.relabel_nodes(you_graph, lambda x:int(x))

evaluation_scores = {}
num_nodes = [5**i for i in range(1, you_graph.number_of_nodes()) if 5**i < 100000]
print(num_nodes)
for node_range in num_nodes:
    for threshold in [10, 25, 50, 75, 100] :
        you_topic_vec_df = get_topic_frame(TOPIC_VEC_FILE, GRAPH_NODE_FILE)
        you_twitter_rank_df = get_topic_frame(TWITTER_RANK_FILE, GRAPH_NODE_FILE)
        you_influential_nodes = get_influential_nodes(you_graph, you_topic_vec_df, you_twitter_rank_df, 
                                                      num_topics=NUM_TOPICS, num_of_influential_nodes=node_range)
        print('length of influential nodes: {}'.format(len(you_influential_nodes)))
        you_initial_communities = step1_assign_initial_communities(you_graph, you_topic_vec_df, 
                                                                   you_influential_nodes)
        cosine_sim_df = get_cosine_sim_df(you_topic_vec_df)
        communities_after_split, community_cosine_sim_df, mapped_nodes = step2_split_community(
            you_initial_communities, threshold, cosine_sim_df)
        you_communties_after_merge = step3_merge_communities(communities_after_split, community_cosine_sim_df,
                                                 threshold, mapped_nodes)
        you_partitions = make_partitions(you_topic_vec_df, you_communties_after_merge)
        you_coms_dict = deepcopy(you_communties_after_merge)
        evaluation_scores['num_nodes_{}_threshold_{}'.format(node_range, threshold)] = get_evaluation_metrics(
            you_graph, you_coms_dict)

Name: 
Type: DiGraph
Number of nodes: 14660
Number of edges: 13932
Average in degree:   0.9503
Average out degree:   0.9503
Name: 
Type: DiGraph
Number of nodes: 14660
Number of edges: 13932
Average in degree:   0.9503
Average out degree:   0.9503

Isolated nodes: 2065

removing isolated nodes...

[5, 25, 125, 625, 3125]
length of influential nodes: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 25


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 625


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 625


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 625


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 625


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 625


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 3125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 3125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 3125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 3125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


length of influential nodes: 3125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [39]:
column_list = ['num_communities', 'min_conductance', 'max_conductance', 'avg_conductance', 'asymptotic_surprise',
              'significance']
eval_df = pd.DataFrame.from_dict(evaluation_scores, orient='index', columns=column_list)

In [68]:
eval_df.sort_index(ascending=True, axis=1)
eval_df['length'] = eval_df.index.str.len()
eval_df.sort_values('length', ascending=True, inplace=True)

## CD Algorithms

In [113]:
# Define all community Detection Algorithms
from igraph import *


        
def describe(G, comms):
    print('Number of nodes: ', G.vcount())
    print('Number of edges: ', G.ecount())
    print(describe(self.G))

def remove_self_loops(G, weights=None):
    # If graph has self-loops, remove the self-loops
    if not G.is_simple():
        G = G.simplify(loops = True, multiple = True, 
                                 combine_edges = dict(weight=weights))
    return G


def convert_to_undirected(g, weights=None):
    # If graph is directed, convert to undirected
    #print(summary(g))
    if g.is_directed():
        g =  g.as_undirected(mode = 'collapse', combine_edges = "sum")
    return g

def fast_greedy(G, weights=None):
    g = G.copy()
    g = convert_to_undirected(g)
    summary(g)
    comms = g.community_fastgreedy(weights=weights)
    print('Number of communities: ', comms.optimal_count)
    return comms

def edge_betweenness(G, weights=None):
    g = G.copy()
    comms = g.community_edge_betweenness(clusters=None, directed=True, weights=weights)
    print('Number of communities: ', comms.optimal_count)
    return comms

def infomap(G, edge_weights=None, vertex_weights=None, trials=10):
    g = G.copy()
    comms = g.community_infomap(edge_weights=edge_weights, vertex_weights=vertex_weights, trials=10)
    print('Number of communities: ', comms._len)
    return comms

def label_propagation(G, weights=None, initial=None, fixed=None):
    g = G.copy()
    comms = g.community_label_propagation(weights=weights, initial=initial, fixed=fixed)
    print('Number of communities: ', comms._len)
    return comms

def leading_eigen_vector(G, clusters=-1, arpack_options=None, weights=None):
    g = G.copy()
    g = convert_to_undirected(g)
    comms = g.community_leading_eigenvector(clusters=clusters, arpack_options=arpack_options, weights=weights)
    print('Number of communities: ', comms._len)
    return comms

def multilevel(G, weights=None, return_levels=True):
    g = G.copy()
    g = convert_to_undirected(g)
    summary(g)
    #print(summary(g))
    comms = g.community_multilevel(weights=weights, return_levels=return_levels)
    print('Number of communities: ', len(comms))
    return comms

def optimal_modularity(G, weights=None):
    g = G.copy()
    g = convert_to_undirected(g)
    print(summary(g))
    #g = self.convert_to_undirected(g)
    #print(summary(g))
    comms = g.community_optimal_modularity(weights=weights)
    print('Number of communities: ', comms._len)
    return comms

def spinglass(G, weights=None, spins=25, parupdate=False, start_temp=1, stop_temp=0.01,
              cool_fact=0.99, update_rule="simple", gamma=1, implementation="orig"):
    g = G.copy()
    comms = g.community_spinglass(weights=weights, spins=spins, parupdate=parupdate, start_temp=start_temp,
                                 stop_temp=stop_temp, cool_fact=cool_fact, update_rule=update_rule, gamma=gamma,
                                 implementation=implementation)
    print('Number of communities: ', comms._len)
    return comms


def walktrap(G, weights=None, steps=4):
    g = G.copy()
    #g = self.convert_to_undirected(g)
    summary(g)
    comms = g.community_walktrap(weights=weights, steps=steps)
    print('Number of communities: ', comms.optimal_count)
    return comms

def louvain(G):
    # TO DO: use python-louvain to implement louvain algorithm
    pass

def leiden(G):
    # TO DO: use python-leidenalg to implement louvain algorithm
    pass

def plot_communities(G, comms):
    visual_style = {}
    visual_style["vertex_size"] = 10
    #visual_style["vertex_label"] = graph.vs["id"]
    visual_style['edge_arrow_size'] = 0.5
    visual_style["layout"] = G.layout('fr')
    plot(comms, **visual_style)

def modularity(G, clustering_algorithm):
    return G.modularity(clustering_algorithm)

In [100]:
write_graph(you_graph, 'YouNetflix_preprocessed.graphml')

In [101]:
community_graph = Graph.Read_GraphML('YouNetflix_preprocessed.graphml')
summary(community_graph)
# visual_style = {}
# visual_style["vertex_size"] = 10
# #visual_style["vertex_label"] = graph.vs["id"]
# visual_style['edge_arrow_size'] = 0.5
# visual_style["layout"] = graph.layout('fr')
# plot(graph, mark_groups=True, **visual_style)

IGRAPH D-W- 12595 13932 -- 
+ attr: create_time (v), id (v), create_time (e), mention_count (e), quote_count (e), reply_count (e), retweet_count (e), weight (e)


In [108]:
edge_betweenness_comms_with_weights = edge_betweenness(community_graph, weights = 'weight')

Number of communities:  6025


In [109]:
edge_betweenness_comms = edge_betweenness(community_graph, weights = None)

Number of communities:  6140


In [114]:
fast_greedy_comms = fast_greedy(community_graph, weights = None)

IGRAPH U-W- 12595 13932 -- 
+ attr: create_time (v), id (v), create_time (e), mention_count (e), quote_count (e), reply_count (e), retweet_count (e), weight (e)
Number of communities:  13


In [115]:
fast_greedy_comms_with_weights = fast_greedy(community_graph, weights = 'weight')

IGRAPH U-W- 12595 13932 -- 
+ attr: create_time (v), id (v), create_time (e), mention_count (e), quote_count (e), reply_count (e), retweet_count (e), weight (e)
Number of communities:  13


In [116]:
infomap_comms = infomap(community_graph, edge_weights = None)

Number of communities:  1


In [117]:
infomap_comms_with_weights = infomap(community_graph, edge_weights = 'weight')

Number of communities:  1


In [118]:
label_prop_comms = label_propagation(community_graph, weights = None)

Number of communities:  729


In [119]:
label_prop_comms_with_weights = label_propagation(community_graph, weights = 'weight')

Number of communities:  729


In [120]:
leading_eigenvector_comms = leading_eigen_vector(community_graph, weights = None)

Number of communities:  10


In [121]:
leading_eigenvector_comms_with_weights = leading_eigen_vector(community_graph, weights = 'weight')

Number of communities:  17


In [122]:
multilevel_comms = multilevel(community_graph, weights = None, return_levels=False)

IGRAPH U-W- 12595 13932 -- 
+ attr: create_time (v), id (v), create_time (e), mention_count (e), quote_count (e), reply_count (e), retweet_count (e), weight (e)
Number of communities:  13


In [123]:
multilevel_comms_with_weights = multilevel(community_graph, weights = 'weight', return_levels=False)

IGRAPH U-W- 12595 13932 -- 
+ attr: create_time (v), id (v), create_time (e), mention_count (e), quote_count (e), reply_count (e), retweet_count (e), weight (e)
Number of communities:  13


In [None]:
# giant_sub_graph = graph.clusters().giant()
# community_sub_graph = Communities(giant_sub_graph)
# spinglass_comms = community_sub_graph.spinglass(weights = None)

In [None]:
#spinglass_comms_with_weights = community_sub_graph.spinglass(weights = 'weight')

In [124]:
walktrap_comms = walktrap(community_graph, weights = None, steps=4)

IGRAPH D-W- 12595 13932 -- 
+ attr: create_time (v), id (v), create_time (e), mention_count (e), quote_count (e), reply_count (e), retweet_count (e), weight (e)
Number of communities:  11


In [125]:
walktrap_comms_with_weights = walktrap(community_graph, weights = 'weight', steps=4)

IGRAPH D-W- 12595 13932 -- 
+ attr: create_time (v), id (v), create_time (e), mention_count (e), quote_count (e), reply_count (e), retweet_count (e), weight (e)
Number of communities:  13


In [72]:
eval_df

Unnamed: 0,num_communities,min_conductance,max_conductance,avg_conductance,asymptotic_surprise,significance,length
num_nodes_5_threshold_75,3,0.998775,0.999597,0.999221,31.590904,0.0,24
num_nodes_5_threshold_50,3,0.998775,0.999597,0.999221,31.590904,0.0,24
num_nodes_5_threshold_25,3,0.998775,0.999597,0.999221,31.590904,0.0,24
num_nodes_5_threshold_10,3,0.998775,0.999597,0.999221,31.590904,0.0,24
num_nodes_25_threshold_10,4,0.994233,0.999455,0.99804,130.132922,114.992713,25
num_nodes_25_threshold_25,4,0.994233,0.999455,0.99804,130.132922,114.992713,25
num_nodes_25_threshold_50,4,0.994233,0.999455,0.99804,130.132922,114.992713,25
num_nodes_25_threshold_75,4,0.994233,0.999455,0.99804,130.132922,114.992713,25
num_nodes_5_threshold_100,3,0.998775,0.999597,0.999221,31.590904,0.0,25
num_nodes_125_threshold_10,125,0.993243,1.0,0.08788,0.0,0.0,26


In [81]:
edge_betweenness_comms

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__plot__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_convert_matrix_to_tuple_repr',
 '_graph',
 '_item_box_size',
 '_merges',
 '_modularity_params',
 '_names',
 '_nitems',
 '_nmerges',
 '_optimal_count',
 '_plot_item',
 '_traverse_inorder',
 'as_clustering',
 'format',
 'merges',
 'names',
 'optimal_count',
 'summary']

In [89]:
edge_betweenness_comms.as_clustering()

<igraph.clustering.VertexClustering at 0x12b2ca9d0>

In [127]:
dir(graph)

['Adjacency',
 'Asymmetric_Preference',
 'Atlas',
 'Barabasi',
 'Bipartite',
 'De_Bruijn',
 'Degree_Sequence',
 'DictList',
 'Erdos_Renyi',
 'Establishment',
 'Famous',
 'Forest_Fire',
 'Formula',
 'Full',
 'Full_Bipartite',
 'Full_Citation',
 'GRG',
 'Growing_Random',
 'Incidence',
 'Isoclass',
 'K_Regular',
 'Kautz',
 'LCF',
 'Lattice',
 'Load',
 'Preference',
 'Random_Bipartite',
 'Read',
 'Read_Adjacency',
 'Read_DIMACS',
 'Read_DL',
 'Read_Edgelist',
 'Read_GML',
 'Read_GraphDB',
 'Read_GraphML',
 'Read_GraphMLz',
 'Read_Lgl',
 'Read_Ncol',
 'Read_Pajek',
 'Read_Pickle',
 'Read_Picklez',
 'Recent_Degree',
 'Ring',
 'SBM',
 'Star',
 'Static_Fitness',
 'Static_Power_Law',
 'Tree',
 'TupleList',
 'Watts_Strogatz',
 'Weighted_Adjacency',
 '_Bipartite',
 '_Full_Bipartite',
 '_GRG',
 '_Incidence',
 '_Random_Bipartite',
 '__add__',
 '__and__',
 '__bool__',
 '__class__',
 '__coerce__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__'

In [136]:
dir(edge_betweenness_comms)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__plot__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_convert_matrix_to_tuple_repr',
 '_graph',
 '_item_box_size',
 '_merges',
 '_modularity_params',
 '_names',
 '_nitems',
 '_nmerges',
 '_optimal_count',
 '_plot_item',
 '_traverse_inorder',
 'as_clustering',
 'format',
 'merges',
 'names',
 'optimal_count',
 'summary']

In [130]:
infomap_comms.modularity

0.0

In [138]:
com_algorithms = [edge_betweenness_comms, edge_betweenness_comms_with_weights, fast_greedy, fast_greedy_comms, 
                  infomap_comms, infomap_comms_with_weights, label_prop_comms, label_prop_comms_with_weights,
                  leading_eigenvector_comms, leading_eigenvector_comms_with_weights, multilevel_comms,
                  multilevel_comms_with_weights, walktrap_comms, walktrap_comms_with_weights]

In [None]:
for com_algo in com_algorithms:
    try:
        print('Community: {}, Modularity: {}'.format())

In [139]:
from cdlib import algorithms

In [150]:
coms = algorithms.walktrap(you_graph)

OverflowError: integer too large for conversion to C int

In [14]:
you_new_graph = load_graph(GRAPHML_FILE)
you_new_graph = remove_isolated_nodes(you_new_graph)
you_new_graph = add_weights(you_new_graph)
you_new_graph = nx.relabel_nodes(you_new_graph, lambda x:int(x))

Name: 
Type: DiGraph
Number of nodes: 1048650
Number of edges: 3369021
Average in degree:   3.2127
Average out degree:   3.2127
Name: 
Type: DiGraph
Number of nodes: 1048650
Number of edges: 3369021
Average in degree:   3.2127
Average out degree:   3.2127

Isolated nodes: 112586

removing isolated nodes...



936064

In [None]:
evaluation_scores = {}
#total_nodes = you_new_graph.number_of_nodes()
#num_nodes = [5**i for i in range(1, total_nodes) if 5**i < 100000]
num_nodes = [5, 25, 125, 625, 3125, 15625]
print(num_nodes)
for node_range in num_nodes:
    for threshold in [10, 25, 50, 75, 100] :
        you_new_topic_vec_df = get_topic_frame(TOPIC_VEC_FILE, GRAPH_NODE_FILE)
        you_new_twitter_rank_df = get_topic_frame(TWITTER_RANK_FILE, GRAPH_NODE_FILE)
        you_new_influential_nodes = get_influential_nodes(you_new_graph, 
                                                          you_new_topic_vec_df, 
                                                          you_new_twitter_rank_df, 
                                                          num_topics=NUM_TOPICS, 
                                                          num_of_influential_nodes=node_range)
        print('length of influential nodes: {}'.format(len(you_new_influential_nodes)))
        you_new_initial_communities = step1_assign_initial_communities(you_new_graph, you_new_topic_vec_df, 
                                                                   you_new_influential_nodes)
        cosine_sim_df = get_cosine_sim_df(you_new_topic_vec_df)
        communities_after_split, community_cosine_sim_df, mapped_nodes = step2_split_community(
            you_new_initial_communities, threshold, cosine_sim_df)
        you_new_communties_after_merge = step3_merge_communities(communities_after_split, community_cosine_sim_df,
                                                 threshold, mapped_nodes)
        you_new_partitions = make_partitions(you_new_topic_vec_df, you_new_communties_after_merge)
        you_new_coms_dict = deepcopy(you_new_communties_after_merge)
        evaluation_scores['num_nodes_{}_threshold_{}'.format(node_range, threshold)] = get_evaluation_metrics(
            you_new_graph, you_new_coms_dict)

[5, 25, 125, 625, 3125, 15625]
length of influential nodes: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
