In [1]:
import time
import pyspark
from graphframes import *
from pyspark.sql.functions import *
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import hash
import os
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.functions import collect_list
import networkx as nx
from connected_components import *
# Passed as the third argument to create_graph() in __main__().
# NOTE(review): presumably a partition count -- confirm against connected_components.create_graph.
N = 64

In [2]:
#cc = pickle_to_dataframe(sc, spark, 'connected_components.pkl')

In [3]:
def get_largest_cc(cc):
    """Return the id of the connected component that contains the most vertices.

    Args:
        cc: Spark DataFrame with one row per vertex and at least the columns
            ``id`` and ``component``.

    Returns:
        The ``component`` value of the group with the highest count of ``id``.
        Ties on size are broken by the smallest ``component`` value so that
        repeated runs pick the same component.

    Raises:
        ValueError: if ``cc`` has no rows.
    """
    largest_component = (
        cc.groupby("component")
        .agg(F.count("id").alias("component_size"))
        .orderBy(F.desc("component_size"), F.asc("component"))
        .first()
    )
    # first() yields None for an empty DataFrame; fail with a clear message
    # instead of an AttributeError on None.
    if largest_component is None:
        raise ValueError(
            "cc contains no rows; cannot determine the largest connected component"
        )
    return largest_component["component"]
    
def get_new_vertices(cc):
    """Select the vertices that belong to the largest connected component.

    Args:
        cc: Spark DataFrame with at least the columns ``id`` and ``component``.

    Returns:
        Tuple ``(new_vertices, valid_vertex_ids)``:
        ``new_vertices`` is ``cc`` restricted to the largest component with the
        ``component`` column dropped; ``valid_vertex_ids`` is a Python set of
        the ``id`` values of those rows, collected to the driver.
    """
    largest_component = get_largest_cc(cc)
    # Compare through a Column expression instead of formatting the value into
    # a SQL string, so the component id is passed as a literal whatever its
    # type (a string id would otherwise be parsed as an identifier).
    new_vertices = cc.filter(F.col("component") == largest_component).drop("component")
    valid_vertex_ids = {row["id"] for row in new_vertices.select("id").collect()}
    return new_vertices, valid_vertex_ids

def get_new_edges(sc, spark, valid_vertex_ids):
    """Load every edge and keep those whose source vertex is in ``valid_vertex_ids``.

    The edges are read from ``all_edges.pkl`` via ``pickle_to_dataframe``.
    Only the ``src`` column is tested for membership; ``dst`` is not filtered
    here. The ``weights`` column is removed from the returned DataFrame.
    """
    all_edges = pickle_to_dataframe(sc, spark, 'all_edges.pkl')
    src_in_component = col('src').isin(valid_vertex_ids)
    kept_edges = all_edges.filter(src_in_component)
    return kept_edges.drop("weights")

def get_communities_lpa(new_graph, max_iter=7):
    """Run label propagation on ``new_graph`` and time the call.

    Args:
        new_graph: object exposing ``labelPropagation(maxIter=...)`` (a
            GraphFrame in this notebook).
        max_iter: number of label-propagation iterations. Defaults to 7, the
            value that used to be hard-coded.

    Returns:
        Tuple ``(communities, diff)``: the result of ``labelPropagation`` and
        the elapsed time of that call in seconds. The distinct-label count
        printed below is not included in the measurement.
    """
    # perf_counter() is monotonic, so a system clock adjustment during a long
    # run cannot distort the measured duration.
    start = time.perf_counter()
    communities = new_graph.labelPropagation(maxIter=max_iter)
    diff = time.perf_counter() - start
    print(communities.select(countDistinct("label")).take(1))
    return communities, diff

def prepare_data_evaluation(valid_vertex_ids, new_edges, communities):
    """Build the networkx inputs used by ``evaluate``.

    Args:
        valid_vertex_ids: iterable of node ids to add to the graph.
        new_edges: Spark DataFrame of edges; its collected rows are passed
            directly to ``Graph.add_edges_from``.
        communities: Spark DataFrame with at least the columns ``id`` and
            ``label``.

    Returns:
        Tuple ``(G, nodes_list)``: an undirected ``nx.Graph`` and a list with
        one ``frozenset`` of node ids per distinct ``label``.
    """
    G = nx.Graph()
    G.add_nodes_from(valid_vertex_ids)
    G.add_edges_from(new_edges.collect())

    # Group node ids by community label and collect the result once; each
    # collect() is a separate Spark job.
    grouped = communities.groupBy("label").agg(collect_list("id").alias("nodes"))
    nodes_list = [frozenset(row["nodes"]) for row in grouped.collect()]

    return G, nodes_list

def evaluate(G, nodes_list):
    """Score a partition of ``G`` with networkx quality measures.

    Args:
        G: networkx graph.
        nodes_list: list of node collections, one per community.

    Returns:
        Tuple ``(modularity, conductance_val, performance, coverage)`` where
        ``conductance_val`` is the minimum conductance over the individual
        communities.
    """
    community = nx.algorithms.community
    modularity = community.modularity(G, nodes_list)
    coverage, performance = community.quality.partition_quality(G, nodes_list)

    # Conductance is computed per community, then reduced to its minimum.
    per_cluster = []
    for members in nodes_list:
        per_cluster.append(nx.algorithms.cuts.conductance(G, list(members)))
    conductance_val = np.min(np.array(per_cluster))

    return modularity, conductance_val, performance, coverage

In [4]:
# nodes: iterable of node ids or (id, attribute-dict) pairs; edges: Spark DataFrame
def save_to_gephi(nodes, edges, filename):
    """Export a node collection and an edge DataFrame as a GEXF file.

    Builds a directed networkx graph from ``nodes`` (anything accepted by
    ``DiGraph.add_nodes_from``) and from the rows returned by
    ``edges.collect()``, then writes it to ``filename`` with ``nx.write_gexf``.
    """
    digraph = nx.DiGraph()
    node_items = list(nodes)
    digraph.add_nodes_from(node_items)
    edge_rows = edges.collect()
    digraph.add_edges_from(edge_rows)
    nx.write_gexf(digraph, filename)

In [5]:
def __main__():
    """Run the whole pipeline end to end.

    Loads the connected components, restricts the graph to the largest one,
    exports it to GEXF, runs label propagation, exports the labelled nodes,
    and prints the elapsed time and the evaluation metrics.
    """
    builder = SparkSession.builder
    spark_options = (
        ("spark.memory.offHeap.enabled", "true"),
        ("spark.memory.offHeap.size", "100g"),
        ("spark.executor.memory", "100g"),
        ("spark.driver.memory", "100g"),
    )
    for option_name, option_value in spark_options:
        builder = builder.config(option_name, option_value)
    spark = builder.appName("Reddit Community Detection").getOrCreate()
    sc = SparkContext.getOrCreate()

    # Restrict vertices and edges to the largest connected component.
    cc = pickle_to_dataframe(sc, spark, 'connected_components.pkl')
    new_vertices, valid_vertex_ids = get_new_vertices(cc)
    new_edges = get_new_edges(sc, spark, valid_vertex_ids)
    new_graph = create_graph(new_vertices, new_edges, N)

    save_to_gephi(valid_vertex_ids, new_edges, 'Largest_Component.gexf')

    communities, time_for_LPA = get_communities_lpa(new_graph)
    # Pair each node id with its label as a node attribute for the GEXF export.
    labelled_rows = communities.select('id', 'label').rdd
    id_label = labelled_rows.map(lambda x: (x[0], {"label": x[1]})).collect()
    save_to_gephi(id_label, new_edges, 'LPA_communities.gexf')
    print("Label propagation ran for {} seconds.".format(time_for_LPA))

    G, nodes_list = prepare_data_evaluation(valid_vertex_ids, new_edges, communities)
    modularity, conductance, performance, coverage = evaluate(G, nodes_list)
    print("Label Propagation Evaluation Metrics :")
    report = (
        ("Modularity score = ", modularity),
        ("Conductance = ", conductance),
        ("Performance = ", performance),
        ("Coverage = ", coverage),
    )
    for caption, score in report:
        print(caption, score)

In [6]:
# Execute the pipeline defined in __main__().
__main__()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/17 15:10:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/04/17 15:59:09 WARN BlockManager: Asked to remove block rdd_216_160, which does not exist
23/04/17 15:59:09 WARN BlockManager: Asked to remove block rdd_216_47, which does not exist
23/04/17 15:59:09 WARN BlockManager: Asked to remove block rdd_216_137, which does not exist
23/04/17 15:59:09 WARN BlockManager: Asked to remove block rdd_216_153, which does not exist
23/04/17 15:59:09 WARN BlockManager: Asked to remove block rdd_216_101, which does not exist
23/04/17 15:59:09 WARN BlockManager: Asked to remove block rdd_216_49, which does not exist
23/04/17 15:59:09 WARN BlockManager: Asked to remove block rdd_216_65, which does not exist

[Row(count(DISTINCT label)=1103)]


                                                                                

Label propagation ran for 2832.948624610901 seconds.


                                                                                

Label Propagation Evaluation Metrics :
Modularity score =  0.2857788507156451
Conductance =  0.047619047619047616
Performance =  0.9002525510931688
Coverage =  0.5284927164314288
