# Distributed Community Scanner

Spark Job for Community Scanner

### Initialisation

In [12]:
import os
import sys
from pyspark.sql import SparkSession
from time import time

# Path of a local Spark 2.3.0 distribution (machine-specific; not referenced
# by the cells visible in this notebook).
SPARK_HOME = "/home/human/l/rh/platform/spark-2.3.0-bin-hadoop2.7"

# Input directory for the edge files, and the output directory nested inside it.
DATA = "../files/in/"
OUTPUT = DATA + "out/"

# Maximum number of edges to be parsed from the input file.
EDGES_LIMIT = 2000

# Target level for the Girvan-Newman algorithm, i.e. the number of
# communities desired.
COMPONENTS_LEVEL = 5

# Create (or reuse) the Spark session for this job:
#   * autoBroadcastJoinThreshold = -1 turns off automatic broadcast joins
#   * the Spark web UI stays enabled
#   * the default object serializer is replaced with Kryo
session = (
    SparkSession.builder
    .appName("Distributed-Community-Scanner")
    .config("spark.sql.autoBroadcastJoinThreshold", "-1")
    .config("spark.ui.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# The SparkContext is the main connection to the Spark driver.
sc = session.sparkContext

In [13]:
# Make the parent project importable from this notebook by putting the parent
# directory on sys.path (only once, so re-running the cell adds no duplicates).
# https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im

module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
# Import modules
from inout.parser import parse
from inout.parser import get_headers
from view.visualiser import draw
from algorithms.shortest_path import single_source_shortest_paths_dijkstra
from distributed_algorithms.shortest_paths import compute_shortest_paths
from distributed_algorithms.betweenness import compute_edge_betweenness
from distributed_algorithms.girvan_newman import girvan_newman_generator
from distributed_algorithms.girvan_newman import girvan_newman

In [14]:
# Locate the edges file inside the input directory.
edges_file = 'edges.csv'
edges_path = os.path.join(DATA, edges_file)

# The file's headers name the source, target and weight columns, in that order.
headers = get_headers(edges_path)
source_header, target_header, weight_header = headers

# Build the graph on the driver, reading at most EDGES_LIMIT edges.
graph_driver = parse(
    edges_path,
    edge_limit=EDGES_LIMIT,
    source_header=source_header,
    target_header=target_header,
    weight_header=weight_header,
)

# Count the parsed edges by walking the edge collection once.
number_of_edges = sum(1 for _ in graph_driver.edges)
number_of_edges

2000

In [15]:
# Keep this line commented for large graphs
# draw(graph_driver)

### Broadcast Graph and Algorithms to Workers

In [16]:
# Register the zipped dependencies with the SparkContext so that they are
# available to the workers (presumably the archive holds this project's
# modules; verify its contents before running).
dependencies_path = "community_scanner.zip"
sc.addPyFile(path=dependencies_path)

### Using Community Scanner Spark Library 

## Get one set of communities at a certain level

In [17]:
# Wall-clock timing of one girvan_newman call made with COMPONENTS_LEVEL as
# the target level.
report_template = 'Edges: {0};\t\tTarget Level: {1};\t\t\tDistributed Computing Time (seconds): {2}\n\n'

before_time = time()
components = girvan_newman(sc, graph_driver, COMPONENTS_LEVEL)
elapsed_time = time() - before_time

print(report_template.format(number_of_edges, COMPONENTS_LEVEL, elapsed_time))

# Bare expression so the notebook displays the resulting communities.
components

Edges: 2000;		Target Level: 5;			Distributed Computing Time (seconds): 582.8122761249542




({1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157,
  159,
  160,
 

## Get all components (sets of communities)

In [18]:
# Print every set of communities yielded by the generator, one per line:
# the number of communities (right-aligned in 10 columns), a tab, then the
# communities themselves.
row_template = '{0:-10d}\t{1}'
for components in girvan_newman_generator(sc, graph_driver):
    print(row_template.format(len(components), components))

         2	({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 211, 212, 213, 214, 215, 216, 217, 219, 220, 221, 222, 223,

KeyboardInterrupt: 