In [1]:
import os
import random

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.types import *

In [2]:
# Create a spark session/application
spark = SparkSession.builder.appName('Homework5_Exercise1').getOrCreate()
# Build the SQLContext from the session created above instead of relying on a
# pre-defined global `sc`, which only exists inside the pyspark shell/kernel
# and raises NameError on a plain Python kernel.
sqlcontext = SQLContext(spark.sparkContext, sparkSession=spark)

In [3]:
# Import web-Google.txt as dataframe using the defined schema
schema = StructType(
    [
        StructField("FromNodeId", IntegerType(), False),
        StructField("ToNodeId", IntegerType(), False)
    ]
)
# The column names of the edge list live in a "#"-prefixed comment line, which
# the "comment" option already discards. With header=true Spark would then
# consume the first *data* line as the header and silently drop that edge, so
# the header option is disabled and the names come from the explicit schema.
# NOTE(review): assumes the file has no un-commented header row — confirm
# against data/web-Google.txt.
graph_df = (
    spark.read.format("csv")
    .option("header", "false")
    .option("delimiter", "\t")
    .option("comment", "#")
    .schema(schema)
    .load("data/web-Google.txt")
)

In [16]:
# Build a dictionary holding the out-links and in-links of a node, each as a list
def node_connectivity(node_v):
    """Return the direct neighbours of ``node_v`` in the edge list ``graph_df``.

    Parameters
    ----------
    node_v
        Node id compared against the ``FromNodeId`` and ``ToNodeId`` columns.

    Returns
    -------
    dict
        ``{"out_links": [...], "in_links": [...]}``. ``out_links`` holds the
        ``ToNodeId`` of every edge whose ``FromNodeId`` equals ``node_v``;
        ``in_links`` holds the ``FromNodeId`` of every edge whose ``ToNodeId``
        equals ``node_v``. All ids are converted to strings.
    """
    is_source = F.col('FromNodeId') == node_v
    is_target = F.col('ToNodeId') == node_v

    # Edges leaving node_v: keep the id of the node they point to.
    successors = []
    for edge in graph_df.filter(is_source).collect():
        successors.append(str(edge['ToNodeId']))

    # Edges arriving at node_v: keep the id of the node they come from.
    predecessors = []
    for edge in graph_df.filter(is_target).collect():
        predecessors.append(str(edge['FromNodeId']))

    return {"out_links": successors, "in_links": predecessors}

In [5]:
# Build the list of all node ids: every id that appears as the source or the
# target of at least one edge, de-duplicated on the driver.
from_nodes = [
    source_id
    for (source_id,) in graph_df.select("FromNodeId").distinct().collect()
]
to_nodes = [
    target_id
    for (target_id,) in graph_df.select("ToNodeId").distinct().collect()
]
nodes = list(set(from_nodes + to_nodes))

In [20]:
# Select a random node v from the list of all node ids.
# NOTE(review): no random seed is set in the visible cells, so a different
# node may be picked on every run — confirm whether that is intended.
node_v = random.choice(nodes)

In [21]:
# Look up the out-links and in-links of the selected node; the result is a
# dict with keys "out_links" and "in_links", each a list of node ids as strings.
connectivity_v = node_connectivity(node_v)

In [22]:
# Write the output to output/exercise3/{node_v}.txt
# Line 1: comma-separated out-links, line 2: comma-separated in-links.
# open() does not create missing directories, so make sure the target exists.
os.makedirs('output/exercise3', exist_ok=True)
# Plain 'w' is enough: the file is only written, never read back.
with open(f'output/exercise3/{node_v}.txt', 'w') as output_file:
    output_file.write(",".join(connectivity_v['out_links']) + '\n')
    output_file.write(",".join(connectivity_v['in_links']))