### Part-3 Implement and analyze Page-rank algorithm.
1. You must write a basic PageRank algorithm that uses the generated text file 
(question3.txt), which describes a simulated network of 100 pages and their hyperlinks.
The algorithm should take the network provided and evaluate the page rank for all the 
webpages or nodes.
2. Find the node with the highest and the lowest page rank and provide a screenshot of the 
same.


In [2]:
from pyspark import SparkContext, SparkConf
import os

# Make PySpark launch both its worker and driver Python processes with `python`
for env_name in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_name] = 'python'

def write_results(rdd_data,output_file_name):
    """Collect ``rdd_data`` and write one record per line to a text file.

    Args:
        rdd_data: Object exposing ``collect()`` that returns the records.
        output_file_name: Path of the file to create or overwrite.
    """
    # Everything is pulled back before the file is opened for writing.
    rows = rdd_data.collect()
    with open(output_file_name, 'w') as out:
        out.writelines(f"{row}\n" for row in rows)

def compute_page_contributions(pair):
    """Split a page's rank evenly across the pages it links to.

    Args:
        pair: ``(page, (neighbors, rank_value))``; only the second element
            is read.

    Returns:
        A list of ``(neighbor, share)`` tuples, one per neighbor, where
        ``share`` is ``rank_value / len(neighbors)``. A page with no
        neighbors contributes nothing and yields an empty list.
    """
    out_links, rank = pair[1]
    out_degree = len(out_links)
    if out_degree == 0:
        return []
    share = rank / out_degree
    return [(target, share) for target in out_links]

def sum_contributions(contrib1,contrib2):
    """Reducer that adds two contributions addressed to the same node."""
    combined = contrib1 + contrib2
    return combined

def apply_damping_factor(total_contrib):
    """Turn a node's summed contributions into its damped rank.

    Computes ``beta * total_contrib + (1 - beta) / total_pages`` with
    ``beta = 0.85``. ``total_pages`` is read from module scope and must be
    defined before this function is called.
    """
    beta = 0.85
    damped_share = total_contrib * beta
    teleport_share = (1 - beta) / total_pages
    return damped_share + teleport_share

def format_rank_result(record):
    """Render a ``(page, rank)`` record as the text ``"page, rank"``."""
    page_id = record[0]
    rank = record[1]
    return f"{page_id}, {rank}"


# Local-mode Spark context; workers and driver are both told to run `python`.
spark_conf = (
    SparkConf()
    .setAppName("PageRankCalculation")
    .setMaster("local")
    .set("spark.pyspark.python", "python")
    .set("spark.pyspark.driver.python", "python")
)
sc = SparkContext(conf=spark_conf)

lines_rdd = sc.textFile("question3.txt")


def parse_adjacency_line(line):
    """Parse one ``"<page>: [n1, n2, ...]"`` line into ``(page, [neighbors])``.

    A page whose bracket list is empty (``"7: []"``) yields an empty
    neighbor list instead of failing on ``int('')``.
    """
    page_part, _, neighbors_part = line.partition(':')
    tokens = neighbors_part.strip().strip('[]').split(',')
    return int(page_part), [int(token) for token in tokens if token.strip()]


# (page, [outgoing neighbors]) pairs. Blank lines are skipped, and the RDD is
# cached because it is joined against on every iteration below.
edges_rdd = (
    lines_rdd
    .filter(lambda line: line.strip())
    .map(parse_adjacency_line)
    .cache()
)

# Number of pages, taken as the number of adjacency lines.
# NOTE(review): assumes every page has its own line in the file; a page that
# appears only as a link target is not counted -- confirm against the data.
total_pages = edges_rdd.count()

# Every page starts with an equal share of rank.
page_rank_initial = edges_rdd.mapValues(lambda _: 1.0 / total_pages)

# A zero contribution for every known page. Without it, a page that nobody
# links to produces no key in reduceByKey, disappears from the rank RDD after
# the first iteration, and (through the inner join) stops passing rank on to
# its own neighbors.
zero_contrib_rdd = edges_rdd.mapValues(lambda _: 0.0)

# Fixed number of PageRank iterations (no convergence test).
max_iterations = 8
iteration_count = 0

# NOTE: rank held by pages with no outgoing links is not redistributed
# (compute_page_contributions emits nothing for them), so the ranks are not
# guaranteed to sum to 1.
while iteration_count < max_iterations:
    # Spread each page's current rank over its neighbors, then add the zero
    # entries so every page keeps a key.
    contrib_rdd = (
        edges_rdd.join(page_rank_initial)
        .flatMap(compute_page_contributions)
        .union(zero_contrib_rdd)
    )
    # New rank = damped sum of the contributions received by each page.
    page_rank_initial = (
        contrib_rdd
        .reduceByKey(sum_contributions)
        .mapValues(apply_damping_factor)
    )
    iteration_count += 1

# Three actions follow (min, max, collect); cache so the whole iteration
# lineage is evaluated once rather than three times.
page_rank_initial = page_rank_initial.cache()

# Pages with the lowest and the highest final rank.
min_rank_node = page_rank_initial.min(key=lambda x: x[1])
max_rank_node = page_rank_initial.max(key=lambda x: x[1])

print("Page with the lowest rank:",min_rank_node)
print("Page with the highest rank:",max_rank_node)

formatted_rank_output = page_rank_initial.map(format_rank_result)
write_results(formatted_rank_output, 'output_page_rank.txt')

Page with the lowest rank: (31, 0.002821903291492742)
Page with the highest rank: (60, 0.02650135383327059)


In [None]:
# Shut down the SparkContext (`sc`) once the notebook is finished with it.
sc.stop()