In [13]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Author_Influence_Network") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()

spark


In [14]:
from pyspark.sql.functions import input_file_name

books_df = spark.read.text("/home/prapti/CSL7110_Assignment/Hadoop/Dataset/*.txt") \
    .withColumn("file_name", input_file_name()) \
    .withColumnRenamed("value", "text")

books_df.count()


                                                                                

4119082

In [15]:
from pyspark.sql.functions import collect_list, concat_ws

books_full = books_df.groupBy("file_name") \
    .agg(concat_ws("\n", collect_list("text")).alias("text"))

books_full.count()


                                                                                

425

In [16]:
from pyspark.sql.functions import regexp_extract

books_meta = books_full \
    .withColumn("author", regexp_extract("text", r"Author:\s*(.*)", 1)) \
    .withColumn("release_date", regexp_extract("text", r"Release Date:\s*(.*)", 1)) \
    .withColumn("year", regexp_extract("release_date", r"(\d{4})", 1))

books_meta.select("author", "year").show(5, truncate=False)




+--------------------+----+
|author              |year|
+--------------------+----+
|William Morris      |2008|
|Unknown             |2008|
|Gene Stratton Porter|1995|
|Virginia Woolf      |2006|
|Plato               |2008|
+--------------------+----+
only showing top 5 rows



                                                                                

In [17]:
books_clean = books_meta.filter(
    (books_meta.author != "") & (books_meta.year != "")
)

books_clean.count()


                                                                                

407

In [18]:
from pyspark.sql.functions import col

books_clean = books_clean.withColumn("year", col("year").cast("int"))

books_clean.select("author", "year").show(5)




+--------------------+----+
|              author|year|
+--------------------+----+
|      William Morris|2008|
|             Unknown|2008|
|Gene Stratton Porter|1995|
|      Virginia Woolf|2006|
|               Plato|2008|
+--------------------+----+
only showing top 5 rows



                                                                                

In [7]:
from pyspark.sql.functions import col

for X in [3, 5, 10]:
    
    a = books_clean.alias("a")
    b = books_clean.alias("b")

    edges_temp = a.join(
        b,
        (col("a.author") != col("b.author")) &
        (col("a.year") < col("b.year")) &
        ((col("b.year") - col("a.year")) <= X)
    ).select(
        col("a.author"),
        col("b.author")
    ).distinct()

    print(f"X = {X}, Edge count = {edges_temp.count()}")


                                                                                

X = 3, Edge count = 10853


                                                                                

X = 5, Edge count = 12241




X = 10, Edge count = 14460


                                                                                

This shows that increasing X results in a denser network with more potential influence relationships. A smaller X produces a more selective and sparse network, while a larger X creates broader and less restrictive influence connections.
So, now will be choosing X=5 as it provides a balancing trade off between too sparse network (fewer edges) and denser network (many edges) 

In [19]:
X = 5

In [20]:
from pyspark.sql.functions import col

a = books_clean.alias("a")
b = books_clean.alias("b")

edges = a.join(
    b,
    (col("a.author") != col("b.author")) &
    (col("a.year") < col("b.year")) &
    ((col("b.year") - col("a.year")) <= X)
).select(
    col("a.author").alias("author1"),
    col("b.author").alias("author2")
).distinct()

edges.show(5, truncate=False)





+-----------------------------------------+----------------------+
|author1                                  |author2               |
+-----------------------------------------+----------------------+
|United States Central Intelligence Agency|Robert Louis Stevenson|
|United States Central Intelligence Agency|Jerome K. Jerome      |
|Anonymous                                |Geoffrey Chaucer      |
|Anonymous                                |John Muir             |
|Anonymous                                |Eugene Field          |
+-----------------------------------------+----------------------+
only showing top 5 rows



                                                                                

In [21]:
out_degree = edges.groupBy("author1") \
    .count() \
    .withColumnRenamed("count", "out_degree")

out_degree.show(5)




+--------------------+----------+
|             author1|out_degree|
+--------------------+----------+
|     Charles Dickens|       193|
|       Bayard Taylor|         5|
|         Jane Austen|        94|
|David Graham Phil...|       134|
|         Rene Doumic|       109|
+--------------------+----------+
only showing top 5 rows



                                                                                

In [22]:
in_degree = edges.groupBy("author2") \
    .count() \
    .withColumnRenamed("count", "in_degree")

in_degree.show(5)




+--------------------+---------+
|             author2|in_degree|
+--------------------+---------+
|       Stephen Crane|      129|
|       Bayard Taylor|       51|
|         Jane Austen|       77|
|David Graham Phil...|       97|
|     Charles Dickens|       31|
+--------------------+---------+
only showing top 5 rows



                                                                                

In [23]:
out_degree.orderBy("out_degree", ascending=False).show(5, truncate=False)




+----------------------+----------+
|author1               |out_degree|
+----------------------+----------+
|Edgar Rice Burroughs  |206       |
|Charles Dickens       |193       |
|Lucy Maud Montgomery  |192       |
|Thomas Hardy          |192       |
|Robert Louis Stevenson|192       |
+----------------------+----------+
only showing top 5 rows



                                                                                

In [None]:
in_degree.orderBy("in_degree", ascending=False).show(5, truncate=False)
