In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("GutenbergAnalysis") \
    .getOrCreate()

spark


26/02/10 11:39:05 WARN Utils: Your hostname, DESKTOP-M0J6OKT resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
26/02/10 11:39:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/10 11:39:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql.functions import input_file_name

books_df = spark.read.text("/home/prapti/CSL7110_Assignment/Hadoop/Dataset/*.txt") \
    .withColumn("file_name", input_file_name()) \
    .withColumnRenamed("value", "text")

books_df.count()


                                                                                

4119082

In [4]:
from pyspark.sql.functions import collect_list, concat_ws

books_full = books_df.groupBy("file_name") \
    .agg(concat_ws("\n", collect_list("text")).alias("text"))

books_full.count()


                                                                                

425

In [5]:
from pyspark.sql.functions import regexp_extract

books_meta = books_full \
    .withColumn("title", regexp_extract("text", r"Title:\s*(.*)", 1)) \
    .withColumn("release_date", regexp_extract("text", r"Release Date:\s*(.*)", 1)) \
    .withColumn("language", regexp_extract("text", r"Language:\s*(.*)", 1)) \
    .withColumn("encoding", regexp_extract("text", r"Character set encoding:\s*(.*)", 1))

books_meta.select("file_name", "title", "release_date", "language", "encoding").show(5, truncate=False)


                                                                                

+-------------------------------------------------------------+---------------------------+------------------------+--------+--------+
|file_name                                                    |title                      |release_date            |language|encoding|
+-------------------------------------------------------------+---------------------------+------------------------+--------+--------+
|file:///home/prapti/CSL7110_Assignment/Hadoop/Dataset/22.txt |Roget's Thesaurus          |December, 1991          |English |ASCII   |
|file:///home/prapti/CSL7110_Assignment/Hadoop/Dataset/351.txt|Of Human Bondage           |May 6, 2008 [EBook #351]|English |ASCII   |
|file:///home/prapti/CSL7110_Assignment/Hadoop/Dataset/87.txt |The 1993 CIA World Factbook|October, 1993           |English |ASCII   |
|file:///home/prapti/CSL7110_Assignment/Hadoop/Dataset/349.txt|The Harvester              |October, 1995           |English |ASCII   |
|file:///home/prapti/CSL7110_Assignment/Hadoop/Dataset/

In [6]:
from pyspark.sql.functions import regexp_extract

books_meta = books_meta.withColumn(
    "year",
    regexp_extract("release_date", r"(\d{4})", 1)
)

books_meta.select("release_date", "year").show(5, truncate=False)




+------------------------+----+
|release_date            |year|
+------------------------+----+
|December, 1991          |1991|
|May 6, 2008 [EBook #351]|2008|
|October, 1993           |1993|
|October, 1995           |1995|
|March, 1995             |1995|
+------------------------+----+
only showing top 5 rows



                                                                                

In [7]:
books_meta.groupBy("year") \
    .count() \
    .orderBy("year") \
    .show(10)




+----+-----+
|year|count|
+----+-----+
|    |   17|
|1975|    1|
|1978|    1|
|1979|    1|
|1991|    7|
|1992|   19|
|1993|   13|
|1994|   17|
|1995|   60|
|1996|   53|
+----+-----+
only showing top 10 rows



                                                                                

In [8]:
books_meta.groupBy("language") \
    .count() \
    .orderBy("count", ascending=False) \
    .show()




+--------+-----+
|language|count|
+--------+-----+
| English|  404|
|        |   15|
|   Latin|    6|
+--------+-----+



                                                                                

In [9]:
from pyspark.sql.functions import length, avg

books_meta.select(avg(length("title")).alias("average_title_length")).show()




+--------------------+
|average_title_length|
+--------------------+
|  22.023529411764706|
+--------------------+



                                                                                