In [13]:
from pyspark.sql import SparkSession

# New API
spark_session = SparkSession.builder\
        .master("spark://192.168.2.156:7077") \
        .appName("Tove_ProjectGroup35")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

25/03/07 08:04:29 WARN Utils: Service 'SparkUI' could not bind on port 8080. Attempting port 8081.
25/03/07 08:04:29 WARN Utils: Service 'SparkUI' could not bind on port 8081. Attempting port 8082.


In [37]:
# Importing the file into a DataFrame
reddit_df = spark_session.read.json("hdfs://192.168.2.156:9000/data/reddit/reddit_50k.json")

# Show the schema to understand the structure
reddit_df.printSchema()



root
 |-- _corrupt_record: string (nullable = true)
 |-- author: string (nullable = true)
 |-- body: string (nullable = true)
 |-- content: string (nullable = true)
 |-- content_len: long (nullable = true)
 |-- id: string (nullable = true)
 |-- normalizedBody: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- summary_len: long (nullable = true)
 |-- title: string (nullable = true)



                                                                                

In [41]:
from pyspark.sql.functions import lower, regexp_replace, col, when, count, sum as spark_sum

# Preprocessing
reddit_preprocessed = reddit_df.select("subreddit", "body")

# Drop all subreddits where body AND subreddit is NULL
reddit_preprocessed = reddit_preprocessed.dropna(how="all", subset=["subreddit", "body"])

# Fill missing subreddit with "unknown"
reddit_preprocessed = reddit_preprocessed.withColumn("subreddit", when(col("subreddit").isNull(), "unknown").otherwise(col("subreddit")))

# Fill missing body with an empty string
reddit_preprocessed = reddit_preprocessed.withColumn("body", when(col("body").isNull(), "").otherwise(col("body")))

# Normalizing the text: lowercase and remove punctuation
reddit_preprocessed = reddit_preprocessed.withColumn("body", lower(col("body")))
reddit_preprocessed = reddit_preprocessed.withColumn("body", regexp_replace(col("body"), "[^a-z\s]", ""))  # Remove non-alphabetic chars

# Show five samples
reddit_preprocessed.show(10)

+--------------------+--------------------+
|           subreddit|                body|
+--------------------+--------------------+
|                math|i think it should...|
|               funny|art is about the ...|
|         Borderlands|ask me what i thi...|
|            gamingpc|in mechwarrior on...|
|              Diablo|you are talking a...|
|   RedditLaqueristas|all but one of my...|
|               apple|i could give a sh...|
|               apple|so youre saying t...|
|RedditFilmsProduc...|i love this idea ...|
|       AbandonedPorn|theres an entire ...|
+--------------------+--------------------+
only showing top 10 rows



                                                                                

In [44]:
# Defining a list of the largest countries in the world
countries = ["China", "India", "United States", "Indonesia", "Pakistan", 
             "Brazil", "Nigeria", "Bangladesh", "Russia", "Mexico"]

def country_mentions_counter(df):
    for country in countries:
        df = df.withColumn(country, when(col("body").contains(country.lower()), 1).otherwise(0))

    country_mentions = df.groupBy("subreddit").sum(*countries)

    # Summarize total mentions across all subreddits
    total_mentions = country_mentions.select(
        *[spark_sum(col(f"sum({country})")).alias(country) for country in countries]
    )

    return country_mentions, total_mentions
    

In [47]:
# Execution time measurement
import time
import psutil

# Capture system resource usage before execution
cpu_before = psutil.cpu_percent(interval=None)
memory_before = psutil.virtual_memory().used / (1024 ** 3)  # Convert bytes to GB

# Start time measurement
start_time = time.time()

# Call the function
subreddit_mentions, total_mentions = country_mentions_counter(reddit_preprocessed)

# End time measurement
end_time = time.time()

# Capture system resource usage after execution
cpu_after = psutil.cpu_percent(interval=None)
memory_after = psutil.virtual_memory().used / (1024 ** 3)  # Convert bytes to GB

# Peak memory usage
memory_peak = psutil.Process().memory_info().rss / (1024 ** 3)  # Resident Set Size (RSS) in GB

# Execution statistics
execution_time = end_time - start_time
cpu_usage = cpu_after - cpu_before
memory_usage = memory_after - memory_before

# Show results
total_mentions.show()  # Show total mentions across all subreddits
print(f"Execution Time: {execution_time:.4f} seconds")
print(f"CPU Usage Change: {cpu_usage:.2f}%")
print(f"Memory Usage Change: {memory_usage:.4f} GB")
print(f"Peak Memory Usage: {memory_peak:.4f} GB")




+-----+-----+-------------+---------+--------+------+-------+----------+------+------+
|China|India|United States|Indonesia|Pakistan|Brazil|Nigeria|Bangladesh|Russia|Mexico|
+-----+-----+-------------+---------+--------+------+-------+----------+------+------+
|  332|  328|          268|       17|      62|    77|     14|        11|   333|   110|
+-----+-----+-------------+---------+--------+------+-------+----------+------+------+

Execution Time: 0.1178 seconds
CPU Usage Change: 20.30%
Memory Usage Change: -0.0000 GB
Peak Memory Usage: 0.0998 GB


                                                                                

In [35]:
# Summing mentions across all subreddits
total_mentions = reddit_df.agg(
    *(spark_sum(country).alias(country) for country in countries)
)

# Show total mentions of each country
total_mentions.show()



+-----+-----+-------------+---------+--------+------+-------+----------+------+------+
|China|India|United States|Indonesia|Pakistan|Brazil|Nigeria|Bangladesh|Russia|Mexico|
+-----+-----+-------------+---------+--------+------+-------+----------+------+------+
| 1270|  637|         1104|       91|     180|   195|     52|        28|   787|   489|
+-----+-----+-------------+---------+--------+------+-------+----------+------+------+



                                                                                

In [48]:
# Terminate session
spark_session.stop()