In [1]:
# Importing necessary packages

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import json

import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
spark_session = SparkSession.builder\
        .master("spark://192.168.2.156:7077") \
        .appName("Group 6 project")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores", 32)\
        .config("spark.cores.max", 32)\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .config("spark.hadoop.fs.defaultFS", "hdfs://192.168.2.156:9000")\
        .getOrCreate()

sqlContext = SQLContext(spark_session.sparkContext)
spark_session.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/15 15:32:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load JSON as a Spark DataFrame
df = spark_session.read.json("hdfs://192.168.2.156:9000/data/reddit/corpus-webis-tldr-17.json")

ERROR:root:KeyboardInterrupt while sending command.               (0 + 0) / 147]
Traceback (most recent call last):
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [14]:
df.show() #Works!

                                                                                

+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|            author|                body|             content|content_len|     id|      normalizedBody|           subreddit|subreddit_id|             summary|summary_len|   title|
+------------------+--------------------+--------------------+-----------+-------+--------------------+--------------------+------------+--------------------+-----------+--------+
|  raysofdarkmatter|I think it should...|I think it should...|        178|c69al3r|I think it should...|                math|    t5_2qh0n|Shifting seasonal...|          8|    NULL|
|           Stork13|Art is about the ...|Art is about the ...|        148|c6a9nxd|Art is about the ...|               funny|    t5_2qh33|Personal opinions...|          4|    NULL|
|     Cloud_dreamer|Ask me what I thi...|Ask me what I thi...|         76|c6acx4l|Ask me what I thi.

# Task 1: Reading level in subreddits
First, we analyse the language level using Flesch–Kincaid grade level.

In [15]:
# Making a new df for analyzing the reading level

# Preprocessing

# Splitting sentences
df_spell = df.select("subreddit", split(df["content"], r'[.!?]').alias("content_split"))

# Splitting words, removing punctuation and empty strings
df_words = df_spell.withColumn("content_split", expr("""transform(content_split, x -> filter(transform(split(x, ' '), word -> trim(regexp_replace(regexp_replace(word, '[\\n]', ''), '[,\\.\\!\\?:\\*\\(\\)]', ''))), word -> word != ''))"""))

df_words = df_words.filter(size(df_words["content_split"]) > 0)  # Remove empty rows

df_words.first()
print(f"The length is {df_words.count()}")



The length is 3848330


                                                                                

In [16]:
# Define the syllable count function
def syllable_count(word):
    # If the word is not alphabetic, return 0 (e.g., for numbers or symbols)
    if not word.isalpha():
        return 0
    
    word = word.lower()  # Ensure the word is in lowercase
    count = 0
    vowels = "aeiou"
    
    # Check the first letter
    if word[0] in vowels:
        count += 1
    
    # Count syllables in the rest of the word
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    
    # Special case: Subtract 1 if the word ends with 'e' (unless 'e' is preceded by a vowel)
    if word.endswith("e") and (len(word) > 1 and word[-2] not in vowels):
        count -= 1
    
    # Ensure that a word has at least one syllable
    if count == 0:
        count += 1
    
    # Limit reasonable syllable count (this can be adjusted based on context)
    if count < 10:  # Reasonable cap for most English words
        return count
    else:
        return 0

# Define the function to calculate the reading level (Flesch-Kincaid)
def reading_level(list_of_sentences):
    # Flatten the list of lists into a single list of words

    total_words = 0
    total_syllables = 0
    total_sentences = 0  # Since each row represents one sentence (as split by punctuation)

    # Iterate over each word in the flattened list and calculate total words and syllables
    for sentence in list_of_sentences:
        
        words_in_sentence = 0
        syllables_in_sentence = 0
        
        # Iterate over each word in the sentence
        for word in sentence:
            # Skip empty strings or unwanted characters (e.g., newlines)
            word = word.strip(",. !?:*()[]")
            if word.isalpha():  # Ensure the word is alphabetic
            words_in_sentence += 1  # Increment word count
            syllables_in_sentence += syllable_count(word)  # Add syllables for the word
                
        if words_in_sentence > 0 and words_in_sentence < 25: # Only add reasonable values
            total_words += words_in_sentence
            total_syllables += syllables_in_sentence
            total_sentences += 1

    # Avoid division by zero
    if total_words == 0 or total_sentences == 0:
        return 0

    # Calculate the Flesch-Kincaid readability index with the formula:
    # Reading Level = 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59
    level = 0.39*(total_words / total_sentences) + 11.8*(total_syllables / total_words) - 15.59
    return level

# # Define the UDF for reading level calculation
# def reading_level_udf(sentence_list):
#     return float(reading_level(sentence_list))

# Register the UDF
udf_reading_level = udf(lambda sentence_list: reading_level(sentence_list))

# Apply the UDF to calculate the reading level for each row
df_with_reading_level = df_words.withColumn("reading_level", udf_reading_level("content_split"))

# Show the result
df_with_reading_level.select("subreddit", "reading_level").show(truncate=False)

[Stage 6:>                                                          (0 + 1) / 1]

+---------------------+------------------+
|subreddit            |reading_level     |
+---------------------+------------------+
|math                 |6.725263157894737 |
|funny                |8.53345238095238  |
|Borderlands          |5.230000000000004 |
|gamingpc             |5.300161290322581 |
|Diablo               |6.498808026355196 |
|RedditLaqueristas    |4.662500000000001 |
|apple                |0                 |
|apple                |6.11554022988506  |
|RedditFilmsProduction|5.654348591549297 |
|AbandonedPorn        |3.165667858303543 |
|atheism              |1.785263157894736 |
|quotes               |1.8733333333333313|
|AskReddit            |7.2520000000000024|
|personalfinance      |7.143205128205128 |
|Animals              |2.4120000000000026|
|leagueoflegends      |2.0991029900332236|
|AskReddit            |6.541960784313726 |
|videos               |6.7040822644847395|
|trees                |10.441111111111113|
|Games                |9.338620689655173 |
+----------

                                                                                

In [None]:
# Show a couple of rows for debug

df_with_reading_level.show()

In [17]:
df_average_grade = df_with_reading_level.groupBy("subreddit").agg(avg("reading_level").alias("Avg_reading_level"))
df_average_grade.show()



+--------------------+------------------+
|           subreddit| Avg_reading_level|
+--------------------+------------------+
|               anime| 5.573502120644817|
|          MensRights| 5.664586099629072|
|              travel| 5.236568306859009|
|londonfootballmeetup| 3.759723514431316|
|               HPMOR| 6.007643517559738|
|     youtubecomments| 4.190449052872008|
|        SaltLakeCity| 4.787569081816676|
| UnresolvedMysteries|  5.49423359805998|
|          MLBTheShow| 4.315600725046564|
|           metro2033|6.4023427664974655|
|        marvelheroes| 4.673674815395976|
|             DRKCoin| 5.263462619218381|
|              AdPorn| 5.212461791974269|
|          costa_rica| 5.036286825136762|
|          television|  5.45124581114763|
|  Anarcho_Capitalism| 6.554010868398792|
|    fatpeoplestories|3.7711554108652505|
|       SanJoseSharks| 4.401430040561392|
|              Hawaii| 5.465975916369529|
|             wilfred|5.1613236841850965|
+--------------------+------------

                                                                                

In [18]:
# Top 10 subreddits with the worst (lowest) reading level
top_10_best = df_average_grade.orderBy("avg_reading_level", ascending=True).limit(10)

# Top 10 subreddits with the best (highest) reading level
top_10_worst = df_average_grade.orderBy("avg_reading_level", ascending=False).limit(10)

print("Top 10 best Subreddits:")
top_10_worst.select("subreddit", "avg_reading_level").show(truncate=False)

print("Top 10 worst Subreddits:")
top_10_best.select("subreddit", "avg_reading_level").show(truncate=False)

Top 10 best Subreddits:


                                                                                

+-------------------+------------------+
|subreddit          |avg_reading_level |
+-------------------+------------------+
|Webkinz            |27.597142857142853|
|PokePlazaReferences|26.490000000000006|
|SilphRoadMtnWest   |26.490000000000006|
|tacocovers         |20.980000000000008|
|faget              |20.590000000000007|
|spacecats          |20.200000000000006|
|italians           |19.27181818181818 |
|ClannadDiscussion  |16.549704163328652|
|Interlocken        |16.46166666666667 |
|reddit_court       |15.895            |
+-------------------+------------------+

Top 10 worst Subreddits:




+------------------+-------------------+
|subreddit         |avg_reading_level  |
+------------------+-------------------+
|cryptoparadise    |-15.200000000000001|
|Voting            |-15.2              |
|VideoLinkBot      |-15.2              |
|theunexplained    |-15.2              |
|RedditPersonality |-15.2              |
|CIRCLEJERKMILITIA |-15.2              |
|EXRBOAHRMOID      |-15.2              |
|sixthworldproblems|-14.810847212165097|
|spookyspider      |-14.067014954742227|
|newsokunomoral    |-13.25             |
+------------------+-------------------+



                                                                                

In [None]:
pdf = df_with_reading_level.select("reading_level").limit(5000).toPandas()

# Plot histogram
plt.hist(pdf["reading_level"], bins=10, edgecolor="black")
plt.xlabel("Reading Level")
plt.ylabel("Frequency")
plt.title("Histogram of Reading Levels")
plt.show()

In [None]:

# Plot histogram
plt.hist(pdf["reading_level"], bins=30, edgecolor="black")
plt.xlabel("Reading Level")
plt.ylabel("Frequency")
plt.title("Histogram of Reading Levels")
plt.show()

In [19]:
spark_session.stop()