In [0]:
pip install NRCLex

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, udf, when, count, avg, max, min, row_number
from pyspark.sql.types import StringType, FloatType, IntegerType
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from textblob import TextBlob
import dlt

# Get the active Spark session, or build one, named "Enhanced Reddit Gold Layer"
# and configured with the Delta Lake SQL extension and Delta catalog.
spark = (
    SparkSession.builder
    .appName("Enhanced Reddit Gold Layer")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

In [0]:
# --- Sentiment Analysis Functions ---
def calculate_sentiment_polarity(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Falsy input (for example ``None`` or an empty string) is never handed to
    TextBlob; ``0.0`` is returned for it instead.
    """
    if not text:
        return 0.0
    return TextBlob(text).sentiment.polarity

def classify_sentiment(polarity):
    """Map a polarity score to a sentiment label.

    Args:
        polarity: Numeric polarity score, or ``None``.

    Returns:
        ``None`` when ``polarity`` is ``None``; otherwise one of
        ``"Very Positive"`` (> 0.5), ``"Positive"`` (> 0.1),
        ``"Very Negative"`` (< -0.5), ``"Negative"`` (< -0.1) or
        ``"Neutral"`` (everything else, including the boundary values
        themselves).
    """
    # A missing score cannot be compared with a number (TypeError in Python 3);
    # pass the missing value through as a missing label instead of raising.
    if polarity is None:
        return None

    if polarity > 0.5:
        return "Very Positive"
    if polarity > 0.1:
        return "Positive"
    if polarity < -0.5:
        return "Very Negative"
    if polarity < -0.1:
        return "Negative"
    return "Neutral"
    
# Wrap the plain Python helpers as Spark UDFs so they can be applied to DataFrame columns.
# NOTE(review): FloatType is single precision; a polarity sitting exactly on a
# classify_sentiment threshold (e.g. 0.1) may move across it after the round-trip
# through the column - confirm whether that matters for the labels.
udf_classify_sentiment = udf(classify_sentiment, returnType=StringType())
udf_calculate_sentiment_polarity = udf(calculate_sentiment_polarity, returnType=FloatType())

In [0]:
@dlt.table(
    name="gold_reddit_posts",
    comment="Gold Layer with enhanced Reddit post data"
)
def gold_reddit_posts():
    """Build the gold table: silver Reddit posts plus sentiment columns.

    Reads ``silver_reddit_posts`` via ``dlt.read``, adds a polarity score and a
    sentiment label for both ``title`` and ``description``, and returns only the
    columns listed in ``gold_columns``.
    """
    silver_posts = dlt.read("silver_reddit_posts")

    # Each label column is derived from the polarity column added just before it,
    # so the order of these steps matters.
    with_title_score = silver_posts.withColumn(
        "title_polarity", udf_calculate_sentiment_polarity(F.col("title"))
    )
    with_title_label = with_title_score.withColumn(
        "title_sentiment", udf_classify_sentiment(F.col("title_polarity"))
    )
    with_description_score = with_title_label.withColumn(
        "description_polarity", udf_calculate_sentiment_polarity(F.col("description"))
    )
    scored_posts = with_description_score.withColumn(
        "description_sentiment", udf_classify_sentiment(F.col("description_polarity"))
    )

    # Columns exposed by the gold table, in output order.
    gold_columns = [
        "post_id",
        "title",
        "description",
        "score",
        "created_at",
        "title_polarity",
        "title_sentiment",
        "description_polarity",
        "description_sentiment",
    ]
    return scored_posts.select(*gold_columns)