In [0]:
# !pip install textblob vaderSentiment scikit-learn

Collecting textblob
  Obtaining dependency information for textblob from https://files.pythonhosted.org/packages/02/07/5fd2945356dd839974d3a25de8a142dc37293c21315729a41e775b5f3569/textblob-0.18.0.post0-py3-none-any.whl.metadata
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Collecting vaderSentiment
  Obtaining dependency information for vaderSentiment from https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/626.3 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/626.3 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/62

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from textblob import TextBlob
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml import Pipeline

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Reddit Sentiment & Emotion Analysis") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

### Define UDFs for Sentiment and Emotion Analysis
Next, define functions for sentiment polarity (using TextBlob) and emotion detection based on sentiment polarity.

In [0]:
# Define UDFs for sentiment and emotion analysis
def get_sentiment(text):
    if text:
        blob = TextBlob(text)
        return blob.sentiment.polarity
    return 0.0  # Return 0.0 for empty or null texts

def get_emotion(text):
    sentiment = get_sentiment(text)
    if sentiment > 0.1:
        return "Positive"
    elif sentiment < -0.1:
        return "Negative"
    else:
        return "Neutral"

# Register UDFs for sentiment and emotion analysis
spark.udf.register("get_sentiment", get_sentiment)
spark.udf.register("get_emotion", get_emotion)

<function __main__.get_emotion(text)>

### Load Data from Silver Layer

In [0]:
# Load data from the Silver layer (assuming it is stored in Delta format)
silver_df = spark.read.format("delta").table("big_data_analytics_v.big_data_analytics_sesssion_v.silver_reddit_posts")

### Apply Sentiment and Emotion Analy sis  on 'title' and 'description'
This part involves applying the sentiment and emotion analysis functions (get_sentiment and get_emotion) to the title and description columns of the Reddit posts.

In [0]:
# Perform sentiment and emotion analysis on 'title' and 'description'
transformed_df = silver_df.withColumn("title_polarity", F.expr("get_sentiment(title)")) \
                          .withColumn("title_emotion", F.expr("get_emotion(title)")) \
                          .withColumn("description_polarity", F.expr("get_sentiment(description)")) \
                          .withColumn("description_emotion", F.expr("get_emotion(description)"))

### TF-IDF Feature Extraction
This part involves applying the sentiment and emotion analysis functions (get_sentiment and get_emotion) to the title and description columns of the Reddit posts.

In [0]:
# Replace null values in 'title' and 'description' columns with empty strings
transformed_df = transformed_df.fillna({'title': '', 'description': ''})

# Perform TF-IDF Feature Extraction for 'title' and 'description'
tokenizer_title = Tokenizer(inputCol="title", outputCol="title_words")
tokenizer_description = Tokenizer(inputCol="description", outputCol="description_words")

hashing_tf_title = HashingTF(inputCol="title_words", outputCol="title_tfidf")
hashing_tf_description = HashingTF(inputCol="description_words", outputCol="description_tfidf")

idf_title = IDF(inputCol="title_tfidf", outputCol="title_tfidf_features")
idf_description = IDF(inputCol="description_tfidf", outputCol="description_tfidf_features")

# Create a pipeline for TF-IDF feature extraction
pipeline = Pipeline(stages=[tokenizer_title, tokenizer_description, hashing_tf_title, 
                            hashing_tf_description, idf_title, idf_description])

# Fit and transform the data to extract features
pipeline_model = pipeline.fit(transformed_df)
final_df = pipeline_model.transform(transformed_df)

# Display the final DataFrame
display(final_df)

Downloading artifacts:   0%|          | 0/45 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

post_id,title,description,subreddit,author,score,created_at,url,title_polarity,title_emotion,description_polarity,description_emotion,title_words,description_words,title_tfidf,description_tfidf,title_tfidf_features,description_tfidf_features
1h8jsw4,What has Major to do with Aviation ?,"Koi sharam hoti hey, koi haya hoti hey",pakistan,DanishJaved,566,2024-12-07T03:43:29Z,https://i.redd.it/56xdl83elc5e1.jpeg,0.0625,Neutral,0.0,Neutral,"List(what, has, major, to, do, with, aviation, ?)","List(koi, sharam, hoti, hey,, koi, haya, hoti, hey)","Map(vectorType -> sparse, length -> 262144, indices -> List(27576, 32869, 81566, 115994, 126466, 132786, 133107, 151058), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(36821, 61756, 92093, 158962, 159989, 193464), values -> List(1.0, 1.0, 2.0, 2.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(27576, 32869, 81566, 115994, 126466, 132786, 133107, 151058), values -> List(1.5686159179138452, 3.1780538303479458, 2.2617630984737906, 2.4849066497880004, 2.772588722239781, 2.4849066497880004, 3.1780538303479458, 2.772588722239781))","Map(vectorType -> sparse, length -> 262144, indices -> List(36821, 61756, 92093, 158962, 159989, 193464), values -> List(3.1780538303479458, 2.772588722239781, 6.3561076606958915, 6.3561076606958915, 3.1780538303479458, 3.1780538303479458))"
1h8kuhq,Reality of Pakistan Stock Market,,pakistan,Nixture24,294,2024-12-07T04:44:02Z,https://v.redd.it/zl16ene5wc5e1,0.0,Neutral,0.0,Neutral,"List(reality, of, pakistan, stock, market)",List(),"Map(vectorType -> sparse, length -> 262144, indices -> List(8145, 18697, 51576, 97454, 219087), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(8145, 18697, 51576, 97454, 219087), values -> List(3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 1.4733057381095203, 1.4733057381095203))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(0.31585294941847725))"
1h8pj8y,"مولوی طاہر اشرفی کے والد کی ""ايمان افروز"" تحریر",ہمارے مولویوں نے کوئی کسر نہیں چھوڑی,pakistan,desolatoration,92,2024-12-07T10:12:26Z,https://i.redd.it/8s6wtecsie5e1.jpeg,0.0,Neutral,0.0,Neutral,"List(مولوی, طاہر, اشرفی, کے, والد, کی, ""ايمان, افروز"", تحریر)","List(ہمارے, مولویوں, نے, کوئی, کسر, نہیں, چھوڑی)","Map(vectorType -> sparse, length -> 262144, indices -> List(34365, 92443, 96533, 106356, 133991, 168014, 248527, 249626, 260523), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(12316, 51476, 60101, 60711, 67217, 173925, 254370), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(34365, 92443, 96533, 106356, 133991, 168014, 248527, 249626, 260523), values -> List(3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))","Map(vectorType -> sparse, length -> 262144, indices -> List(12316, 51476, 60101, 60711, 67217, 173925, 254370), values -> List(3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))"
1h8r4sf,Pakistan Air Force to Field Chinese J-35 Fifth Generation Fighters Before 2027 - Reports,"I have seen other few reports of Pakistan increasing its naval fleet, strengthening it's navy etc. Im not against this, pakistan is completely in its right to improve its defence. But I thought pakistan is having an economic crisis at the moment, how is it affording to do it? Or am I just out of loop and it's normal. I haven't seen something like this anywhere else.",pakistan,weebmaster696,54,2024-12-07T12:07:57Z,https://militarywatchmagazine.com/article/pakistan-j35-fifth-gen-before-2027,0.0,Neutral,0.0684523809523809,Neutral,"List(pakistan, air, force, to, field, chinese, j-35, fifth, generation, fighters, before, 2027, -, reports)","List(i, have, seen, other, few, reports, of, pakistan, increasing, its, naval, fleet,, strengthening, it's, navy, etc., im, not, against, this,, pakistan, is, completely, in, its, right, to, improve, its, defence., but, i, thought, pakistan, is, having, an, economic, crisis, at, the, moment,, how, is, it, affording, to, do, it?, or, am, i, just, out, of, loop, and, it's, normal., i, haven't, seen, something, like, this, anywhere, else.)","Map(vectorType -> sparse, length -> 262144, indices -> List(2016, 27576, 38640, 56103, 76772, 97454, 98424, 100262, 109616, 151412, 192450, 208459, 228444, 257347), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(1603, 2076, 12472, 14575, 19036, 19043, 27576, 30950, 31015, 31970, 33917, 57058, 59593, 59615, 63976, 67288, 73039, 84173, 84237, 95805, 95889, 97171, 97454, 106841, 108541, 108647, 115994, 119453, 134392, 140315, 142343, 143202, 162991, 176257, 183938, 186593, 186845, 187114, 190473, 197443, 197605, 208258, 211473, 219087, 219915, 221693, 228444, 229166, 238804, 245487, 250855, 253475, 257091, 258418), values -> List(1.0, 1.0, 1.0, 1.0, 4.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(2016, 27576, 38640, 56103, 76772, 97454, 98424, 100262, 109616, 151412, 192450, 208459, 228444, 257347), values -> List(3.1780538303479458, 1.5686159179138452, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 1.4733057381095203, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))","Map(vectorType -> sparse, length -> 262144, indices -> List(1603, 2076, 12472, 14575, 19036, 19043, 27576, 30950, 31015, 31970, 33917, 57058, 59593, 59615, 63976, 67288, 73039, 84173, 84237, 95805, 95889, 97171, 97454, 106841, 108541, 108647, 115994, 119453, 134392, 140315, 142343, 143202, 162991, 176257, 183938, 186593, 186845, 187114, 190473, 197443, 197605, 208258, 211473, 219087, 219915, 221693, 228444, 229166, 238804, 245487, 250855, 253475, 257091, 258418), values -> List(3.1780538303479458, 3.1780538303479458, 2.772588722239781, 3.1780538303479458, 5.545177444479562, 3.1780538303479458, 2.1972245773362196, 1.3062516534463542, 3.1780538303479458, 3.1780538303479458, 1.6739764335716716, 2.772588722239781, 3.1780538303479458, 2.772588722239781, 3.1780538303479458, 2.772588722239781, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 4.1588830833596715, 1.037987666851675, 1.9252908618525775, 5.375278407684165, 4.1588830833596715, 1.3862943611198906, 2.772588722239781, 1.791759469228055, 1.4733057381095203, 3.1780538303479458, 8.317766166719343, 3.1780538303479458, 1.9252908618525775, 3.1780538303479458, 2.2617630984737906, 2.2617630984737906, 5.545177444479562, 2.4849066497880004, 2.2617630984737906, 3.1780538303479458, 1.791759469228055, 3.1780538303479458, 1.791759469228055, 3.1780538303479458, 2.1972245773362196, 1.0986122886681098, 1.9252908618525775, 3.1780538303479458, 2.4849066497880004, 3.1780538303479458, 3.1780538303479458, 1.2321436812926323, 1.791759469228055, 3.1780538303479458, 3.1780538303479458))"
1h8r6bg,Protest against AJK ordinance intensifies,,pakistan,Nixture24,344,2024-12-07T12:10:46Z,https://v.redd.it/597jgx2v3f5e1,0.0,Neutral,0.0,Neutral,"List(protest, against, ajk, ordinance, intensifies)",List(),"Map(vectorType -> sparse, length -> 262144, indices -> List(59593, 62407, 99590, 185729, 209034), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(59593, 62407, 99590, 185729, 209034), values -> List(2.772588722239781, 2.772588722239781, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(0.31585294941847725))"
1h8so0o,BBC News report on British Mirpuris,,pakistan,Ok-Affect-5198,126,2024-12-07T13:38:07Z,https://v.redd.it/6eyn7qadjf5e1,0.0,Neutral,0.0,Neutral,"List(bbc, news, report, on, british, mirpuris)",List(),"Map(vectorType -> sparse, length -> 262144, indices -> List(49185, 67416, 73241, 88966, 95005, 216618), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(49185, 67416, 73241, 88966, 95005, 216618), values -> List(3.1780538303479458, 1.9252908618525775, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(0.31585294941847725))"
1h8vq0m,"Early morning snaps from Model town park, Lahore",,pakistan,hopefull420,427,2024-12-07T16:10:30Z,https://www.reddit.com/gallery/1h8vq0m,0.1,Neutral,0.0,Neutral,"List(early, morning, snaps, from, model, town, park,, lahore)",List(),"Map(vectorType -> sparse, length -> 262144, indices -> List(99179, 101169, 111910, 131672, 165595, 187436, 199020, 208158), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(99179, 101169, 111910, 131672, 165595, 187436, 199020, 208158), values -> List(3.1780538303479458, 2.2617630984737906, 3.1780538303479458, 3.1780538303479458, 2.4849066497880004, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(0.31585294941847725))"
1h8wjtq,Peshawar near to SUNSET ☀️,,pakistan,Mann_009,15,2024-12-07T16:48:25Z,https://i.redd.it/kbaxdvb9hg5e1.jpeg,0.1,Neutral,0.0,Neutral,"List(peshawar, near, to, sunset, ☀️)",List(),"Map(vectorType -> sparse, length -> 262144, indices -> List(27576, 54806, 139370, 141613, 150278), values -> List(1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(27576, 54806, 139370, 141613, 150278), values -> List(1.5686159179138452, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(0.31585294941847725))"
1h8xg9x,Even Snakes can't survive Punjab polss,,pakistan,Logical-Mail3534,14,2024-12-07T17:28:46Z,https://i.redd.it/bqi2dufkog5e1.jpeg,0.0,Neutral,0.0,Neutral,"List(even, snakes, can't, survive, punjab, polss)",List(),"Map(vectorType -> sparse, length -> 262144, indices -> List(77396, 142577, 174966, 207897, 216445, 217594), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(77396, 142577, 174966, 207897, 216445, 217594), values -> List(3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))","Map(vectorType -> sparse, length -> 262144, indices -> List(249180), values -> List(0.31585294941847725))"
1h8y1yb,Beautiful kid on Islamabad streets selling flowers 🥺😭,Please try to help them ( ik we should not entertain beggars ) but 30-100rps won't do you any harm.,pakistan,Business_Ad_8925,462,2024-12-07T17:56:36Z,https://i.redd.it/01zd5viltg5e1.png,0.85,Positive,0.0,Neutral,"List(beautiful, kid, on, islamabad, streets, selling, flowers, 🥺😭)","List(please, try, to, help, them, (, ik, we, should, not, entertain, beggars, ), but, 30-100rps, won't, do, you, any, harm.)","Map(vectorType -> sparse, length -> 262144, indices -> List(5451, 62124, 67416, 84657, 137582, 211325, 218806, 259145), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(27576, 33917, 34116, 49013, 77053, 88493, 92566, 115994, 124612, 137423, 150039, 156084, 166368, 167150, 170996, 196996, 213605, 214962, 221693, 239859), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> sparse, length -> 262144, indices -> List(5451, 62124, 67416, 84657, 137582, 211325, 218806, 259145), values -> List(3.1780538303479458, 3.1780538303479458, 1.9252908618525775, 2.772588722239781, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458))","Map(vectorType -> sparse, length -> 262144, indices -> List(27576, 33917, 34116, 49013, 77053, 88493, 92566, 115994, 124612, 137423, 150039, 156084, 166368, 167150, 170996, 196996, 213605, 214962, 221693, 239859), values -> List(1.0986122886681098, 1.6739764335716716, 2.772588722239781, 2.4849066497880004, 2.4849066497880004, 3.1780538303479458, 3.1780538303479458, 1.791759469228055, 3.1780538303479458, 3.1780538303479458, 3.1780538303479458, 2.4849066497880004, 2.772588722239781, 3.1780538303479458, 2.4849066497880004, 3.1780538303479458, 2.772588722239781, 1.4733057381095203, 1.9252908618525775, 2.772588722239781))"


### Topic Modeling
Topic modeling can help discover the main themes in your title and description text. We will use Latent Dirichlet Allocation (LDA) for topic modeling.

In [0]:
from pyspark.ml.clustering import LDA
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

# If 'combined_features' already exists, remove it
if 'combined_features' in final_df.columns:
    final_df = final_df.drop('combined_features')

# Combine title and description TF-IDF features
assembler = VectorAssembler(
    inputCols=["title_tfidf_features", "description_tfidf_features"],
    outputCol="combined_features"
)
final_df = assembler.transform(final_df)

# Apply LDA for topic modeling (k=5 topics with default settings)
lda = LDA(
    k=2,  # Set number of topics
    maxIter=1,  # Set number of iterations
    featuresCol="combined_features",  # Use combined features
    topicDistributionCol="topicDistribution"  # Assign topic distributions
)
lda_model = lda.fit(final_df)
topics_df = lda_model.transform(final_df)

# Show the top topics and associated data
display(topics_df.select("topicDistribution", "title", "description"))

topicDistribution,title,description
"Map(vectorType -> dense, length -> 2, values -> List(0.942437969557363, 0.05756203044263703))",What has Major to do with Aviation ?,"Koi sharam hoti hey, koi haya hoti hey"
"Map(vectorType -> dense, length -> 2, values -> List(0.9365000033020255, 0.06349999669797453))",Reality of Pakistan Stock Market,
"Map(vectorType -> dense, length -> 2, values -> List(0.02735413443636068, 0.9726458655636394))","مولوی طاہر اشرفی کے والد کی ""ايمان افروز"" تحریر",ہمارے مولویوں نے کوئی کسر نہیں چھوڑی
"Map(vectorType -> dense, length -> 2, values -> List(0.01161275829008726, 0.9883872417099128))",Pakistan Air Force to Field Chinese J-35 Fifth Generation Fighters Before 2027 - Reports,"I have seen other few reports of Pakistan increasing its naval fleet, strengthening it's navy etc. Im not against this, pakistan is completely in its right to improve its defence. But I thought pakistan is having an economic crisis at the moment, how is it affording to do it? Or am I just out of loop and it's normal. I haven't seen something like this anywhere else."
"Map(vectorType -> dense, length -> 2, values -> List(0.08271783154087572, 0.9172821684591242))",Protest against AJK ordinance intensifies,
"Map(vectorType -> dense, length -> 2, values -> List(0.9483695943421115, 0.05163040565788851))",BBC News report on British Mirpuris,
"Map(vectorType -> dense, length -> 2, values -> List(0.04480150435931044, 0.9551984956406895))","Early morning snaps from Model town park, Lahore",
"Map(vectorType -> dense, length -> 2, values -> List(0.9235194111522739, 0.07648058884772614))",Peshawar near to SUNSET ☀️,
"Map(vectorType -> dense, length -> 2, values -> List(0.05558366613235218, 0.9444163338676478))",Even Snakes can't survive Punjab polss,
"Map(vectorType -> dense, length -> 2, values -> List(0.9771885220328391, 0.022811477967160963))",Beautiful kid on Islamabad streets selling flowers 🥺😭,Please try to help them ( ik we should not entertain beggars ) but 30-100rps won't do you any harm.


### Sentiment Trends Visualization

In [0]:
# Extract date or month for trend analysis
final_df = final_df.withColumn("date", F.to_date(final_df["created_at"]))

# Optionally, if you want to analyze by month
final_df = final_df.withColumn("month", F.date_format(final_df["created_at"], "yyyy-MM"))

# Optionally, if you want to analyze by day of the week
final_df = final_df.withColumn("day_of_week", F.dayofweek(final_df["created_at"]))

# Group by date or month and calculate average sentiment polarity
sentiment_trends_df = final_df.groupBy("month").agg(
    F.avg("title_polarity").alias("avg_title_polarity"),
    F.avg("description_polarity").alias("avg_description_polarity")
)

# Display the aggregated sentiment data
sentiment_trends_df.show()

+-------+-------------------+------------------------+
|  month| avg_title_polarity|avg_description_polarity|
+-------+-------------------+------------------------+
|2024-12|0.03953900709219858|    0.004020307399161646|
+-------+-------------------+------------------------+



In [0]:
import plotly.express as px

# Assuming sentiment_trends_df is already calculated as shown in your code above
sentiment_trends_pandas = sentiment_trends_df.toPandas()

# Create an interactive plot for sentiment trends
fig = px.line(sentiment_trends_pandas, x='month', y=['avg_title_polarity', 'avg_description_polarity'],
              labels={'month': 'Month', 'value': 'Average Sentiment Polarity'},
              title='Sentiment Trends Over Time')

# Add markers for better visualization
fig.update_traces(mode='lines+markers')

# Customize layout for better appearance
fig.update_layout(
    xaxis_title="Month",
    yaxis_title="Average Sentiment Polarity",
    title="Sentiment Trends Over Time",
    template="plotly_dark",  # Optional: Use a dark theme
    autosize=True
)

# Show the interactive plot
fig.show()

Exception ignored in: <function JavaWrapper.__del__ at 0x7f94d796fc40>
Traceback (most recent call last):
  File "/databricks/spark/python/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
                                              ^^^^^^^^^^^^^^
AttributeError: 'LDA' object has no attribute '_java_obj'


### Save the Final Data

In [0]:
from pyspark.sql.functions import col

# Flatten the STRUCT columns and select only the 'values' field from 'title_tfidf_features' and 'description_tfidf_features'
final_df_transformed = final_df.select(
    "post_id",
    "title_polarity",
    "title_emotion",
    "description_polarity",
    "description_emotion",
    col("title_tfidf_features.values").alias("title_tfidf_values"),
    col("description_tfidf_features.values").alias("description_tfidf_values"),
    "features",
    "combined_features"
)

# Create a temporary view of the transformed DataFrame
final_df_transformed.createOrReplaceTempView("ml_reddit_posts_temp")

# Insert data into the table using Spark SQL
spark.sql("""
    INSERT INTO big_data_analytics_v.big_data_analytics_sesssion_v.ml_reddit_posts
    SELECT 
        post_id,
        title_polarity,
        title_emotion,
        description_polarity,
        description_emotion,
        title_tfidf_values,
        description_tfidf_values,
        features,
        combined_features
    FROM ml_reddit_posts_temp
""")

# Log the completion of the insert process
print("ML-related data inserted into the table successfully!")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-3056731227740120>, line 4[0m
[1;32m      1[0m [38;5;28;01mfrom[39;00m [38;5;21;01mpyspark[39;00m[38;5;21;01m.[39;00m[38;5;21;01msql[39;00m[38;5;21;01m.[39;00m[38;5;21;01mfunctions[39;00m [38;5;28;01mimport[39;00m col
[1;32m      3[0m [38;5;66;03m# Flatten the STRUCT columns and select only the 'values' field from 'title_tfidf_features' and 'description_tfidf_features'[39;00m
[0;32m----> 4[0m final_df_transformed [38;5;241m=[39m final_df[38;5;241m.[39mselect(
[1;32m      5[0m     [38;5;124m"[39m[38;5;124mpost_id[39m[38;5;124m"[39m,
[1;32m      6[0m     [38;5;124m"[39m[38;5;124mtitle_polarity[39m[38;5;124m"[39m,
[1;32m      7[0m     [38;5;124m"[39m[38;5;124mtitle_emotion[39m[38;5;124m"[39m,
[1;32m      8[0m     [38;5;124m"[39m[38;5;124mdescr