In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [0]:
# Processing mode flag: stays False (single-file mode) unless the notebook
# widget is available and set to "true".
dataset_is_batched = False

try:
    dbutils.widgets.dropdown("dataset_is_batched", "false", ["true", "false"], "Batch Processing")
    selected_mode = dbutils.widgets.get("dataset_is_batched")
    dataset_is_batched = selected_mode.lower() == "true"
except Exception:
    # Best-effort: outside Databricks (no `dbutils`) the default above is kept.
    pass

# Echo the resolved flag as the cell output.
dataset_is_batched

False

### Querying the Gold Table Source

In [0]:
# Pick the parquet location to preview: the first batch folder in batched
# mode, otherwise the single-file ML output.
preview_path = (
    "/Volumes/sentimental_analysis/raw/ml_output_batch/batch_0"
    if dataset_is_batched
    else "/Volumes/sentimental_analysis/raw/ml_output_single"
)

# Same query text for both modes; only the path differs.
preview_df = spark.sql(f"""
        select * from parquet.`{preview_path}`
        limit 5
    """)
preview_df.display()

row_id,cleaned_text,created_date,sentiment_label,emotion_label
2,Republican take evening leader week season event program fish Mrs increase common. Such large parent candidate.,2024-08-01,positive,neutral
3,Guess job miss more eat example can first war base economy approach require.,2024-08-01,positive,neutral
4,Security audit went smoothly.,2024-08-01,negative,neutral
8,Born there way design public represent grow too sort. Former lot line number discussion.,2024-08-01,negative,neutral
11,Now however ready she thousand rule any activity.,2024-08-01,negative,neutral
12,Zero breaches this quarter! Become necessary two deal.,2024-08-01,negative,neutral
13,So item father discussion focus middle until manager event eat garden item participant.,2024-08-01,positive,neutral
14,Big amount drive fast nation offer. Name car area morning tax window.,2024-08-01,negative,neutral
15,Encryption standards are top-notch. Because group sure even establish.,2024-08-01,negative,neutral
16,Reviewing email_system logs. Send explain economic mouth coach record speak.,2024-08-01,negative,neutral


### Creating the Gold Table "social_media_gold"

In [0]:
# Locations of the ML output parquet files (single-file and batched layouts)
# and the fully qualified name of the gold table written by this cell.
SINGLE_SOURCE_PATH = "/Volumes/sentimental_analysis/raw/ml_output_single"
BATCH_SOURCE_PATH  = "/Volumes/sentimental_analysis/raw/ml_output_batch/"

TARGET_TABLE = "sentimental_analysis.gold.social_media_gold"

# The silver table is the left side of the join in both modes.
silver_df = spark.read.table('sentimental_analysis.silver.social_media_silver')

# Load the ML output. Only the reader differs between the two modes, so the
# result is always bound to `ml_df`. (Previously the batched branch assigned
# the read to `df` and then joined against an undefined/stale `ml_df`.)
if dataset_is_batched:
    # Pick up every parquet file below the batch root, whatever its sub-folder.
    ml_df = (
        spark.read.format("parquet")
        .option("recursiveFileLookup", "true")
        .load(BATCH_SOURCE_PATH)
    )
else:
    ml_df = spark.read.format("parquet").load(SINGLE_SOURCE_PATH)

# Left join keeps every silver row; the label columns are null where the ML
# output has no matching row_id.
df = silver_df.join(
    ml_df.select('row_id', 'sentiment_label', 'emotion_label'),
    on="row_id",
    how="left",
)

# Stamp the rows with the time of this load.
df = df.withColumn("ingest_time", current_timestamp())

# Explicit projection fixes the column set and order of the gold table.
gold_df = df.select(
    'row_id',
    'id',
    'text',
    'cleaned_text',
    'hashtags_array',
    'mentions_array',
    'has_hashtags',
    'has_mentions',
    'word_count',
    'created_at',
    'created_date',
    'username',
    'user_id',
    'user_verified',
    'user_location',
    'language',
    'retweet_count',
    'like_count',
    'reply_count',
    'quote_count',
    'impression_count',
    'user_followers_count',
    'user_following_count',
    'urls',
    'media_urls',
    'source',
    'is_retweet',
    'is_reply',
    'in_reply_to_user_id',
    'conversation_id',
    'attack_type',
    'delivery_method',
    'context_target',
    'sentiment_label',
    'emotion_label',
    'ingest_time'
    )

# Full overwrite of the gold table on every run.
gold_df.orderBy(col("row_id")).write.format("delta").mode("overwrite").option("delta.enableChangeDataFeed", "true").saveAsTable(TARGET_TABLE)

### Querying the Gold Table

In [0]:
%sql
-- Preview a handful of rows from the gold table.
SELECT *
FROM sentimental_analysis.gold.social_media_gold
LIMIT 5

row_id,id,text,cleaned_text,hashtags_array,mentions_array,has_hashtags,has_mentions,word_count,created_at,created_date,username,user_id,user_verified,user_location,language,retweet_count,like_count,reply_count,quote_count,impression_count,user_followers_count,user_following_count,urls,media_urls,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,attack_type,delivery_method,context_target,sentiment_label,emotion_label,ingest_time
266069,3607128149269399014,Our SOC team responded fast. Whatever current picture appear wrong break education. #DDoS @osmith,Our SOC team responded fast. Whatever current picture appear wrong break education.,List(DDoS),List(osmith),True,True,12,2024-12-06T19:29:52.000Z,2024-12-06,@jonesjeffrey,147574081,True,New Kenneth,en,4,10,2,0,55,1215,1027,unknown,unknown,Twitter for iPhone,False,False,,1813278695336276183,Ransomware,malware,server,negative,neutral,2025-12-29T09:34:45.425Z
266070,9375998079231693596,Malware vulnerability exploited. Wish if song let customer pass out. #Phishing #DDoS @ffreeman @vsanchez,Malware vulnerability exploited. Wish if song let customer pass out.,"List(Phishing, DDoS)","List(ffreeman, vsanchez)",True,True,10,2024-12-06T19:30:08.000Z,2024-12-06,@youngstephanie,103105215,False,unknown,en,2,8,1,1,47,16508,715,unknown,unknown,Android,False,False,,4859603086910919985,Malware,malware,firewall,positive,neutral,2025-12-29T09:34:45.425Z
266071,5739050200945625826,Add something check manager value win remain customer couple their success full call. Than direction probably data. #SOC #CVE @ijones,Add something check manager value win remain customer couple their success full call. Than direction probably data.,"List(SOC, CVE)",List(ijones),True,True,17,2024-12-06T19:31:10.000Z,2024-12-06,@jstout,244950274,False,unknown,en,4,13,1,4,49,8719,618,unknown,unknown,Twitter Web App,False,False,,2644086342591632860,unknown,unknown,unknown,negative,neutral,2025-12-29T09:34:45.425Z
266072,7124659950688309754,City analysis herself resource set knowledge south. #SOC #DDoS @mphillips,City analysis herself resource set knowledge south.,"List(SOC, DDoS)",List(mphillips),True,True,7,2024-12-06T19:31:39.000Z,2024-12-06,@nicholasgonzalez,120621096,False,unknown,en,4,8,1,0,58,19,853,https://example.com/news/96,unknown,Twitter Web App,False,False,,2367602161335097049,unknown,unknown,unknown,negative,neutral,2025-12-29T09:34:45.425Z
266073,9570317851423868765,Patch for Social Engineering vulnerability released. Own within yet none large when executive. #ZeroDay,Patch for Social Engineering vulnerability released. Own within yet none large when executive.,List(ZeroDay),List(),True,False,14,2024-12-06T19:31:42.000Z,2024-12-06,@myersleon,571374574,False,North Martin,en,6,4,0,0,47,1288,1397,unknown,unknown,TweetDeck,False,False,,1222939716131948101,Social Engineering,social_engineering,web_portal,negative,neutral,2025-12-29T09:34:45.425Z


### Gold Table Count Check

In [0]:
%sql
-- Row-count check on the gold table.
SELECT count(*)
FROM sentimental_analysis.gold.social_media_gold

count(*)
503456


### Creating a Gold Table : ML Core

In [0]:
# Read the gold table back and keep only the columns stored in the ML core table.
df = spark.read.table('sentimental_analysis.gold.social_media_gold')
ml_core_df = df.select(
    'row_id',
    'cleaned_text',
    'created_date',
    'sentiment_label',
    'emotion_label',
)

# Full overwrite of the ML core table on every run.
(
    ml_core_df
    .orderBy('row_id')
    .write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable("sentimental_analysis.gold.gold_ml_core")
)

In [0]:
%sql
-- Preview a handful of rows from the ML core table.
SELECT *
FROM sentimental_analysis.gold.gold_ml_core
LIMIT 5

row_id,cleaned_text,created_date,sentiment_label,emotion_label
1,Patch for SQL Injection vulnerability released.,2024-08-01,positive,neutral
2,Republican take evening leader week season event program fish Mrs increase common. Such large parent candidate.,2024-08-01,positive,neutral
3,Guess job miss more eat example can first war base economy approach require.,2024-08-01,positive,neutral
4,Security audit went smoothly.,2024-08-01,negative,neutral
5,New ddos campaign detected.,2024-08-01,negative,neutral


### Creating Gold Aggregate Table : Daily Sentiment Trend

In [0]:
%sql
-- Rebuild the per-day, per-sentiment row counts from the ML core table.
CREATE OR REPLACE TABLE sentimental_analysis.gold.daily_sentiment_trend
AS
SELECT created_date, sentiment_label, COUNT(*) AS tweet_count
FROM sentimental_analysis.gold.gold_ml_core
GROUP BY
  created_date,
  sentiment_label;

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Set the change data feed table property on the sentiment trend table.
ALTER TABLE sentimental_analysis.gold.daily_sentiment_trend
SET TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')

### Querying Gold Aggregate Table : daily_sentiment_trend

In [0]:
%sql
-- First few rows of the sentiment trend, ordered by date and then label.
SELECT *
FROM sentimental_analysis.gold.daily_sentiment_trend
ORDER BY created_date, sentiment_label
LIMIT 5

created_date,sentiment_label,tweet_count
2024-08-01,negative,1080
2024-08-01,positive,984
2024-08-02,negative,1104
2024-08-02,positive,927
2024-08-03,negative,1126


### Creating Gold Aggregate Table : Daily Emotion Trend

In [0]:
%sql
-- Rebuild the per-day, per-emotion row counts from the ML core table.
CREATE OR REPLACE TABLE sentimental_analysis.gold.daily_emotion_trend
AS
SELECT created_date, emotion_label, COUNT(*) AS tweet_count
FROM sentimental_analysis.gold.gold_ml_core
GROUP BY
  created_date,
  emotion_label

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- Set the change data feed table property on the emotion trend table.
ALTER TABLE sentimental_analysis.gold.daily_emotion_trend
SET TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true')

### Querying Gold Aggregate Table : daily_emotion_trend

In [0]:
%sql
-- First few rows of the emotion trend, ordered by date and then label.
SELECT *
FROM sentimental_analysis.gold.daily_emotion_trend
ORDER BY
  created_date,
  emotion_label
LIMIT 5

created_date,emotion_label,tweet_count
2024-08-01,anger,81
2024-08-01,disgust,20
2024-08-01,fear,245
2024-08-01,joy,92
2024-08-01,neutral,1537
