In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### Schema For Bronze

In [0]:
bronze_schema = """
id string,
text string,
cleaned_text string,
created_at timestamp,
username string,
user_id string,
language string,
retweet_count integer,
like_count integer,
reply_count integer,
quote_count integer,
impression_count integer,
hashtags string,
mentions string,
urls string,
media_urls string,
source string,
is_retweet boolean,
is_reply boolean,
in_reply_to_user_id string,
conversation_id string,
user_followers_count integer,
user_following_count integer,
user_verified boolean,
user_location string,
possibly_sensitive boolean,
sentiment string,
sentiment_score double,
subjectivity double,
emotion string,
attack_type string,
delivery_method string,
context_target string
"""

### Reading the Raw Data and Displaying a Sample

In [0]:
# Location of the raw social-media files that are loaded below.
raw_volume_path = "/Volumes/sentimental_analysis/raw/social_media_volume"

# Configure a CSV reader that applies the explicit bronze schema, with the
# header option enabled and schema inference switched off.
csv_reader = (
    spark.read
    .format("csv")
    .schema(bronze_schema)
    .options(header="true", inferSchema="false")
)
df = csv_reader.load(raw_volume_path)

# Show a five-row sample of the loaded data.
display(df.limit(5))

id,text,cleaned_text,created_at,username,user_id,language,retweet_count,like_count,reply_count,quote_count,impression_count,hashtags,mentions,urls,media_urls,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,user_followers_count,user_following_count,user_verified,user_location,possibly_sensitive,sentiment,sentiment_score,subjectivity,emotion,attack_type,delivery_method,context_target
1945351111358374057,Agent every development say quality throughout beautiful. #DataBreach,Agent every development say quality throughout beautiful.,2025-01-11T18:16:14.000Z,@fjohnson,958070516,en,4,5,2,0,43,#DataBreach,,,https://img.example.com/1.jpg,Twitter for iPhone,False,False,,6994125922663206833,59,1634,False,East William,False,positive,0.85,1.0,surprise,,unknown,
3562910770574305398,Night respond red information last everything. #CVE @blakeerik,Night respond red information last everything.,2024-09-07T15:01:29.000Z,@jpeterson,819895579,en,2,10,2,0,44,#CVE,@blakeerik,https://example.com/news/13,,TweetDeck,False,False,,2151318126898855557,12401,1005,False,Port Matthew,False,neutral,0.0,0.033,neutral,,unknown,
3669132812676222602,Here grow gas enough analysis least by. #InfoSec #CyberSecurity #MFA,Here grow gas enough analysis least by.,2025-03-27T10:09:59.000Z,@smiller,918490409,en,2,13,0,0,46,"#InfoSec, #CyberSecurity, #MFA",,https://example.com/news/36,,TweetDeck,False,False,,9639113705720282878,9514,1995,False,Barbaraland,False,negative,-0.15,0.45,sadness,,unknown,
7314451947335758630,Product significant world talk term herself. Player half have decide environment view possible. #MFA #CVE @amandasanchez @ogray,Product significant world talk term herself. Player half have decide environment view possible.,2024-10-18T11:26:02.000Z,@gabriellecameron,825070419,en,3,4,0,0,55,"#MFA, #CVE","@amandasanchez, @ogray",https://example.com/news/5,,Android,False,True,,7621590421612634350,1329,428,False,East Lydiamouth,False,neutral,0.069,0.681,neutral,,unknown,
4951604536009256313,Environment decision wall then fire pretty how trip learn enter east. Much section investment on gun young catch. #SOC #SOC #Phishing @ddavis @hernandezernest,Environment decision wall then fire pretty how trip learn enter east. Much section investment on gun young catch.,2025-03-06T06:33:19.000Z,@ycarlson,428953029,en,1,6,3,0,55,"#SOC, #SOC, #Phishing","@ddavis, @hernandezernest",,https://img.example.com/4.jpg,Twitter Web App,False,False,171879360.0,7276331413508044428,28089,920,False,Carlsonmouth,False,positive,0.183,0.533,joy,,unknown,


### Dropping the Unnecessary Columns

In [0]:
# Columns that are removed from the DataFrame in this step.
columns_to_drop = [
    'cleaned_text',
    'hashtags',
    'mentions',
    'possibly_sensitive',
    'sentiment',
    'sentiment_score',
    'subjectivity',
    'emotion',
]
df = df.drop(*columns_to_drop)

### Creating a Bronze Table

In [0]:
# Add an ingest_time column populated from current_timestamp().
df = df.withColumn('ingest_time', current_timestamp())

# Write the DataFrame as a Delta table in overwrite mode, passing the
# delta.enableChangeDataFeed option as "true" on the write.
bronze_writer = (
    df.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
)
bronze_writer.saveAsTable('sentimental_analysis.bronze.social_media_bronze')

### Querying the Bronze Table

In [0]:
%sql
-- Preview five rows of the bronze table.
SELECT *
FROM sentimental_analysis.bronze.social_media_bronze
LIMIT 5

id,text,created_at,username,user_id,language,retweet_count,like_count,reply_count,quote_count,impression_count,urls,media_urls,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,user_followers_count,user_following_count,user_verified,user_location,attack_type,delivery_method,context_target,ingest_time
6756104069413467992,Phishing scam stole customer data. Else dark require let own father itself. #Ransomware @taylortimothy,2025-01-25T07:22:25.000Z,@barajaslucas,26048909,fr,2,7,0,2,47,,,Android,False,False,,7969315299031362774,17585,1377,False,New Kevinport,Phishing,social_engineering,database,2025-12-26T18:16:01.467Z
4698209966954012889,Administration company require beautiful happen authority whom. #Phishing @christopherhudson @lisabrooks,2024-12-31T06:39:35.000Z,@obarnes,249928258,en,5,4,1,0,45,https://example.com/news/62,https://img.example.com/5.jpg,TweetDeck,False,True,,9211089253166534054,9534,920,False,South Garyville,,unknown,,2025-12-26T18:16:01.467Z
3409203360084917437,Indeed bank budget find parent listen head door operation deal professor. #DataBreach @robertbruce @erica16,2025-02-25T22:00:54.000Z,@courtneyfox,832428340,es,3,10,1,2,45,,https://img.example.com/4.jpg,Android,False,False,,8938789069609500589,11194,100,False,,,unknown,,2025-12-26T18:16:01.467Z
5509752392425025109,Rate lead on form while national arrive best travel. #ThreatIntel #MFA #DataBreach @knightanne @karathompson,2024-08-17T03:08:47.000Z,@elizabeth72,45483312,fr,1,5,1,1,44,,,TweetDeck,False,False,,2309839474879821148,27874,1024,False,,,unknown,,2025-12-26T18:16:01.467Z
1038570393285133048,Great server update! Specific have maintain yard sister ok guy analysis. #Ransomware @owilliams,2024-12-28T22:15:04.000Z,@wevans,305345870,en,4,10,2,1,49,,,Twitter for iPhone,False,False,,2423854976071670313,2691,671,False,Adamburgh,Brute Force,credential,server,2025-12-26T18:16:01.467Z
