In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark import StorageLevel

In [0]:
# Resolve the run mode from the "is_batched" notebook widget.
# The dropdown is declared with "false" as its default; if declaring or
# reading the widget raises, the flag falls back to False.
try:
    dbutils.widgets.dropdown("is_batched", "false", ["true", "false"], "Batch Processing")
    is_batched = dbutils.widgets.get("is_batched").lower() == "true"
except Exception:
    # Non-Databricks environments (no usable dbutils) end up here.
    is_batched = False

# Show the resolved flag as the cell output.
is_batched


False

### Querying the Silver Table

In [0]:
%sql
-- Preview up to five rows of the silver table to inspect its columns.
select * from sentimental_analysis.silver.social_media_silver
limit 5

row_id,id,text,cleaned_text,hashtags_array,mentions_array,has_hashtags,has_mentions,word_count,created_at,created_date,username,user_id,user_verified,user_location,language,retweet_count,like_count,reply_count,quote_count,impression_count,user_followers_count,user_following_count,urls,media_urls,source,is_retweet,is_reply,in_reply_to_user_id,conversation_id,attack_type,delivery_method,context_target,ingest_time
1,9191903636345700142,Patch for SQL Injection vulnerability released. #CVE #CVE,Patch for SQL Injection vulnerability released.,"List(CVE, CVE)",List(),True,False,7,2024-08-01T00:00:27.000Z,2024-08-01,@sbaldwin,857212553,False,unknown,en,1,14,1,1,33,2003,1626,unknown,unknown,Twitter Web App,False,False,,3692899713833546847,SQL Injection,exploit,corporate_network,2025-12-29T05:56:43.116Z
2,2768468865306412359,Republican take evening leader week season event program fish Mrs increase common. Such large parent candidate. #Firewall #ZeroDay @david39 @bakerbrent,Republican take evening leader week season event program fish Mrs increase common. Such large parent candidate.,"List(Firewall, ZeroDay)","List(david39, bakerbrent)",True,True,16,2024-08-01T00:01:32.000Z,2024-08-01,@karen16,456484389,True,Lake Mark,en,0,5,0,1,38,19990,1060,https://example.com/news/85,https://img.example.com/1.jpg,Android,False,False,,7546778863198935481,unknown,unknown,unknown,2025-12-29T05:56:43.116Z
3,6734007271730864259,Guess job miss more eat example can first war base economy approach require. #SOC @serickson,Guess job miss more eat example can first war base economy approach require.,List(SOC),List(serickson),True,True,13,2024-08-01T00:01:39.000Z,2024-08-01,@lisa16,605847713,False,South Angelica,en,1,10,0,4,47,1794,1869,unknown,unknown,TweetDeck,False,True,,9155366862291657394,unknown,unknown,unknown,2025-12-29T05:56:43.116Z
4,7825357171775709264,Security audit went smoothly. #InfoSec #CyberSecurity,Security audit went smoothly.,"List(InfoSec, CyberSecurity)",List(),True,False,5,2024-08-01T00:02:13.000Z,2024-08-01,@henry28,657894082,False,North Margarettown,en,3,9,0,1,59,22861,1967,unknown,unknown,TweetDeck,False,False,,6655308818574900253,Man-in-the-Middle,exploit,corporate_network,2025-12-29T05:56:43.116Z
5,1489034316026594924,New ddos campaign detected. #CyberSecurity #MFA #ThreatIntel,New ddos campaign detected.,"List(CyberSecurity, MFA, ThreatIntel)",List(),True,False,5,2024-08-01T00:02:45.000Z,2024-08-01,@brittany66,819843095,False,Lake Megan,en,2,9,0,0,55,835,106,unknown,unknown,Android,False,False,,8282303825448412450,DDoS,exploit,web_portal,2025-12-29T05:56:43.116Z


### Checking the Min and Max Dates of the Silver Table

In [0]:
%sql
-- Earliest and latest created_at values plus the total row count of the silver table.
select min(created_at) as start_date, max(created_at) as end_date, count(*) as total_rows  from sentimental_analysis.silver.social_media_silver

start_date,end_date,total_rows
2024-08-01T00:00:27.000Z,2025-03-30T23:59:56.000Z,503456


### Creating a DataFrame from the Silver Table

In [0]:
# Load the silver table as a Spark DataFrame; `df` is the input of the export cell.
df = spark.read.table("sentimental_analysis.silver.social_media_silver")

### Writing 100 batches or a single dataset, depending on the value of the `is_batched` parameter

In [0]:
# Number of batch directories (batch_0 .. batch_{NUM_PARTS - 1}) used in batched mode.
NUM_PARTS = 100

# Root location for the exported datasets, and the sub-directory that
# holds the per-batch outputs.
BASE_PATH = "/Volumes/sentimental_analysis/raw"
BATCH_BASE = f"{BASE_PATH}/ml_source_batch"

def batch_path(i):
    """Return the output directory for batch ``i`` under ``BATCH_BASE``.

    The result has the form ``<BATCH_BASE>/batch_<i>``.
    """
    batch_dir = f"batch_{i}"
    return f"{BATCH_BASE}/{batch_dir}"


def batch_completed(path):
    """Report whether ``path`` already contains a ``_SUCCESS`` marker.

    Args:
        path: Directory of a batch output (without a trailing slash).

    Returns:
        True if listing ``<path>/_SUCCESS`` succeeds, False if the listing
        raises an ordinary exception (for example because the marker or the
        directory does not exist).

    Note:
        Only ``Exception`` subclasses are treated as "not completed".
        ``KeyboardInterrupt`` and ``SystemExit`` propagate, so cancelling the
        notebook is not mistaken for a missing batch.
    """
    marker = f"{path}/_SUCCESS"
    try:
        dbutils.fs.ls(marker)
    except Exception:
        return False
    return True

In [0]:
# Export row_id, cleaned_text and created_date from the silver DataFrame,
# either as NUM_PARTS separate parquet batches (is_batched=True) or as one
# single-file parquet dataset (is_batched=False).
if is_batched:

    # NUM_PARTS is taken from the configuration cell rather than being
    # redefined here, so the batch count has a single source of truth.

    # Number the rows by row_id and deal them round-robin into chunks:
    # rn 1 -> chunk 0, rn 2 -> chunk 1, ..., rn NUM_PARTS + 1 -> chunk 0 again.
    # NOTE(review): a window with orderBy but no partitionBy is expected to
    # move all rows into one partition, and df_chunked is not cached, so the
    # numbering is presumably recomputed for every batch write below —
    # confirm this is acceptable for the size of the table.
    window = Window.orderBy("row_id")

    df_chunked = (
        df
        .select("row_id", "cleaned_text", "created_date")
        .withColumn("rn", row_number().over(window))
        .withColumn("chunk_id", (col("rn") - 1) % NUM_PARTS)
    )

    for i in range(NUM_PARTS):

        final_path = batch_path(i)

        # Resume support: a batch whose directory already has a _SUCCESS
        # marker is left untouched. The shared helper is used so the check
        # lives in one place and does not swallow interrupts.
        if batch_completed(final_path):
            print(f"batch_{i} already exists — skipping")
            continue

        # Write only this chunk's rows; the helper columns are dropped so
        # every batch has the same three-column schema as the single export.
        (
            df_chunked
            .filter(col("chunk_id") == i)
            .drop("rn", "chunk_id")
            .write
            .mode("overwrite")
            .parquet(final_path)
        )

        print(f"batch_{i} completed")

    print("ml_source_batch created successfully")

else:
    # Single-dataset mode: write straight to the final location, with no
    # temporary directory and no move step afterwards.

    FINAL_PATH = f"{BASE_PATH}/ml_source_single"

    (
        df
        .select("row_id", "cleaned_text", "created_date")
        .coalesce(1)   # optional — only if dataset is small
        .write
        .mode("overwrite")
        .parquet(FINAL_PATH)
    )

    print("ml_source_single created successfully")




batch_0 completed
batch_1 completed
batch_2 completed
batch_3 completed
batch_4 completed
batch_5 completed
batch_6 completed
batch_7 completed
batch_8 completed
batch_9 completed
batch_10 completed
batch_11 completed
batch_12 completed
batch_13 completed
batch_14 completed
batch_15 completed
batch_16 completed
batch_17 completed
batch_18 completed
batch_19 completed
batch_20 completed
batch_21 completed
batch_22 completed
batch_23 completed
batch_24 completed
batch_25 completed
batch_26 completed
batch_27 completed
batch_28 completed
batch_29 completed
batch_30 completed
batch_31 completed
batch_32 completed
batch_33 completed
batch_34 completed
batch_35 completed
batch_36 completed
batch_37 completed
batch_38 completed
batch_39 completed
batch_40 completed
batch_41 completed
batch_42 completed
batch_43 completed
batch_44 completed
batch_45 completed
batch_46 completed
batch_47 completed
batch_48 completed
batch_49 completed
batch_50 completed
batch_51 completed
batch_52 completed
bat

In [0]:
# Preview the first five rows of the single-file export. The path is fixed,
# so this reads ml_source_single regardless of the is_batched mode.
(
    spark.read
    .format('parquet')
    .load('/Volumes/sentimental_analysis/raw/ml_source_single')
    .limit(5)
    .display()
)

row_id,cleaned_text,created_date
1,Patch for SQL Injection vulnerability released.,2024-08-01
2,Republican take evening leader week season event program fish Mrs increase common. Such large parent candidate.,2024-08-01
3,Guess job miss more eat example can first war base economy approach require.,2024-08-01
4,Security audit went smoothly.,2024-08-01
5,New ddos campaign detected.,2024-08-01
