In [1]:
import re
import pandas as pd

def clean_text(text):
    """Normalise a raw post into lowercase text made of letters and basic punctuation.

    Steps, in order: lowercase; drop URLs; drop @mentions; strip the '#'
    symbol (the tag word itself is kept); replace every character that is
    not an ASCII letter, a code point in U+0900-U+097F, '!', '?', an
    apostrophe or whitespace with a space; collapse runs of whitespace.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            Missing scalars (``None``, ``NaN``, ``pd.NA``) are treated as
            empty text instead of being stringified to "none"/"nan".

    Returns:
        The cleaned string, possibly empty.
    """
    # Empty DataFrame cells arrive as NaN/None; str() would turn them into the
    # literal words "nan"/"none", which survive the letter filter below.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)       # drop URLs
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)       # drop @mentions
    text = re.sub(r"#", "", text)                    # strip the '#' sign, keep the tag word
    # Anything outside ASCII letters, U+0900-U+097F, ! ? ' and whitespace
    # becomes a space (digits and other punctuation are removed).
    text = re.sub(r"[^a-zA-Z\u0900-\u097F!?'\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()         # collapse whitespace
    return text


In [3]:
# Load the scraped Reddit posts, add a cleaned-text column, and write the result to disk.
df = pd.read_csv(
    r"C:\MAIN\Projects\Sarcasm Detection\Data - web scraping for validation\data\reddit_code_mixed_posts.csv"
)

# Run every value of `combined_text` through clean_text, element by element.
df["clean_text"] = df["combined_text"].map(clean_text)

# Written to the current working directory, without the index column.
df.to_csv("reddit_code_mixed_cleaned.csv", encoding="utf-8", index=False)

print("✅ Cleaned file saved as reddit_code_mixed_cleaned.csv")
# Quick visual check of the first rows.
print(df[["subreddit", "clean_text"]].head())

✅ Cleaned file saved as reddit_code_mixed_cleaned.csv
  subreddit                                         clean_text
0     india  ask india thread welcome to r india's ask indi...
1     india  mental emotional health support thread welcome...
2     india  just got scammed for like the title suggests i...
3     india  i found a mk bag on goa airport you will be su...
4     india  'never procured a drop of milk' how uttarakhan...


In [5]:
import joblib

# Restore the previously saved classifier from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
clf = joblib.load("best_baseline_model.pkl")

# Predict on the cleaned text and store the result as a new column.
# NOTE(review): presumably clf is a pipeline that vectorises raw strings itself — confirm.
df["predicted_label"] = clf.predict(df["clean_text"])

# Encoding is spelled out so this export matches the cleaned-file export.
df.to_csv("reddit_code_mixed_labeled.csv", encoding="utf-8", index=False)
print("✅ Sarcasm predictions saved to reddit_code_mixed_labeled.csv")

✅ Sarcasm predictions saved to reddit_code_mixed_labeled.csv
