In [1]:
!pip install praw pandas


Collecting praw
  Downloading praw-7.8.1-py3-none-any.whl.metadata (9.4 kB)
Collecting prawcore<3,>=2.4 (from praw)
  Downloading prawcore-2.4.0-py3-none-any.whl.metadata (5.0 kB)
Collecting update_checker>=0.18 (from praw)
  Downloading update_checker-0.18.0-py3-none-any.whl.metadata (2.3 kB)
Downloading praw-7.8.1-py3-none-any.whl (189 kB)
   ---------------------------------------- 0.0/189.3 kB ? eta -:--:--
   ---------------------------------------- 189.3/189.3 kB 5.8 MB/s eta 0:00:00
Downloading prawcore-2.4.0-py3-none-any.whl (17 kB)
Downloading update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Installing collected packages: update_checker, prawcore, praw
Successfully installed praw-7.8.1 prawcore-2.4.0 update_checker-0.18.0


In [2]:
"""
Reddit Code-Mixed Data Fetcher (SarcasmLens)
--------------------------------------------
Fetches posts from Indian subreddits that commonly use English + Hindi text.
Saves all fetched content into a CSV file and displays a sample DataFrame per subreddit.

Requirements:
    pip install praw pandas
"""

import praw
import pandas as pd
import os

# ======================================================
# 1. Reddit API credentials (from user's Reddit developer app)
# ======================================================
# Secrets are read from the environment so they are never stored in the
# notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the
# kernel; REDDIT_USER_AGENT is optional.
# NOTE(review): any client secret that was ever committed with this notebook
# should be regenerated in the Reddit app settings.
def _require_env(name):
    """Return the non-empty value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or blank.
    """
    value = os.environ.get(name, "").strip()
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set. "
            "Export your Reddit app credentials before running this notebook."
        )
    return value


reddit = praw.Reddit(
    client_id=_require_env("REDDIT_CLIENT_ID"),          # from Reddit app
    client_secret=_require_env("REDDIT_CLIENT_SECRET"),  # from Reddit app
    user_agent=os.environ.get(
        "REDDIT_USER_AGENT", "SarcasmLensApp by u/Naive_Violinist_7239"
    ),
)

In [3]:
# ======================================================
# 2. Subreddits and limits
# ======================================================
# Cap on how many posts are requested from each subreddit.
limit_per_sub = 100

# Communities to pull from, written without the "r/" prefix.
subreddits = [
    "india",
    "IndianDankMemes",
    "desiHumor",
    "BollywoodMemes",
]

In [4]:
# ======================================================
# 3. Fetches posts
# ======================================================
def fetch_subreddit_posts(reddit_client, subreddit_name, limit):
    """Collect the non-empty "hot" submissions of one subreddit.

    Args:
        reddit_client: Object exposing ``subreddit(name).hot(limit=...)``,
            e.g. the ``praw.Reddit`` instance created above.
        subreddit_name: Subreddit name without the ``r/`` prefix.
        limit: Maximum number of submissions to request.

    Returns:
        A list of dicts with the keys ``subreddit``, ``title``, ``body`` and
        ``combined_text``. Submissions whose title and body are both blank
        are skipped.
    """
    records = []
    for submission in reddit_client.subreddit(subreddit_name).hot(limit=limit):
        # Either field may be empty/None (e.g. link or image posts have no body).
        title = submission.title or ""
        body = submission.selftext or ""
        text = (title + " " + body).strip()
        if text:
            records.append({
                "subreddit": subreddit_name,
                "title": title,
                "body": body,
                "combined_text": text
            })
    return records


all_posts = []

for sub in subreddits:
    print(f"\nFetching from r/{sub} ...")
    try:
        posts = fetch_subreddit_posts(reddit, sub, limit_per_sub)
    except Exception as exc:
        # PRAW sends its requests lazily while the listing is iterated, so API
        # failures (rejected credentials, unknown or private subreddit) surface
        # here. Report the failure and move on so one bad subreddit does not
        # discard the posts already collected from the others.
        print(f"Failed to fetch r/{sub}: {type(exc).__name__}: {exc}")
        continue

    if not posts:
        print(f"No posts fetched from r/{sub}.")
        continue

    # Displays a sample (first few rows)
    sample_df = pd.DataFrame(posts)
    print(f"\nSample data from r/{sub}:")
    print(sample_df.head(3)[["title", "combined_text"]])
    print(f"Total posts fetched: {len(sample_df)}")

    # Append to global list
    all_posts.extend(posts)


Fetching from r/india ...

Sample data from r/india:
                                      title  \
0                          Ask India Thread   
1  Mental & Emotional Health Support Thread   
2               Just got scammed for 45,000   

                                       combined_text  
0  Ask India Thread Welcome to r/India's Ask Indi...  
1  Mental & Emotional Health Support Thread Welco...  
2  Just got scammed for 45,000 Like the title sug...  
Total posts fetched: 100

Fetching from r/IndianDankMemes ...

Sample data from r/IndianDankMemes:
                            title                   combined_text
0             Stop this pollution             Stop this pollution
1                 Bachao bachao 😰                 Bachao bachao 😰
2  Manipulating 15yos hell naahhh  Manipulating 15yos hell naahhh
Total posts fetched: 100

Fetching from r/desiHumor ...

Sample data from r/desiHumor:
                                         title  \
0  Happy Cakeday, r/Desihumor! 

In [5]:
# ======================================================
# 4. Combines all subreddits and saves
# ======================================================
# Builds a single DataFrame from every fetched record and writes it to
# data/reddit_code_mixed_posts.csv. Nothing is written when no posts exist.
if not all_posts:
    print("\nNo posts fetched. Please check credentials or subreddit names.")
else:
    df_all = pd.DataFrame(all_posts)
    os.makedirs("data", exist_ok=True)  # create the output folder if missing
    output_path = os.path.join("data", "reddit_code_mixed_posts.csv")
    df_all.to_csv(output_path, index=False, encoding="utf-8")
    print(f"\n✅ All posts saved to: {output_path}")
    print(f"Total combined posts: {len(df_all)}")


✅ All posts saved to: data\reddit_code_mixed_posts.csv
Total combined posts: 384


In [6]:
# ======================================================
# 5. Saves raw text file
# ======================================================
text_output = os.path.join("data", "reddit_code_mixed_posts.txt")

if not all_posts:
    # df_all is only created when at least one post was fetched, so there is
    # nothing to export in this case.
    print("No posts available; skipping text-only export.")
else:
    os.makedirs("data", exist_ok=True)
    # Write plain text, one post per line. Series.to_csv would apply CSV
    # quoting (doubled quotes, quoted multi-line fields), which is not raw
    # text. Internal newlines/tabs are collapsed to single spaces so that each
    # line of the file corresponds to exactly one post.
    with open(text_output, "w", encoding="utf-8") as text_file:
        for post_text in df_all["combined_text"]:
            text_file.write(" ".join(str(post_text).split()) + "\n")
    print(f"✅ Text-only content saved to: {text_output}")


✅ Text-only content saved to: data\reddit_code_mixed_posts.txt
