In [2]:
# --- 1. SETUP AND CONFIGURATION ---

# Install necessary libraries if you haven't already (uncomment the line below if needed)
# !pip install praw pandas

import praw
import pandas as pd
from datetime import datetime
import time
import random
import os

# --- PRAW CONFIGURATION (YOUR CREDENTIALS) ---
CLIENT_ID = "kDC0_tOIbEZUDh6LHJDEUA"
CLIENT_SECRET = "FA-jWQz9oFJG59z5POp4OWD6mb_6aQ"
USER_AGENT = "multilingual-sentiment-project"

# --- FILE & PATH CONFIGURATION ---
DATA_DIR = '../data/' 
OUTPUT_FILE = DATA_DIR + 'raw_multilingual_data.csv'
os.makedirs(DATA_DIR, exist_ok=True) 

# --- DATA COLLECTION PARAMETERS ---
COLLECTION_LIMIT = 500  # Max number of submissions to collect per search/subreddit


def define_user_topics():
    """
    Defines the central topic across all four languages using hardcoded values 
    for non-interactive execution.
    """
    
    print("\n--- Topic Definition ---")
    
    # Replace these hardcoded strings if you want to change the topic.
    # To use interactive input, uncomment the 'input()' lines and comment out the hardcoded lines.
    
    topic_en = "Quantum Computing"  # English topic
    # topic_en = input("Enter Topic (English): ")
    
    topic_de = "Quantencomputer"   # German topic
    # topic_de = input("Enter Topic (German): ")
    
    topic_hi = "‡§ï‡•ç‡§µ‡§æ‡§Ç‡§ü‡§Æ ‡§ï‡§Ç‡§™‡•ç‡§Ø‡•Ç‡§ü‡§ø‡§Ç‡§ó" # Hindi topic
    # topic_hi = input("Enter Topic (Hindi): ")
    
    topic_ar = "ÿßŸÑÿ≠Ÿàÿ≥ÿ®ÿ© ÿßŸÑŸÉŸÖŸàŸÖŸäÿ©"  # Arabic topic
    # topic_ar = input("Enter Topic (Arabic): ")
    
    if not all([topic_en, topic_de, topic_hi, topic_ar]):
        # This will only happen if using input() and the user enters nothing
        raise ValueError("All four topic inputs must be provided.")
    
    print(f"EN: {topic_en}\nDE: {topic_de}\nHI: {topic_hi}\nAR: {topic_ar}")
    
    # Base Subreddits (kept consistent as they are relevant general/regional subreddits)
    # The user's topic is applied via the 'queries' list.
    return {
        'en': {
            'subreddits': ['artificial', 'tech', 'science'],
            'queries': [topic_en]
        },
        'de': {
            'subreddits': ['de', 'wissenschaft', 'technik'],
            'queries': [topic_de]
        },
        'hi': {
            'subreddits': ['india', 'tech', 'scienceindia'],
            'queries': [topic_hi]
        },
        'ar': {
            'subreddits': ['arabs', 'egypt', 'saudiarabia'],
            'queries': [topic_ar]
        }
    }

# Execute topic definition
REDDIT_SOURCES = define_user_topics()


# --- 2. DATA COLLECTION FUNCTION ---

def collect_reddit_data(client_id, client_secret, user_agent, sources):
    """Authenticates PRAW and collects data using the user-defined sources."""
    print("Initializing Reddit API connection...")
    try:
        reddit = praw.Reddit(
            client_id=client_id,
            client_secret=client_secret,
            user_agent=user_agent,
        )
        reddit.read_only = True
        print("Connection successful. Starting data collection.")
    except Exception as e:
        print(f"‚ùå Error initializing PRAW: {e}. Check your credentials.")
        return pd.DataFrame()

    all_data = []
    
    for lang_code, data_sources in sources.items():
        print(f"\n--- Collecting {lang_code.upper()} Data ---")

        # --- 1. Collect from Subreddits (TOP posts) ---
        for sub_name in data_sources['subreddits']:
            try:
                subreddit = reddit.subreddit(sub_name)
                for submission in subreddit.top(time_filter="year", limit=COLLECTION_LIMIT):
                    full_text = f"{submission.title} {submission.selftext}"
                    
                    if full_text.strip():
                        all_data.append({
                            'text': full_text.strip(),
                            'language_guess': lang_code,
                            'source_type': 'Reddit_Subreddit',
                            'source_name': sub_name,
                            'raw_timestamp': submission.created_utc
                        })
                print(f"‚úÖ Collected ~{COLLECTION_LIMIT} posts from r/{sub_name}.")
            except Exception as e:
                print(f"‚ö†Ô∏è Could not fetch r/{sub_name}: {e}")
                time.sleep(3) 

        # --- 2. Collect from General Reddit Search (using user-defined topic) ---
        for query in data_sources['queries']:
            try:
                for submission in reddit.subreddit('all').search(
                    query, sort='relevance', limit=COLLECTION_LIMIT
                ):
                    full_text = f"{submission.title} {submission.selftext}"

                    if full_text.strip():
                        all_data.append({
                            'text': full_text.strip(),
                            'language_guess': lang_code,
                            'source_type': 'Reddit_Search',
                            'source_name': f"Search: {query}",
                            'raw_timestamp': submission.created_utc
                        })
                print(f"‚úÖ Collected ~{COLLECTION_LIMIT} posts via search: '{query[:20]}...'.")
            except Exception as e:
                print(f"‚ö†Ô∏è Could not execute search '{query}': {e}")
                time.sleep(3)
                
        # Pause to respect API rate limits
        time.sleep(random.randint(5, 10))

    return pd.DataFrame(all_data)


# --- 3. EXECUTION AND FINAL SAVE ---

# Execute the combined collection for EN, DE, HI, and AR
final_df = collect_reddit_data(CLIENT_ID, CLIENT_SECRET, USER_AGENT, REDDIT_SOURCES)

# Save the final data to the specified 'data' folder
final_df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')

print("\n" + "="*50)
print("             üöÄ COLLECTION COMPLETE üöÄ")
print("="*50)
print(f"Total Collected Records: {len(final_df)}")
print(f"Data saved to: {os.path.abspath(OUTPUT_FILE)}")
print("\nüî• Ready for Step 3: Data Preprocessing (Cleaning and Language Detection)")


--- Topic Definition ---
EN: Quantum Computing
DE: Quantencomputer
HI: ‡§ï‡•ç‡§µ‡§æ‡§Ç‡§ü‡§Æ ‡§ï‡§Ç‡§™‡•ç‡§Ø‡•Ç‡§ü‡§ø‡§Ç‡§ó
AR: ÿßŸÑÿ≠Ÿàÿ≥ÿ®ÿ© ÿßŸÑŸÉŸÖŸàŸÖŸäÿ©
Initializing Reddit API connection...
Connection successful. Starting data collection.

--- Collecting EN Data ---
‚úÖ Collected ~500 posts from r/artificial.
‚úÖ Collected ~500 posts from r/tech.
‚úÖ Collected ~500 posts from r/science.
‚úÖ Collected ~500 posts via search: 'Quantum Computing...'.

--- Collecting DE Data ---
‚úÖ Collected ~500 posts from r/de.
‚úÖ Collected ~500 posts from r/wissenschaft.
‚ö†Ô∏è Could not fetch r/technik: received 403 HTTP response
‚úÖ Collected ~500 posts via search: 'Quantencomputer...'.

--- Collecting HI Data ---
‚úÖ Collected ~500 posts from r/india.
‚úÖ Collected ~500 posts from r/tech.
‚úÖ Collected ~500 posts from r/scienceindia.
‚úÖ Collected ~500 posts via search: '‡§ï‡•ç‡§µ‡§æ‡§Ç‡§ü‡§Æ ‡§ï‡§Ç‡§™‡•ç‡§Ø‡•Ç‡§ü‡§ø‡§Ç‡§ó...'.

--- Collecting AR Data ---
‚úÖ Collected ~500 posts from r/ara