In [1]:
# Install necessary libraries if you haven't already (uncomment the line below if needed)
# !pip install pandas langdetect
import pandas as pd
import re
from langdetect import detect, DetectorFactory, LangDetectException
import os
import numpy as np

# Set seed for reproducibility in langdetect
DetectorFactory.seed = 42

# --- FILE PATHS ---
# Input file from Step 2
INPUT_FILE = '../data/raw_multilingual_data.csv'
# Output file for Step 3
OUTPUT_FILE = '../data/cleaned_multilingual_data.csv'

# Define the target languages (ISO 639-1 codes)
TARGET_LANGS = ['en', 'de', 'hi', 'ar']

print(f"Loading data from: {INPUT_FILE}")

# Load the raw data
try:
    df = pd.read_csv(INPUT_FILE, encoding='utf-8')
    print(f"Data loaded successfully. Total initial records: {len(df)}")
except FileNotFoundError:
    print(f"❌ Error: Input file not found at {INPUT_FILE}. Please ensure Step 2 ran correctly.")
    df = pd.DataFrame() # Create an empty DataFrame to prevent errors

Loading data from: ../data/raw_multilingual_data.csv
Data loaded successfully. Total initial records: 2995


In [2]:
def clean_text(text):
    """
    Cleans text by removing URLs, mentions, specific symbols, and extra whitespace.
    """
    if not isinstance(text, str):
        return ""
    
    # Remove URLs (http(s)://...)
    text = re.sub(r'http\S+|www.\S+', '', text)
    # Remove user mentions (@user)
    text = re.sub(r'@\w+', '', text)
    # Remove specific Reddit remnants (e.g., selftext placeholder)
    text = re.sub(r'\[removed\]|\[deleted\]', '', text, flags=re.IGNORECASE)
    # Remove emojis (basic handling)
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # geometric shapes
        "\U0001F800-\U0001F8FF"  # supplemental arrows-C
        "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        "\U0001FA00-\U0001FA6F"  # chess symbols
        "\U0001FA70-\U0001FAFF"  # supplemental symbols and pictographs
        "\U00002702-\U000027B0"  # Dingbats
        "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    # Replace non-alphanumeric characters with spaces, but keep full stops/periods and some punctuation for context
    text = re.sub(r'[^\w\s.?!-]', ' ', text)
    # Collapse multiple spaces into a single space
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Filter out posts that are too short after cleaning
    if len(text) < 20: 
        return "" 
        
    return text

def safe_lang_detect(text):
    """
    Attempts to detect the language, returns 'unknown' on failure or if not in target list.
    """
    # Skip detection for empty strings
    if not text:
        return np.nan
        
    try:
        # Detect the language
        lang = detect(text)
        
        # Check if the detected language is one of our targets
        if lang in TARGET_LANGS:
            return lang
        else:
            return 'other'
    except LangDetectException:
        # Handle cases where the text is too short or too ambiguous for detection
        return 'undetermined'

In [3]:
if not df.empty:
    print("\n--- Applying Cleaning and Language Detection ---")
    
    # 1. Apply Text Cleaning
    df['cleaned_text'] = df['text'].apply(clean_text)
    
    # Remove rows where cleaning resulted in an empty string
    df = df[df['cleaned_text'] != ""].reset_index(drop=True)
    print(f"Records after initial cleaning (minimum length): {len(df)}")
    
    # 2. Apply Language Detection
    df['detected_language'] = df['cleaned_text'].apply(safe_lang_detect)
    
    # 3. Filter Data
    # Keep only records where the detected language is one of the target languages
    filtered_df = df[df['detected_language'].isin(TARGET_LANGS)].copy()
    
    # 4. Final Cleanup: Select and rename final columns
    final_columns = ['cleaned_text', 'detected_language', 'raw_timestamp', 'source_type', 'source_name']
    filtered_df = filtered_df[final_columns]
    
    # Rename columns for clarity in the next step
    filtered_df.rename(columns={'cleaned_text': 'text', 'detected_language': 'language'}, inplace=True)
    
    print(f"\n--- Filtering Summary ---")
    print(f"Total records remaining after filtering: {len(filtered_df)}")
    
    # Display language counts
    print("\nFinal Language Distribution:")
    print(filtered_df['language'].value_counts())
    
    # 5. Save the final preprocessed dataset
    filtered_df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')
    print("\n" + "="*50)
    print("           ✅ Preprocessing Complete! ✅")
    print("="*50)
    print(f"Cleaned data saved to: {os.path.abspath(OUTPUT_FILE)}")
    print("\nReady for Step 4: Model Implementation (Sentiment and Emotion Analysis).")
else:
    print("\nCannot proceed with Step 3 as the initial DataFrame is empty.")


--- Applying Cleaning and Language Detection ---
Records after initial cleaning (minimum length): 2837

--- Filtering Summary ---
Total records remaining after filtering: 2792

Final Language Distribution:
language
en    1704
de     701
ar     377
hi      10
Name: count, dtype: int64

           ✅ Preprocessing Complete! ✅
Cleaned data saved to: c:\Users\Sulav\OneDrive\Desktop\MultiLingualSentimentAnalysis\data\cleaned_multilingual_data.csv

Ready for Step 4: Model Implementation (Sentiment and Emotion Analysis).
