In [2]:
# phase2_preprocessing.py
import os
import re
import pandas as pd
import emoji
from langdetect import detect, DetectorFactory

# Deterministic language detection
DetectorFactory.seed = 0

# ==========================
# Utility Functions
# ==========================
URL_RE = re.compile(r'https?://\S+|www\.\S+')
MENTION_RE = re.compile(r'@\w+')
HASHTAG_RE = re.compile(r'#(\w+)')
MULTI_SPACE = re.compile(r'\s+')

def clean_text(text: str) -> str:
    """Clean tweets: remove URLs, mentions, hashtags, emojis, quotes, newlines."""
    if not isinstance(text, str):
        return ""
    text = URL_RE.sub('', text)
    text = MENTION_RE.sub('', text)
    text = HASHTAG_RE.sub(r'\1', text)   # keep hashtag word
    text = text.replace('\n', ' ').replace('\r', ' ')
    text = text.replace('"', "'")        # replace double quotes inside text
    text = emoji.replace_emoji(text, replace='')
    text = MULTI_SPACE.sub(' ', text)
    return text.strip()

def is_english(text: str) -> bool:
    """Check if text is English."""
    try:
        return detect(text) == 'en'
    except:
        return False

# Known traffic-related places
PLACES = ["Vellore", "Katpadi", "CMC", "Arcot", "NH48", "Bagayam", "Gandhinagar", "Bengaluru"]

def infer_location(text: str, user_loc: str) -> str:
    """Infer location from tweet text or user profile."""
    combined = f"{text} {user_loc or ''}"
    for place in PLACES:
        if re.search(rf"\b{re.escape(place)}\b", combined, re.IGNORECASE):
            return place
    return "Unknown"

# ==========================
# Preprocessing Pipeline
# ==========================
def preprocess_phase2(input_csv: str, output_csv: str):
    print(f"üìÇ Loading dataset: {input_csv}")
    df = pd.read_csv(input_csv, dtype=str, on_bad_lines="skip")

    print("üßπ Cleaning text...")
    df["raw_text_tweet"] = df["raw_text_tweet"].apply(clean_text)
    df["user_location"] = df["user_location"].fillna("").apply(clean_text)

    print("üóëÔ∏è Dropping duplicates & missing rows...")
    df = df.drop_duplicates(subset=["tweet_id"])
    df = df.dropna(subset=["tweet_id", "raw_text_tweet"]).reset_index(drop=True)

    print("‚è≥ Parsing timestamps...")
    df["created_at"] = pd.to_datetime(
        df["created_at"], errors="coerce", format="%d %b %Y %I:%M:%S %p"
    )

    print("üåê Filtering English tweets...")
    df = df[df["raw_text_tweet"].apply(is_english)].reset_index(drop=True)

    print("üìç Inferring locations...")
    df["location_inferred"] = df.apply(
        lambda row: infer_location(row["raw_text_tweet"], row["user_location"]),
        axis=1,
    )

    print("üî¢ Converting counts to integers...")
    df["retweet_count"] = pd.to_numeric(df.get("retweet_count", 0), errors="coerce").fillna(0).astype(int)
    df["like_count"] = pd.to_numeric(df.get("like_count", 0), errors="coerce").fillna(0).astype(int)

    # Final structured dataset
    df_final = df[
        ["tweet_id", "created_at", "raw_text_tweet", "location_inferred", "retweet_count", "like_count"]
    ].rename(columns={"raw_text_tweet": "clean_text"})

    os.makedirs(os.path.dirname(output_csv), exist_ok=True)
    df_final.to_csv(output_csv, index=False, encoding="utf-8", quoting=1, escapechar="\\")

    print(f"‚úÖ Phase 2 complete. Clean dataset saved at {output_csv}")
    print("üîé Sample rows:")
    print(df_final.head())

# ==========================
# Run
# ==========================
if __name__ == "__main__":
    raw_file = "traffic_tweets.csv"
    clean_file = "traffic_tweets_clean.csv"
    preprocess_phase2(raw_file, clean_file)


üìÇ Loading dataset: traffic_tweets.csv
üßπ Cleaning text...
üóëÔ∏è Dropping duplicates & missing rows...
‚è≥ Parsing timestamps...
üåê Filtering English tweets...
üìç Inferring locations...
üî¢ Converting counts to integers...


FileNotFoundError: [WinError 3] The system cannot find the path specified: ''