In [2]:
# 1) Install & setup (run once)
# Installs the third-party packages used by this notebook (versions are not pinned).
!pip install -q tqdm regex nltk
# If you want lemmatization via spaCy (optional; heavier):
# !pip install -q spacy
# !python -m spacy download en_core_web_sm

import os
import re
import unicodedata
import pandas as pd
from tqdm.auto import tqdm
import nltk
from nltk.corpus import stopwords
from multiprocessing import Pool, cpu_count
# Hook tqdm into pandas (enables the progress_* variants of apply/map).
tqdm.pandas()
# Fetch the NLTK stop-word corpus; reports "already up-to-date" when it is present.
nltk.download('stopwords')
# NOTE(review): spaCy is imported unconditionally although its install lines above are
# commented out — this presumably relies on the runtime shipping spaCy; confirm, or drop
# the import if spaCy is not used further down.
import spacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Mount Google Drive at /content/drive (Colab-only helper) so the CSV paths
# configured in the next cell resolve.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
# Path to input and output inside Google Drive (under the /content/drive mount point).
# INPUT_CSV  - CSV read by the processing cell.
# OUTPUT_CSV - CSV written by the processing cell; an existing file at this path is replaced.
INPUT_CSV = "/content/drive/MyDrive/Data Folder/all_extracted.csv"
OUTPUT_CSV = "/content/drive/MyDrive/Data Folder/processed_text.csv"


In [10]:
import os, re, string, pandas as pd
# NOTE: this rebinds `tqdm` (previously imported from tqdm.auto in the setup cell)
# to the plain tqdm implementation for the rest of the notebook.
from tqdm import tqdm
# NOTE: `nltk` is not imported in this cell before this call; it relies on the
# `import nltk` executed in the setup cell.
# punkt_tab is presumably the tokenizer data needed by word_tokenize below — confirm
# against the installed NLTK version.
nltk.download('punkt_tab')

PROCESS_IN_CHUNKS = True   # True = safer for very large files, False = load entire CSV at once
CHUNKSIZE = 10000         # used only when PROCESS_IN_CHUNKS = True
TEXT_COLUMN = None        # set to your text column name if known (e.g., "text"), else auto-detect
# Minimum token length (in characters) kept by preprocess_text; shorter tokens are dropped.
MIN_TOKEN_LEN = 2

# --------------- NLTK: try to enable lemmatization -------------
# This section defines the module-level names used by the functions below:
#   use_nltk   - True only when NLTK tokenization was verified to work
#   TOKENIZER  - callable taking a string and returning a list of tokens
#   STOPWORDS  - set of lowercase English stop words
#   LEMMATIZER - object exposing .lemmatize(token), or None when unavailable
use_nltk = True
try:
    import nltk
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    # Ensure the data packages are present (nothing is downloaded when already installed).
    # "punkt_tab" is included alongside "punkt" so the tokenizer data is covered here
    # as well, rather than only by a separate download call.
    for pkg, resource_path in (
        ("punkt", "tokenizers/punkt"),
        ("punkt_tab", "tokenizers/punkt_tab"),
        ("stopwords", "corpora/stopwords"),
        ("wordnet", "corpora/wordnet"),
        ("omw-1.4", "corpora/omw-1.4"),
    ):
        try:
            nltk.data.find(resource_path)
        except LookupError:
            try:
                nltk.download(pkg, quiet=True)
            except Exception:
                # Best effort (e.g. offline); the probes below decide what is usable.
                pass

    # load resources (may still fail if offline)
    STOPWORDS = set(w.lower() for w in stopwords.words("english"))
    TOKENIZER = word_tokenize
    # word_tokenize loads its model lazily on first use: probe it now so a missing
    # model triggers the fallback below instead of failing in the middle of a run.
    TOKENIZER("probe sentence")

    # WordNetLemmatizer() also defers loading WordNet until the first lemmatize() call.
    # Probe it once here; on failure keep NLTK tokenization but disable lemmatization
    # explicitly instead of having every row fail silently later.
    try:
        LEMMATIZER = WordNetLemmatizer()
        LEMMATIZER.lemmatize("probes")
    except Exception:
        LEMMATIZER = None
except Exception:
    use_nltk = False

    # fallback simple tokenizer & stopwords
    def simple_tokenize(s):
        """Split on whitespace; used when NLTK tokenization is unavailable."""
        return s.split()

    TOKENIZER = simple_tokenize
    STOPWORDS = {
        "i","me","my","myself","we","our","ours","ourselves","you","your","yours","yourself","yourselves",
        "he","him","his","himself","she","her","hers","herself","it","its","itself","they","them","their",
        "theirs","themselves","what","which","who","whom","this","that","these","those","am","is","are","was","were",
        "be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or",
        "because","as","until","while","of","at","by","for","with","about","against","between","into","through",
        "during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again",
        "further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most",
        "other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just",
        "don","should","now"
    }
    # preprocess_text only lemmatizes when use_nltk is True, so this instance is not
    # used by it in fallback mode; it is still created so LEMMATIZER is always defined.
    try:
        from nltk.stem import WordNetLemmatizer
        LEMMATIZER = WordNetLemmatizer()
    except Exception:
        LEMMATIZER = None
# -------------------------------------------------------------

# ------------------ cleaning (keeps your simple style) ----------
# Translation table deleting every ASCII punctuation character (string.punctuation).
# Built once instead of on every clean_text call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize raw text for tokenization.

    Steps, in order: replace e-mail addresses with ``emailaddress``, replace URLs
    (``http://``, ``https://`` or ``www.``; matched case-insensitively) with ``url``,
    delete ASCII punctuation, lowercase, and collapse runs of whitespace.
    Digits are kept as they are.

    Args:
        text: Any value; non-strings are converted with ``str()``.

    Returns:
        The cleaned string. ``None`` and float NaN (a missing cell) yield ``""``.
    """
    # NaN is the only float that is not equal to itself; without this check a
    # missing value would be turned into the literal token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""
    s = str(text)
    # E-mails first: the URL and punctuation steps would otherwise break them apart.
    s = re.sub(r'\S+@\S+', 'emailaddress', s)
    s = re.sub(r'https?://\S+|www\.\S+', 'url', s, flags=re.IGNORECASE)
    s = s.translate(_PUNCT_TABLE)
    s = s.lower()
    return re.sub(r'\s+', ' ', s).strip()

def preprocess_text(text):
    """Clean, tokenize, length-filter and (when possible) lemmatize one text value.

    Stop words are intentionally kept. Tokens shorter than MIN_TOKEN_LEN characters
    are dropped. Returns the surviving tokens joined by single spaces, or "" when
    nothing is left after cleaning.
    """
    cleaned = clean_text(text)
    if not cleaned:
        return ""

    # Lowercase every string token, then keep only those that are long enough.
    lowered = (tok.lower() for tok in TOKENIZER(cleaned) if isinstance(tok, str))
    kept = [tok for tok in lowered if len(tok) >= MIN_TOKEN_LEN]

    # Lemmatization is best effort: on any failure the unlemmatized tokens are used.
    if use_nltk and LEMMATIZER is not None:
        try:
            kept = list(map(LEMMATIZER.lemmatize, kept))
        except Exception:
            pass

    return " ".join(kept)


# ----------------- column detection (simple) ---------------------
def detect_text_column(df):
    """Guess which column of ``df`` holds the free text.

    Detection order:
      1. An exact match against a list of common names, in priority order.
      2. The same list matched ignoring case and surrounding whitespace
         (e.g. a column called "Text" or " body ").
      3. The object/string-typed column whose non-null values have the highest
         average string length.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        The column label, or None when no candidate column exists.
    """
    preferred = ("text", "content", "message", "body", "tweet", "review")

    # 1) common names first, exact match
    for name in preferred:
        if name in df.columns:
            return name

    # 2) same names, tolerant of case/whitespace; first occurrence wins
    normalized = {}
    for col in df.columns:
        normalized.setdefault(str(col).strip().lower(), col)
    for name in preferred:
        if name in normalized:
            return normalized[name]

    # 3) otherwise choose the text-like column with the highest average length.
    # Both object dtype and pandas' dedicated string dtypes count as text-like.
    best, best_len = None, -1
    for col in df.columns:
        try:
            series = df[col]
            if not (pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)):
                continue
            avg = series.dropna().astype(str).map(len).mean()
        except Exception:
            # Skip columns that cannot be measured (e.g. duplicated labels).
            continue
        # An all-null column has a NaN average, which never compares greater.
        if avg > best_len:
            best, best_len = col, avg
    return best
# ----------------------------------------------------------------

# -------------- processing helpers -------------------------------
def process_csv_in_chunks(input_path, output_path, text_col=None, chunksize=CHUNKSIZE):
    """Stream a CSV through preprocess_text chunk by chunk and write the result.

    Every chunk is read with all columns as strings, gets an extra column named
    ``<text_col>_processed`` and is appended to ``output_path`` (the header is
    written with the first chunk only). An existing output file is removed first.

    Args:
        input_path: Path of the CSV to read.
        output_path: Path of the CSV to write; must differ from ``input_path``.
        text_col: Name of the text column; when None it is detected from the
            first chunk via detect_text_column and reused for later chunks.
        chunksize: Number of rows per chunk.

    Returns:
        Tuple ``(total_rows, text_col)`` with the number of rows read and the
        column that was processed.

    Raises:
        FileNotFoundError: ``input_path`` does not exist.
        ValueError: input and output are the same file, the text column cannot
            be detected, or the requested column is missing from a chunk.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input CSV not found: {input_path}")
    # The output is deleted before reading starts, so identical paths would
    # destroy the input file. Refuse instead of losing data.
    if os.path.realpath(input_path) == os.path.realpath(output_path):
        raise ValueError("Output path must differ from input path; refusing to overwrite the input CSV.")
    if os.path.exists(output_path):
        os.remove(output_path)

    first = True
    total = 0
    # The context manager closes the underlying file handle even when a chunk fails.
    with pd.read_csv(input_path, chunksize=chunksize, iterator=True, dtype=str, low_memory=False) as reader:
        for chunk in tqdm(reader, desc="Processing (chunks)"):
            total += len(chunk)
            if text_col is None:
                text_col = detect_text_column(chunk)
            if text_col is None:
                raise ValueError("Could not detect text column. Set TEXT_COLUMN manually.")
            if text_col not in chunk.columns:
                raise ValueError(f"Text column '{text_col}' not found in chunk.")
            chunk[text_col + "_processed"] = chunk[text_col].fillna("").astype(str).map(preprocess_text)
            if first:
                chunk.to_csv(output_path, index=False, mode="w")
                first = False
            else:
                # Later chunks are appended without repeating the header row.
                chunk.to_csv(output_path, index=False, header=False, mode="a")
    return total, text_col

def process_whole_file(input_path, output_path, text_col=None):
    """Load the whole CSV, add ``<text_col>_processed`` and save it in one pass.

    All columns are read as strings. When ``text_col`` is None the column is
    chosen by detect_text_column. Returns ``(row_count, text_col)``.

    Raises FileNotFoundError when the input is missing and ValueError when the
    text column cannot be detected or is not present in the CSV.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(f"Input CSV not found: {input_path}")

    frame = pd.read_csv(input_path, dtype=str, low_memory=False)

    # Resolve the column to process, then validate it.
    if text_col is None:
        text_col = detect_text_column(frame)
        if text_col is None:
            raise ValueError("Could not detect text column. Set TEXT_COLUMN manually.")
    if text_col not in frame.columns:
        raise ValueError(f"Text column '{text_col}' not found in CSV.")

    # Missing cells become empty strings before preprocessing.
    raw_text = frame[text_col].fillna("").astype(str)
    frame[text_col + "_processed"] = raw_text.map(preprocess_text)

    frame.to_csv(output_path, index=False)
    return len(frame), text_col
# ----------------------------------------------------------------

# ----------------------- RUN ------------------------------------
# Echo the effective configuration before any work starts.
print("INPUT:", INPUT_CSV)
print("OUTPUT:", OUTPUT_CSV)
print("PROCESS_IN_CHUNKS:", PROCESS_IN_CHUNKS)
try:
    # Pick the whole-file or the streaming implementation based on the config flag.
    if not PROCESS_IN_CHUNKS:
        total_rows, used_col = process_whole_file(INPUT_CSV, OUTPUT_CSV, text_col=TEXT_COLUMN)
    else:
        total_rows, used_col = process_csv_in_chunks(
            INPUT_CSV, OUTPUT_CSV, text_col=TEXT_COLUMN, chunksize=CHUNKSIZE
        )
    print(f"Done. Rows processed: {total_rows}. Text column used: '{used_col}'.")
    # Tell the reader when the run used degraded NLP tooling.
    if use_nltk:
        if LEMMATIZER is None:
            print("Note: Lemmatizer not available; lemmatization skipped.")
    else:
        print("Note: NLTK not available or resources missing — used fallback tokenization/stopwords. Lemmatization may be skipped.")
    print("Saved processed CSV to:", OUTPUT_CSV)
except Exception as err:
    # Failures are reported as a one-line message; the cell itself does not raise.
    print("Processing failed:", str(err))
# ----------------------------------------------------------------


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


INPUT: /content/drive/MyDrive/Data Folder/all_extracted.csv
OUTPUT: /content/drive/MyDrive/Data Folder/processed_text.csv
PROCESS_IN_CHUNKS: True


Processing (chunks): 2it [00:17,  8.55s/it]

Done. Rows processed: 14235. Text column used: 'text'.
Saved processed CSV to: /content/drive/MyDrive/Data Folder/processed_text.csv



