## Combining Datasets

In [1]:
! pip install polars
! pip install langdetect



In [2]:
import pandas as pd

# Source files to merge; rows are stacked in this order.
csv_files = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv"
]

# Load each file into its own frame, then stack them.
# ignore_index=True discards the per-file row labels and renumbers from 0.
df_list = list(map(pd.read_csv, csv_files))
big_boy = pd.concat(df_list, ignore_index=True)

# Write the merged frame to disk; index=False keeps the row labels out of the CSV.
big_boy.to_csv("BIG_BOY.csv", index=False)

# Report what was written.
print("✅ Combined dataset saved as BIG_BOY.csv")
print("Shape of BIG BOY:", big_boy.shape)


KeyboardInterrupt: 

In [None]:
import polars as pl
import re

# `big_boy` (built in the combining cell above) holds the raw comment text in "textOriginal".
# The cleaned text is written to a new column, "cleaned_comment".
# (Change "textOriginal" below if your text column has a different name.)

# ASCII digits plus every ASCII punctuation character (the same set as
# string.punctuation, backslash included). Non-ASCII letters, diacritics and
# emojis are deliberately NOT in this class, so they survive cleaning.
_ASCII_PUNCT_OR_DIGIT = re.compile(r"[0-9!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]")
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text: "str | None") -> "str | None":
    """Normalize a comment: lowercase it and strip ASCII punctuation/digits.

    Args:
        text: The raw comment, or None for a missing value.

    Returns:
        None when ``text`` is None; otherwise the lowercased text with every
        ASCII digit and ASCII punctuation character replaced by a space,
        whitespace runs collapsed to a single space, and leading/trailing
        whitespace removed. The result may be an empty string.
    """
    if text is None:
        return None

    # Each removed character becomes a space (not an empty string) so that
    # words joined only by punctuation, e.g. "a,b", stay separate tokens.
    without_symbols = _ASCII_PUNCT_OR_DIGIT.sub(" ", text.lower())

    # Collapse the gaps left by the substitution above.
    return _WHITESPACE_RUN.sub(" ", without_symbols).strip()

# Apply cleaning with Polars.
# `big_boy` is created with pandas in the combining cell, and pandas frames
# have no `with_columns`, so convert it to a Polars DataFrame first. A frame
# that is already Polars is used as-is.
# NOTE: pl.from_pandas requires pyarrow to be installed.
source_frame = big_boy if isinstance(big_boy, pl.DataFrame) else pl.from_pandas(big_boy)

# Keep the raw text and add the normalized version alongside it.
BIGBOY = source_frame.with_columns(
    pl.col("textOriginal")
    .map_elements(clean_text, return_dtype=pl.Utf8)
    .alias("cleaned_comment")
)

# Preview raw vs. cleaned text for the first rows.
BIGBOY.select(["textOriginal", "cleaned_comment"]).head(10)


In [None]:
import polars as pl
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
import math

# Pin langdetect's seed so repeated runs label the same text the same way.
# NOTE(review): per the langdetect README, detection is otherwise non-deterministic — confirm.
DetectorFactory.seed = 42

def detect_language(text: str) -> str:
    """Return the language code langdetect assigns to ``text``.

    Falls back to the string "unknown" when ``text`` is falsy (None or empty),
    contains only whitespace, or when langdetect raises LangDetectException.
    """
    has_content = bool(text) and text.strip() != ""
    if has_content:
        try:
            return detect(text)
        except LangDetectException:
            # Fall through to the shared "unknown" result below.
            pass
    return "unknown"

# Parameters
batch_size = 500_000   # adjust depending on memory/speed
# Column to run detection on. The cleaning step produces "cleaned_comment";
# there is no "comment" column. Use "textOriginal" to detect on the raw text.
text_column = "cleaned_comment"
n_rows = BIGBOY.height
n_batches = math.ceil(n_rows / batch_size)

results = []

# Process the frame in fixed-size row slices so progress can be reported
# while the (slow, per-row Python) language detection runs.
for i in range(n_batches):
    start = i * batch_size
    end = min(start + batch_size, n_rows)

    print(f"Processing batch {i+1}/{n_batches} (rows {start}–{end})...")

    batch = BIGBOY.slice(start, end - start).with_columns(
        pl.col(text_column)
        .map_elements(detect_language, return_dtype=pl.Utf8)
        .alias("language")
    )

    results.append(batch)

# Combine batches. pl.concat raises on an empty list, so an empty input frame
# just gets an (empty) "language" column added instead.
if results:
    BIGBOY_lang = pl.concat(results)
else:
    BIGBOY_lang = BIGBOY.with_columns(pl.lit(None, dtype=pl.Utf8).alias("language"))

# Count language frequencies.
# `group_by` replaces the removed `groupby`; naming the aggregation explicitly
# keeps the "count" column name stable across Polars versions.
lang_counts = (
    BIGBOY_lang.group_by("language")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

lang_counts.head(20)
