## Combining Datasets

In [4]:
! pip install polars
! pip install langdetect




In [7]:
import polars as pl
import re

# --- Step 1: Read and combine CSV files with Polars ---
# Input files, listed in the order they are concatenated.
csv_files = [
    "comments1.csv",
    "comments2.csv",
    "comments3.csv",
    "comments4.csv",
    "comments5.csv"
]

# Load every file into its own DataFrame, then merge them into a single frame.
df_list = list(map(pl.read_csv, csv_files))
BIGBOY = pl.concat(df_list)

# Persist the merged frame to disk and report its (rows, columns) shape.
BIGBOY.write_csv("BIG_BOY.csv")
print("✅ Combined dataset saved as BIG_BOY.csv")
print("Shape of BIGBOY:", BIGBOY.shape)

# --- Step 2: Text cleaning (but keep emojis & foreign languages) ---
def clean_text(text: "str | None") -> "str | None":
    """Normalize a single comment string.

    Lowercases the text, replaces every ASCII digit and ASCII punctuation
    character with a space, then collapses whitespace runs into one space
    and trims both ends. Characters outside those ASCII classes (emojis,
    non-Latin scripts) are not removed.

    Args:
        text: Raw comment text, or None for a missing value.

    Returns:
        The cleaned string (possibly empty), or None when ``text`` is None.
    """
    if text is None:
        return None

    # Lowercase
    text = text.lower()

    # Remove ASCII punctuation & numbers, keep emojis/foreign chars.
    # The class lists every ASCII punctuation character, including the
    # backslash itself, which the earlier pattern left in the text.
    text = re.sub(r"[0-9!\"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]", " ", text)

    # Collapse whitespace runs and trim leading/trailing spaces
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Build the per-row cleaning expression first, then attach its result as a
# new "cleaned_comment" column; the raw "textOriginal" column is kept as is.
cleaned_comment_expr = pl.col("textOriginal").map_elements(
    clean_text, return_dtype=pl.Utf8
)
BIGBOY = BIGBOY.with_columns(cleaned_comment_expr.alias("cleaned_comment"))

print("✅ Text cleaning done")

# Preview raw vs. cleaned text side by side for the first ten rows.
preview_columns = ["textOriginal", "cleaned_comment"]
print(BIGBOY.select(preview_columns).head(10))


✅ Combined dataset saved as BIG_BOY.csv
Shape of BIGBOY: (4725012, 10)
✅ Text cleaning done
shape: (10, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ textOriginal                    ┆ cleaned_comment                 │
│ ---                             ┆ ---                             │
│ str                             ┆ str                             │
╞═════════════════════════════════╪═════════════════════════════════╡
│ PLEASE LESBIAN FLAG I BEG YOU … ┆ please lesbian flag i beg you … │
│ Apply mashed potato juice and … ┆ apply mashed potato juice and … │
│ 69 missed calls from mars👽     ┆ missed calls from mars👽        │
│ Baaa                            ┆ baaa                            │
│ you look like raven from pheno… ┆ you look like raven from pheno… │
│ American                        ┆ american                        │
│ Sahi disha me ja ja raha india… ┆ sahi disha me ja ja raha india… │
│ ❤❤❤❤❤❤❤❤❤❤                      ┆ ❤❤❤❤❤❤❤❤❤❤         