In [1]:
import os
from datasets import load_dataset

out = "./data/jziebura_polish_youth_slang"
os.makedirs(out, exist_ok=True)

ds = load_dataset("jziebura/polish_youth_slang_classification")
ds["train"].to_parquet(f"{out}/train.parquet")
ds["validation"].to_parquet(f"{out}/validation.parquet")
ds["test"].to_parquet(f"{out}/test.parquet")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 4337/4337 [00:00<00:00, 291009.24 examples/s]
Generating validation split: 100%|██████████| 542/542 [00:00<00:00, 280517.37 examples/s]
Generating test split: 100%|██████████| 543/543 [00:00<00:00, 294365.65 examples/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 196.14ba/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1178.84ba/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1331.53ba/s]


151148

In [3]:
from datasets import Dataset

out = "./data/jziebura_polish_youth_slang"
train = Dataset.from_parquet(f"{out}/train.parquet")
valid = Dataset.from_parquet(f"{out}/validation.parquet")
test  = Dataset.from_parquet(f"{out}/test.parquet")

print("rows:", len(train), len(valid), len(test))
print("columns:", train.column_names)
print("label example:", train[0].get("sentiment"))

Generating train split: 4337 examples [00:00, 862404.42 examples/s]
Generating train split: 542 examples [00:00, 268649.58 examples/s]
Generating train split: 543 examples [00:00, 318043.16 examples/s]

rows: 4337 542 543
columns: ['słowo slangowe', 'znaczenie wyrazów slangowych', 'źródło', 'powiązana data', 'tekst', 'sentyment']
label example: None





In [5]:
from datasets import Dataset

ds = Dataset.from_parquet("./data/jziebura_polish_youth_slang/train.parquet")
print(ds.column_names)

['słowo slangowe', 'znaczenie wyrazów slangowych', 'źródło', 'powiązana data', 'tekst', 'sentyment']


In [None]:
import os
import re
from collections import Counter
from datasets import Dataset

IN_DIR = "./data/jziebura_polish_youth_slang"
OUT_DIR = "./data/jziebura_polish_youth_slang_clean"
os.makedirs(OUT_DIR, exist_ok=True)

LABEL_CANDIDATES = ["label", "sentiment", "sentymemt", "sentymemt", "sentymemt", "sentymemt"]

def load_split(name: str) -> Dataset:
    ds = Dataset.from_parquet(f"{IN_DIR}/{name}.parquet")
    return standardize(ds)

def standardize(ds: Dataset) -> Dataset:
    cols = ds.column_names

    if "tekst" in cols and "text" not in cols:
        ds = ds.rename_column("tekst", "text")

    label_col = None
    for c in cols:
        if c in LABEL_CANDIDATES:
            label_col = c
            break
    if label_col is None:
        for c in cols:
            if c.lower().startswith("sent"):
                label_col = c
                break

    if label_col is None:
        raise ValueError(f"Nie znaleziono kolumny etykiety. Dostępne kolumny: {cols}")

    if label_col != "label":
        ds = ds.rename_column(label_col, "label")

    return ds

def normalize_text(example):
    t = example["text"]
    if t is None:
        return {"text": None}
    t = re.sub(r"\s+", " ", t).strip()
    return {"text": t}

def is_valid(example):
    t = example["text"]
    y = example["label"]
    return (t is not None and t != "" and y is not None)

def show_stats(ds: Dataset, name: str):
    labels = ds["label"]
    lengths = [len(x) for x in ds["text"]]
    print(f"{name}: n={len(ds)}")
    print("  class dist:", dict(sorted(Counter(labels).items())))
    print("  text len (chars): min/mean/max =",
          min(lengths), sum(lengths)/len(lengths), max(lengths))

def clean_split(ds: Dataset, drop_duplicates: bool = True) -> Dataset:
    ds = ds.map(normalize_text)
    ds = ds.filter(is_valid)

    if drop_duplicates:
        seen = set()
        def dedup(example):
            key = (example["text"], int(example["label"]))
            if key in seen:
                return False
            seen.add(key)
            return True
        ds = ds.filter(dedup)

    return ds

splits = {}
for split in ["train", "validation", "test"]:
    ds = load_split(split)
    print("RAW columns:", ds.column_names)  # debug
    show_stats(ds, f"{split} (raw)")

    ds_clean = clean_split(ds, drop_duplicates=True)
    show_stats(ds_clean, f"{split} (clean)")

    ds_clean.to_parquet(f"{OUT_DIR}/{split}.parquet")
    splits[split] = ds_clean

print("Saved cleaned splits to:", OUT_DIR)


RAW columns: ['słowo slangowe', 'znaczenie wyrazów slangowych', 'źródło', 'powiązana data', 'text', 'label']
train (raw): n=4337
  class dist: {0: 1259, 1: 2219, 2: 859}
  text len (chars): min/mean/max = 6 89.34078856352318 958


Map: 100%|██████████| 4337/4337 [00:00<00:00, 28773.32 examples/s]
Filter: 100%|██████████| 4337/4337 [00:00<00:00, 229188.57 examples/s]
Filter: 100%|██████████| 4337/4337 [00:00<00:00, 34006.83 examples/s]


train (clean): n=4335
  class dist: {0: 1259, 1: 2217, 2: 859}
  text len (chars): min/mean/max = 5 78.9838523644752 957


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 12.64ba/s]


RAW columns: ['słowo slangowe', 'znaczenie wyrazów slangowych', 'źródło', 'powiązana data', 'text', 'label']
validation (raw): n=542
  class dist: {0: 155, 1: 273, 2: 114}
  text len (chars): min/mean/max = 13 90.12177121771218 538


Map: 100%|██████████| 542/542 [00:00<00:00, 26827.23 examples/s]
Filter: 100%|██████████| 542/542 [00:00<00:00, 131458.55 examples/s]
Filter: 100%|██████████| 542/542 [00:00<00:00, 75467.67 examples/s]

validation (clean): n=542





  class dist: {0: 155, 1: 273, 2: 114}
  text len (chars): min/mean/max = 13 79.96678966789668 537


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 95.54ba/s]


RAW columns: ['słowo slangowe', 'znaczenie wyrazów slangowych', 'źródło', 'powiązana data', 'text', 'label']
test (raw): n=543
  class dist: {0: 148, 1: 275, 2: 120}
  text len (chars): min/mean/max = 6 91.60036832412523 414


Map: 100%|██████████| 543/543 [00:00<00:00, 25334.63 examples/s]
Filter: 100%|██████████| 543/543 [00:00<00:00, 130344.36 examples/s]
Filter: 100%|██████████| 543/543 [00:00<00:00, 65099.53 examples/s]


test (clean): n=543
  class dist: {0: 148, 1: 275, 2: 120}
  text len (chars): min/mean/max = 6 81.20810313075506 413


Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 114.93ba/s]

Saved cleaned splits to: ./data/jziebura_polish_youth_slang_clean





In [9]:
df = splits["train"].to_pandas()
df.head(5)

Unnamed: 0,słowo slangowe,znaczenie wyrazów slangowych,źródło,powiązana data,text,label
0,Furta,Pogardliwe określenie na szerokiego kolegę. Za...,miejski.pl,2024-08-26 20:28:25,"- Masz może lejsy, Damian? - Mam, ale ci nie d...",0
1,rzucać buchem,handlować marihuaną,miejski.pl,2020-07-16 15:36:41,"- Siema mordo, skąd Gucio ma kase na te markow...",1
2,Chędożony,Przymiotnik używany w celu podkreśleniu emocji...,miejski.pl,25.01.2023 13:02,Gówniarz chędożony! Jak nie masz gdzie palcy w...,0
3,Jebnik,"niewyobrażalny brud, syf, bałagan. Synonim sło...",miejski.pl,2025-02-18 14:07:22,- Zaraz wszystko z blatu w kuchni wyląduje w k...,0
4,Baka!,"Po japońsku ""głupek"". Używane (często żartobli...",miejski.pl,13.04.2023 20:16,Baka!,0
