# XHS Training Data Preparation

Prepare XHS CSV data for model training:
- Load collected + mock datasets
- Normalize text into a model-ready field
- Keep unlabeled pool separate
- Create stratified train/val/test splits for labeled data

In [None]:
# Segment 1: Imports and paths
from pathlib import Path
import re
import pandas as pd
from sklearn.model_selection import train_test_split

# Every input and output location is resolved relative to the current
# working directory.
BASE_DIR = Path.cwd()
COLLECTED_CSV = BASE_DIR.joinpath("xhs_scraped_data_clean.csv")
MOCK_CSV = BASE_DIR.joinpath("xhs_mock_training_data.csv")

# Destination folder for the exported files; created up front (including any
# missing parents) and left untouched if it already exists.
OUT_DIR = BASE_DIR.joinpath("training_data")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Report which inputs are present before any loading is attempted.
print(
    f"Base dir: {BASE_DIR}",
    f"Collected CSV exists: {COLLECTED_CSV.exists()}",
    f"Mock CSV exists: {MOCK_CSV.exists()}",
    sep="\n",
)

In [None]:
# Segment 3: Load and unify datasets
# Read whichever input CSVs exist, tag every row with its origin in
# "source_file", and stack them into a single DataFrame `df` that is
# guaranteed to contain the columns the later cells rely on.
frames = []

if COLLECTED_CSV.exists():
    collected = pd.read_csv(COLLECTED_CSV)
    collected["source_file"] = "collected"
    # Collected data may not carry labels yet; add an empty label column so
    # the rows line up with the mock data when concatenated.
    if "label_disruption" not in collected.columns:
        collected["label_disruption"] = None
    frames.append(collected)

if MOCK_CSV.exists():
    mock = pd.read_csv(MOCK_CSV)
    mock["source_file"] = "mock"
    frames.append(mock)

if not frames:
    raise FileNotFoundError("No input CSVs found. Generate collected/mock CSVs first.")

df = pd.concat(frames, ignore_index=True)

# The label column is only guaranteed above for the collected file. If the
# mock file is the sole input and lacks the column, create it here so the
# label-cleaning step does not fail with a KeyError.
if "label_disruption" not in df.columns:
    df["label_disruption"] = None

# Metadata/text columns default to empty strings when no input provided them.
for col in ["category", "filename", "window_title", "scraped_at", "raw", "cleaned"]:
    if col not in df.columns:
        df[col] = ""

# Utility functions
# Matches any character that is not a word character, whitespace, or in the
# U+4E00-U+9FFF range (CJK unified ideographs). For str patterns `\w` is
# already Unicode-aware, so the explicit range is a belt-and-braces addition.
SPECIAL_CHARS = re.compile(r"[^\w\s\u4e00-\u9fff]")

def normalize_for_model(text: str) -> str:
    """Return *text* flattened to a single lowercase, space-separated line.

    Newlines become spaces, every character matched by ``SPECIAL_CHARS`` is
    replaced by a space, runs of whitespace collapse to one space, and the
    result is stripped and lowercased.

    Missing values (``None``, ``NaN``, ``pd.NA``) and other falsy inputs
    yield an empty string.
    """
    # NaN is truthy, so `text or ""` alone would let it through and produce
    # the literal token "nan"; treat any missing scalar as empty text instead.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""
    text = str(text or "")
    # NOTE(review): "/n" looks like a typo for a newline escape. It also eats
    # the "n" after any slash (e.g. "and/not" -> "and ot"). Kept as-is
    # because the input data may really contain "/n" markers -- confirm.
    text = text.replace("/n", " ").replace("\n", " ")
    text = SPECIAL_CHARS.sub(" ", text)
    text = re.sub(r"\s+", " ", text).strip().lower()
    return text

def safe_int_label(value):
    """Coerce a raw label cell to the integer 0 or 1, or ``None`` if invalid.

    Accepts the integers 0/1, their string forms (surrounding whitespace is
    ignored), and float-typed equivalents such as ``1.0`` or ``"0.0"``.
    Missing values and anything else (other numbers, booleans, free text)
    return ``None``.
    """
    if pd.isna(value):
        return None
    text = str(value).strip()
    if text in {"0", "1"}:
        return int(text)
    # pandas stores an integer column containing blanks as float, so valid
    # labels arrive as 1.0 / 0.0. Accept those instead of silently discarding
    # every label in such a column.
    try:
        number = float(text)
    except ValueError:
        return None
    if number in (0.0, 1.0):
        return int(number)
    return None

# Blank out missing text first so the string operations below see only "".
for _text_col in ("cleaned", "raw"):
    df[_text_col] = df[_text_col].fillna("")

# Prefer the "cleaned" text; fall back to "raw" for rows whose cleaned text
# is empty once surrounding whitespace is stripped. The chosen text is then
# normalised into the model input column.
_cleaned_is_blank = df["cleaned"].str.strip() == ""
df["text_for_model"] = (
    df["cleaned"].mask(_cleaned_is_blank, df["raw"]).apply(normalize_for_model)
)

# Reduce the label column to 0, 1, or missing.
df["label_disruption"] = df["label_disruption"].apply(safe_int_label)

# Summary: overall size plus a source-by-label breakdown (missing labels included).
print(f"Total rows: {len(df)}")
print(df.value_counts(subset=["source_file", "label_disruption"], dropna=False))

In [None]:

# Segment 4: Separate labeled and unlabeled pools
# Rows whose label is exactly 0 or 1 form the labeled pool; every other row
# (missing or unrecognised label) goes to the unlabeled pool.

# Keep only useful training columns
train_cols = [
    "category",
    "filename",
    "window_title",
    "scraped_at",
    "source_file",
    "text_for_model",
    "label_disruption",
]
unlabeled_cols = [col for col in train_cols if col != "label_disruption"]

has_label = df["label_disruption"].isin([0, 1])

# Select rows and columns in one step and copy, so each pool is an
# independent frame rather than a slice of another frame.
labeled_df = df.loc[has_label, train_cols].copy()
unlabeled_df = df.loc[~has_label, unlabeled_cols].copy()

# When unlabeled rows share the column, the labels are presumably stored as
# floats (0.0 / 1.0) because missing entries force a float dtype. Cast the
# labeled pool back to integers so the exported files carry clean 0/1 labels.
labeled_df["label_disruption"] = labeled_df["label_disruption"].astype(int)

print(f"Labeled rows: {len(labeled_df)}")
print(f"Unlabeled rows: {len(unlabeled_df)}")
if not labeled_df.empty:
    print(labeled_df["label_disruption"].value_counts())

In [None]:
# Segment 5: Stratified split (70/15/15)
# A 30% hold-out is carved off first and then halved, which yields 70% train,
# 15% validation and 15% test. Both cuts are stratified on the label and use
# the same fixed seed.
if not len(labeled_df) >= 10:
    raise ValueError("Not enough labeled rows to split reliably.")


def _split_by_label(frame, holdout_fraction):
    """Split *frame* in two, stratified on "label_disruption", with seed 42."""
    return train_test_split(
        frame,
        test_size=holdout_fraction,
        stratify=frame["label_disruption"],
        random_state=42,
    )


train_df, temp_df = _split_by_label(labeled_df, 0.30)
val_df, test_df = _split_by_label(temp_df, 0.50)

print(
    f"Train rows: {len(train_df)}",
    f"Val rows: {len(val_df)}",
    f"Test rows: {len(test_df)}",
    sep="\n",
)

In [None]:
# Segment 6: Export model-ready CSV files
# One output file per split, plus the full labeled set and the unlabeled pool.
train_path, val_path, test_path, labeled_path, unlabeled_path = (
    OUT_DIR / file_name
    for file_name in (
        "train.csv",
        "val.csv",
        "test.csv",
        "labeled_full.csv",
        "unlabeled_pool.csv",
    )
)

_exports = (
    (train_path, train_df),
    (val_path, val_df),
    (test_path, test_df),
    (labeled_path, labeled_df),
    (unlabeled_path, unlabeled_df),
)

# Write everything first; the summary is printed only once all files are saved.
for _target, _frame in _exports:
    _frame.to_csv(_target, index=False, encoding="utf-8")

print("Saved training-ready datasets:")
for _target, _ in _exports:
    print(f"- {_target}")

In [None]:
# Segment 7: Quick QA checks
# Print the class proportions of each split so the stratification can be
# checked by eye, then show a few training rows.
_qa_splits = {"train": train_df, "val": val_df, "test": test_df}
for _split_name, _split_frame in _qa_splits.items():
    print(f"\n[{_split_name}]")
    _class_share = _split_frame["label_disruption"].value_counts(normalize=True)
    print(_class_share.sort_index())

print("\nSample training rows:")
display(train_df.loc[:, ["text_for_model", "label_disruption"]].head(5))