# About

Simple baselines for GhostWriter dataset (e.g. linear regression).

In [237]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [238]:
!pip install xgboost lightgbm



In [269]:
import re, math, string, numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from datasets import DatasetDict, load_dataset
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.utils import check_random_state

rng = check_random_state(42)

In [240]:
# Helpers
_punct_tbl = str.maketrans("", "", "")
punct_set = set(string.punctuation)
url_pat = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
sent_pat = re.compile(r"[.!?]")
nonspace_pat = re.compile(r"\S")
alnum_pat = re.compile(r"[A-Za-z0-9]")

def safe_div(a, b):
    return a / b if b else 0.0

def tokenize_whitespace_strip_punct(text):
    toks = []
    for tok in text.split():
        # strip leading/trailing punctuation
        tok = tok.strip(string.punctuation)
        if tok and alnum_pat.search(tok):
            toks.append(tok)
    return toks

def extract_metrics_batch(batch):
    texts = batch["text"]
    out = {
        "n_chars": [],
        "n_chars_nospace": [],
        "n_words": [],
        "avg_word_len": [],
        "n_sents": [],
        "n_punct": [],
        "punct_ratio": [],
        "upper_ratio": [],
        "digit_ratio": [],
        "url_count": [],
        "type_token_ratio": [],
        "hapax_ratio": [],
    }
    for t in texts:
        t = t if isinstance(t, str) else ""
        n_chars = len(t)
        n_chars_nospace = len(re.findall(r"\S", t))
        n_punct = sum(1 for ch in t if (not ch.isalnum()) and (not ch.isspace()))
        n_upper = sum(1 for ch in t if ch.isupper())
        n_digit = sum(1 for ch in t if ch.isdigit())
        url_count = len(url_pat.findall(t))

        # sentence count (min 1 for non-empty text)
        sent_splits = [s for s in sent_pat.split(t) if s.strip()]
        n_sents = max(1, len(sent_splits)) if t.strip() else 0

        toks = tokenize_whitespace_strip_punct(t)
        n_words = len(toks)
        avg_word_len = safe_div(sum(len(w) for w in toks), n_words)

        # lexical diversity
        toks_lower = [w.lower() for w in toks]
        vocab = set(toks_lower)
        type_token_ratio = safe_div(len(vocab), n_words)

        # hapax legomena ratio
        from collections import Counter
        cnt = Counter(toks_lower)
        hapax_ratio = safe_div(sum(1 for w, c in cnt.items() if c == 1), n_words)

        punct_ratio = safe_div(n_punct, n_chars_nospace)
        upper_ratio = safe_div(n_upper, n_chars_nospace)
        digit_ratio = safe_div(n_digit, n_chars_nospace)

        out["n_chars"].append(n_chars)
        out["n_chars_nospace"].append(n_chars_nospace)
        out["n_words"].append(n_words)
        out["avg_word_len"].append(avg_word_len)
        out["n_sents"].append(n_sents)
        out["n_punct"].append(n_punct)
        out["punct_ratio"].append(punct_ratio)
        out["upper_ratio"].append(upper_ratio)
        out["digit_ratio"].append(digit_ratio)
        out["url_count"].append(url_count)
        out["type_token_ratio"].append(type_token_ratio)
        out["hapax_ratio"].append(hapax_ratio)

    return out

In [241]:
FEATURES = [
    "n_chars","n_chars_nospace","n_words","avg_word_len","n_sents","n_punct",
    "punct_ratio","upper_ratio","digit_ratio","url_count","type_token_ratio","hapax_ratio",
]

## Dataset & Feature Extraction

In [242]:
dataset = load_dataset("TheItCrOw/GhostWriter")

In [None]:
# 1. Keep only 1 and 0
dataset01 = DatasetDict({
    split: ds.filter(lambda ex: ex["label"] in (0, 1))
    for split, ds in dataset.items()
})

# 2. Remove empty texts
dataset01 = DatasetDict({
    split: ds.filter(lambda ex: isinstance(ex["text"], str) and ex["text"].strip() != "")
    for split, ds in dataset01.items()
})

def clean_markdown(batch):
    cleaned = []
    for t in batch["text"]:
        # Collapse multiple spaces and trim
        t = re.sub(r"\s+", " ", t).strip()
        cleaned.append(t)
    return {"text": cleaned}

dataset01 = DatasetDict({
    split: ds.map(clean_markdown, batched=True, num_proc=4)
    for split, ds in dataset01.items()
})


Filter:   0%|          | 0/382535 [00:00<?, ? examples/s]

Filter:   0%|          | 0/54648 [00:00<?, ? examples/s]

Filter:   0%|          | 0/109296 [00:00<?, ? examples/s]

Filter:   0%|          | 0/256280 [00:00<?, ? examples/s]

Filter:   0%|          | 0/36611 [00:00<?, ? examples/s]

Filter:   0%|          | 0/73223 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/256266 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/36610 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/73219 [00:00<?, ? examples/s]

In [261]:
# ==== 2) Feature extraction ====
dataset_feats = DatasetDict({
    split: ds.map(extract_metrics_batch, batched=True, num_proc=8)
    for split, ds in dataset01.items()
})
dataset_feats['train'][0]

Map (num_proc=8):   0%|          | 0/256266 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/36610 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/73219 [00:00<?, ? examples/s]

{'id': '41a1cb62-9edd-4aa2-8083-e26165fcc9c2',
 'text': 'In today’s world, technology has woven itself into the very fabric of our daily lives, and its impact on understanding human emotions is both fascinating and concerning. As someone who has grown up with smartphones and social media, I can’t help but wonder how this tech shapes our ability to recognize genuine feelings, especially in educational settings. For instance, tools that analyze facial expressions or tone of voice can be incredibly useful for teachers trying to gauge student engagement. However, I often feel that relying too heavily on these technologies can lead to misunderstandings. Emotions are complex and nuanced; a smile doesn’t always mean happiness, and a frown doesn’t always signify sadness. By placing too much trust in algorithms, we risk oversimplifying the rich tapestry of human emotion. I believe that while technology can aid our understanding, it shouldn’t replace our instinctual ability to connect with other

In [262]:
# ==== 3) Prepare arrays ====
def to_xy(ds):
    X = np.column_stack([np.array(ds[f], dtype=np.float32) for f in FEATURES])
    y = np.array(ds["label"], dtype=np.int64)
    return X, y

In [263]:
X_train, y_train = to_xy(dataset_feats["train"])
X_test,  y_test  = to_xy(dataset_feats["test"])

print("Train size:", X_train.shape, " Test size:", X_test.shape)
print("Class balance (train):", {c:int((y_train==c).sum()) for c in [0,1]})
print("Class balance (test):",  {c:int((y_test==c).sum()) for c in [0,1]})

Train size: (256266, 12)  Test size: (73219, 12)
Class balance (train): {0: 104303, 1: 151963}
Class balance (test): {0: 29801, 1: 43418}


## Models

In [271]:
def compute_metrics(y_true, y_pred, y_proba):
    f1 = f1_score(y_true, y_pred)
    auroc = roc_auc_score(y_true, y_proba)

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    fpr = fp / (fp + tn) if (fp + tn) else 0.0
    tpr = tp / (tp + fn) if (tp + fn) else 0.0

    return {"F1": f1, "AUROC": auroc, "FPR": fpr, "TPR": tpr}

In [272]:
results = {}

In [273]:
# 1) Linear Regression + threshold 0.5
linreg = Pipeline([
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("lr", LinearRegression())
])
linreg.fit(X_train, y_train)
linreg_scores = linreg.predict(X_test)
linreg_proba = np.clip(linreg_scores, 0, 1)
linreg_pred = (linreg_scores >= 0.5).astype(int)
results["LinearRegression"] = compute_metrics(y_test, linreg_pred, linreg_proba)

In [274]:
# 2) XGBoost
xgb = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
    eval_metric="logloss",
    objective="binary:logistic",
)
xgb.fit(X_train, y_train)
xgb_proba = xgb.predict_proba(X_test)[:, 1]
xgb_pred = (xgb_proba >= 0.5).astype(int)
results["XGBoost"] = compute_metrics(y_test, xgb_pred, xgb_proba)

In [275]:
# 3) LightGBM
lgbm = LGBMClassifier(
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
    objective="binary",
)

lgbm.fit(X_train, y_train)
lgbm_proba = lgbm.predict_proba(X_test)[:, 1]
lgbm_pred = (lgbm_proba >= 0.5).astype(int)
results["LightGBM"] = compute_metrics(y_test, lgbm_pred, lgbm_proba)

[LightGBM] [Info] Number of positive: 151963, number of negative: 104303
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2854
[LightGBM] [Info] Number of data points in the train set: 256266, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.592989 -> initscore=0.376337
[LightGBM] [Info] Start training from score 0.376337




In [277]:
from pprint import pprint
pprint(results)

{'LightGBM': {'AUROC': np.float64(0.9788015728741681),
              'F1': 0.9354409204192414,
              'FPR': np.float64(0.11258011476124963),
              'TPR': np.float64(0.9466120042378737)},
 'LinearRegression': {'AUROC': np.float64(0.8640460041397114),
                      'F1': 0.8517433313394097,
                      'FPR': np.float64(0.4432737156471259),
                      'TPR': np.float64(0.9674558938689023)},
 'XGBoost': {'AUROC': np.float64(0.9761205758976311),
             'F1': 0.9313982670709411,
             'FPR': np.float64(0.12187510486225295),
             'TPR': np.float64(0.9445160993136488)}}
