Предразбивка данных по группам на основании похожести

In [None]:

import ast
import re
from collections import defaultdict
import numpy as np
import pandas as pd


# --- Input / split configuration -------------------------------------------
CSV_PATH = "../datasets/train_raw.csv"  # raw dataset with `sample` and `annotation` columns
SEP = ";"                               # column separator used for reading and writing
VAL_RATIO = 0.2                         # target share of rows in the validation part
RANDOM_STATE = 42                       # seed for the group shuffle


# --- TYPE-term clustering configuration -------------------------------------
NGRAM_N = 3          # character n-gram size used to shingle TYPE terms
JACCARD_THR = 0.69   # minimum Jaccard similarity of n-gram sets to merge two terms
MIN_SHARED = 3       # minimum number of shared n-grams for a pair to be considered at all
MIN_LEN = 2          # terms shorter than this are not clustered
USE_PREFIX = True    # also merge terms when one is a prefix of the other
MIN_PREFIX = 3       # minimum length of the shorter term for the prefix rule

# --- Output ---------------------------------------------------------------
SAVE_SPLIT = True
OUT_TRAIN = "../datasets/train_split.csv"
# Same directory as OUT_TRAIN and CSV_PATH.
OUT_VAL = "../datasets/val_split.csv"


# Load the raw dataset; the processing below reads the `sample` and
# `annotation` columns of every row.
df = pd.read_csv(CSV_PATH, sep=SEP, encoding="utf-8")
if not {"sample", "annotation"}.issubset(df.columns):
    # Raised explicitly (not asserted) so the check is not stripped by `python -O`.
    raise ValueError("Нужны колонки: sample, annotation")


def parse_ann(cell):
    """Parse an annotation cell into a list.

    The cell is evaluated with ``ast.literal_eval``. Anything that fails to
    evaluate, or evaluates to something other than a ``list``, yields an
    empty list.
    """
    try:
        parsed = ast.literal_eval(cell)
    except Exception:
        # Best effort: an unparsable cell is treated as "no annotations".
        return []
    return parsed if isinstance(parsed, list) else []

def strip_bio(tag):
    """Return the label part of a tag, dropping everything up to the first "-".

    Non-string input is mapped to ``"O"``; a tag without a dash (including
    ``"O"`` itself) is returned unchanged.
    """
    if not isinstance(tag, str):
        return "O"
    _prefix, dash, label = tag.partition("-")
    return label if dash else tag

def merge_entities(text, ann_list):
    """Merge consecutive same-label spans and return their text.

    Each item of *ann_list* is unpacked as ``(start, end, tag)``; the tag is
    reduced with ``strip_bio`` and the offsets are converted with ``int``.
    Spans are processed in order of their start offset. A span extends the
    currently open one when it has the same label and starts no later than
    one character after the open span's end; an ``"O"`` span closes the open
    one and is itself dropped.

    Returns a list of ``(text[start:end], label)`` tuples for the merged spans.
    """
    ordered = sorted(
        ((int(start), int(end), strip_bio(tag)) for start, end, tag in ann_list),
        key=lambda span: span[0],
    )

    merged = []
    open_span = None  # [start, end, label] of the run currently being extended

    for start, end, label in ordered:
        if label == "O":
            if open_span is not None:
                merged.append(tuple(open_span))
                open_span = None
            continue

        extends_open = (
            open_span is not None
            and label == open_span[2]
            and start <= open_span[1] + 1
        )
        if extends_open:
            open_span[1] = max(open_span[1], end)
            continue

        if open_span is not None:
            merged.append(tuple(open_span))
        open_span = [start, end, label]

    if open_span is not None:
        merged.append(tuple(open_span))

    return [(text[start:end], label) for start, end, label in merged]

# Character translation applied before filtering: folds "ё" into "е".
rus_to_lat_map = str.maketrans({"ё": "е"})


def normalize_token(t: str) -> str:
    """Normalise a single piece of text.

    Lower-cases *t*, applies ``rus_to_lat_map``, collapses whitespace runs
    to one space, removes every character other than digits, ``a-z``,
    ``а-я``, ``%`` and space, and strips the ends.
    """
    lowered = t.lower().translate(rus_to_lat_map)
    single_spaced = re.sub(r"\s+", " ", lowered)
    kept = re.sub(r"[^0-9a-zа-я% ]+", "", single_spaced)
    return kept.strip()


def normalize_text(s: str) -> str:
    """Normalise *s* token by token and join the result with single spaces.

    Tokens that normalise to an empty string (for example a lone dash) are
    dropped, so the result never contains doubled, leading or trailing
    spaces.
    """
    normalized = (normalize_token(tok) for tok in s.split())
    return " ".join(tok for tok in normalized if tok)


# Per-row TYPE signature (None when the row has no usable TYPE entity) and
# the set of all distinct signatures seen in the dataset.
signature_type = []
all_type_terms = set()

# Iterate the two needed columns directly instead of building a Series per
# row with iterrows(); the row index is not used here.
for sample, annotation in zip(df["sample"], df["annotation"]):
    text = str(sample)
    ents = merge_entities(text, parse_ann(annotation))
    type_pieces = [normalize_text(seg) for seg, lab in ents if lab == "TYPE"]

    # Join all TYPE fragments of the row into one whitespace-normalised
    # string; an empty result (no TYPE entity, or nothing left after
    # normalisation) is recorded as None.
    sig = re.sub(r"\s+", " ", " ".join(type_pieces)).strip()
    if sig:
        signature_type.append(sig)
        all_type_terms.add(sig)
    else:
        signature_type.append(None)

def ngrams(s: str, n: int) -> set:
    """Return the set of character n-grams of *s* wrapped as ``^s$``.

    The result is empty when the wrapped string is shorter than *n*.
    """
    padded = "^" + s + "$"
    window_count = len(padded) - n + 1
    if window_count <= 0:
        return set()
    return {padded[start:start + n] for start in range(window_count)}

def jaccard(a: set, b: set) -> float:
    """Return |a ∩ b| / |a ∪ b|, or 0.0 when the sets share nothing."""
    overlap = len(a & b)
    if not overlap:
        return 0.0
    # |a ∪ b| by inclusion-exclusion.
    return overlap / (len(a) + len(b) - overlap)

class UF:
    """Union-find (disjoint-set) structure over hashable items.

    ``p`` maps every item to its parent (roots map to themselves) and ``r``
    stores the rank used to decide which root is kept by ``u``.
    """

    def __init__(self, xs):
        """Create one singleton set for every element of *xs*.

        *xs* is materialised once, so one-shot iterables such as generators
        populate both ``p`` and ``r``.
        """
        items = list(xs)
        self.p = {x: x for x in items}
        self.r = dict.fromkeys(items, 0)

    def f(self, x):
        """Return the root of the set containing *x*.

        Every node visited on the way is re-pointed directly at the root.
        Implemented iteratively so that a long parent chain cannot exhaust
        the recursion limit. Raises ``KeyError`` for an unknown item.
        """
        root = x
        while self.p[root] != root:
            root = self.p[root]
        # Second pass: point every node on the path straight at the root.
        while x != root:
            parent = self.p[x]
            self.p[x] = root
            x = parent
        return root

    def u(self, a, b):
        """Merge the sets of *a* and *b*.

        Returns ``False`` when they were already in the same set, ``True``
        otherwise. The root with the lower rank is attached under the other;
        on equal ranks the root of *a* is kept and its rank is incremented.
        """
        ra, rb = self.f(a), self.f(b)
        if ra == rb:
            return False
        if self.r[ra] < self.r[rb]:
            ra, rb = rb, ra
        self.p[rb] = ra
        if self.r[ra] == self.r[rb]:
            self.r[ra] += 1
        return True

# Distinct TYPE signatures long enough to be clustered, in sorted order,
# each mapped to its set of character n-grams.
terms = sorted(term for term in all_type_terms if len(term) >= MIN_LEN)
shing = {term: ngrams(term, NGRAM_N) for term in terms}

# Inverted index: n-gram -> list of terms containing it.
inv = defaultdict(list)
for term, grams in shing.items():
    for gram in grams:
        inv[gram].append(term)

uf = UF(terms)

for term in terms:
    term_grams = shing[term]

    # Count, for every other term, how many n-grams it shares with `term`.
    shared = defaultdict(int)
    for gram in term_grams:
        for neighbour in inv[gram]:
            if neighbour != term:
                shared[neighbour] += 1

    for neighbour, n_shared in shared.items():
        if n_shared < MIN_SHARED:
            continue

        # Prefix rule: one term starts with the other and the shorter one is
        # long enough. When the rule does not apply, fall back to Jaccard.
        is_prefix_pair = USE_PREFIX and (
            term.startswith(neighbour) or neighbour.startswith(term)
        )
        if is_prefix_pair and min(len(term), len(neighbour)) >= MIN_PREFIX:
            uf.u(term, neighbour)
        elif jaccard(term_grams, shing[neighbour]) >= JACCARD_THR:
            uf.u(term, neighbour)


# Map every clustered term to the id of its cluster.
type_cluster_id = {}
root_to_terms = defaultdict(list)
for t in terms:
    root_to_terms[uf.f(t)].append(t)

for members in root_to_terms.values():
    # Name the cluster after its smallest member, not after the union-find
    # root: which member ends up as root depends on the order of unions,
    # and that order follows iteration over sets of strings, which can
    # differ between interpreter runs (string hash randomisation). The
    # smallest member is the same on every run, so the group labels fed to
    # the splitter stay reproducible.
    cid = f"TYPE|{min(members)}"
    for m in members:
        type_cluster_id[m] = cid

# One group label per row: rows without a TYPE signature get their own
# singleton group, rows with a clustered signature share the cluster id,
# and signatures that were not clustered get a label derived from the
# signature itself.
groups = [
    f"NO-TYPE|{row_pos}"
    if sig is None
    else type_cluster_id.get(sig, f"TYPE-UNSEEN|{sig}")
    for row_pos, sig in enumerate(signature_type)
]
no_type_count = sum(1 for sig in signature_type if sig is None)

print(f"Строк без TYPE: {no_type_count} из {len(df)}")

idx_all = np.arange(len(df))
groups_arr = np.array(groups)

# Split row positions so that no group label appears on both sides.
try:
    from sklearn.model_selection import GroupShuffleSplit

    splitter = GroupShuffleSplit(
        n_splits=1, test_size=VAL_RATIO, random_state=RANDOM_STATE
    )
    train_idx, val_idx = next(splitter.split(idx_all, groups=groups_arr))
except Exception as err:
    # NOTE(review): this also catches errors raised by the split itself, not
    # only a missing sklearn — confirm that is intended.
    print("sklearn недоступен, используем резервный сплит. Ошибка:", err)

    # Fallback: shuffle the distinct groups and greedily take whole groups
    # into validation while they still fit into the row budget.
    rng = np.random.default_rng(RANDOM_STATE)
    shuffled_groups = pd.unique(groups_arr)
    rng.shuffle(shuffled_groups)
    group_sizes = pd.Series(groups_arr).value_counts()
    val_budget = int(len(df) * VAL_RATIO)

    chosen = []
    taken = 0
    for name in shuffled_groups:
        size = int(group_sizes[name])
        if taken + size <= val_budget:
            chosen.append(name)
            taken += size
        if taken >= val_budget:
            break

    in_val = np.isin(groups_arr, chosen)
    val_idx = np.where(in_val)[0]
    train_idx = np.where(~in_val)[0]

print(f"Train: {len(train_idx)}  Val: {len(val_idx)}  (val ~ {len(val_idx)/len(df):.3f})")


# Sanity check: collect the cluster id (or the raw signature when the term
# was not clustered) of every row that has a TYPE signature, per side, and
# report how many appear on both sides.
train_types = {
    type_cluster_id.get(signature_type[pos], signature_type[pos])
    for pos in train_idx
    if signature_type[pos] is not None
}
val_types = {
    type_cluster_id.get(signature_type[pos], signature_type[pos])
    for pos in val_idx
    if signature_type[pos] is not None
}

leak = train_types.intersection(val_types)
print(f"Пересечение TYPE-кластеров между train/val: {len(leak)}")


# Optionally write both parts to disk, train first, using the input separator.
if SAVE_SPLIT:
    for out_path, row_positions in ((OUT_TRAIN, train_idx), (OUT_VAL, val_idx)):
        df.iloc[row_positions].to_csv(out_path, sep=SEP, index=False, encoding="utf-8")
    print(f"Сохранено: {OUT_TRAIN} ({len(train_idx)}), {OUT_VAL} ({len(val_idx)})")


Строк без TYPE: 2751 из 27249
Train: 21959  Val: 5290  (val ~ 0.194)
Пересечение TYPE-кластеров между train/val: 0
Сохранено: train_split.csv (21959), val_split.csv (5290)
