In [None]:
from pathlib import Path
import sys

# Current working directory of the Jupyter kernel.
cwd = Path.cwd()

# Assumes CWD == Project/notebooks, so the project root is its parent.
# NOTE(review): launching the kernel from another directory makes this point elsewhere.
project_root = str(cwd.parent)

# Put the project root at the front of sys.path so it takes priority;
# the membership check keeps re-runs of this cell from adding duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Must stay below the sys.path change above.
# Presumably `config` lives in the project root — TODO confirm.
from config import DB_PATH, NOTEBOOKS_DIR 

# Base directory used by later cells to locate notebook-side files.
BASE = NOTEBOOKS_DIR

In [None]:
# !curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
# !sudo apt-get install -y git-lfs
# !git lfs install

In [None]:
import yaml


# Location of the YAML configuration, relative to the notebooks base directory.
cfg_path = BASE / "default.yaml"

# safe_load builds plain Python objects only (no arbitrary object construction).
with open(cfg_path, mode="r", encoding="utf-8") as cfg_file:
    cfg = yaml.safe_load(cfg_file)

In [None]:
import sqlite3, pandas as pd
from contextlib import closing

# Query text comes from the loaded config (cfg["data"]["sql"]).
sql = cfg["data"]["sql"]

# A sqlite3 connection used directly as a context manager only commits or
# rolls back the transaction; it does NOT close the connection. closing()
# guarantees the handle is released once the frame has been read.
with closing(sqlite3.connect(DB_PATH)) as conn:
    df = pd.read_sql(sql, conn)

print(df.head())

In [None]:
import ast, json


def _parse_maybe_list(x : str) -> list:
    if x is None:
        return []
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        s = x.strip()
        if not s:
            return []
        # JSON list
        try:
            v = json.loads(s)
            if isinstance(v, list):
                return v
        except Exception:
            pass
        # Python literal list (безопаснее, чем eval)
        try:
            v = ast.literal_eval(s)
            if isinstance(v, list):
                return v
        except Exception:
            pass
        # fallback: whitespace tokens
        return s.split()
    # fallback: try iter
    try:
        return list(x)
    except Exception:
        return []
    
def _identity(x):
    return x

In [None]:
import numpy as np


def _tokens_col_to_text(X : np.ndarray) -> np.ndarray:
    """Turn a column of token collections into a 1-D object array of strings.

    The input is flattened with ``reshape(-1)``, each cell is parsed with
    ``_parse_maybe_list`` and its items are joined by single spaces after
    ``str()`` conversion.
    """
    flat = np.asarray(X).reshape(-1)
    joined = []
    for cell in flat:
        tokens = _parse_maybe_list(cell)
        joined.append(" ".join(str(tok) for tok in tokens))
    return np.asarray(joined, dtype=object)

def _numeric_col_to_matrix(X : np.ndarray) -> np.ndarray:
    """Parse a ctx_numeric column into a 2-D float matrix, one row per cell.

    Each cell is parsed with ``_parse_maybe_list`` and converted to a float
    array; all rows must have the same length.

    Raises:
        ValueError: if the parsed rows do not all share a single length.
    """
    flat = np.asarray(X).reshape(-1)

    vectors = []
    for cell in flat:
        vectors.append(np.asarray(_parse_maybe_list(cell), dtype=float))

    # Fail loudly on ragged input instead of letting vstack guess.
    lengths = set()
    for vec in vectors:
        lengths.add(vec.shape[0])
    if len(lengths) != 1:
        raise ValueError(f"ctx_numeric has varying lengths: {sorted(lengths)}")

    return np.vstack(vectors)


# Name -> callable registries; build_text_tokens looks up the "preprocessor"
# and "tokenizer" config values here. Only the no-op "identity" is registered.
PREPROCESSORS = dict(identity=_identity)
TOKENIZERS = dict(identity=_identity)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer


def build_text_tokens(field_cfg : dict, *, lowercase : bool, naive_bayes_compatible : bool) -> Pipeline:
    """Build the vectorizing pipeline for one token/text field.

    Args:
        field_cfg: Field configuration. Keys read here: "mode", "ngram_range",
            "token_pattern", "tokenizer", "preprocessor", plus a sub-mapping
            named after the selected mode holding extra vectorizer kwargs.
        lowercase: Forwarded to the vectorizer's ``lowercase`` argument.
        naive_bayes_compatible: When true, a HashingVectorizer is forced to
            ``alternate_sign=False`` so it does not produce negative values.

    Returns:
        A Pipeline ending in a "vect" step, preceded by a "to_text" step when
        the tokenizer or preprocessor is "identity" or no token pattern is set.

    Raises:
        KeyError: a required config key or registry name is missing.
        ValueError: ``mode`` is not one of the supported vectorizers.
    """
    mode = field_cfg["mode"]
    ngram_range = tuple(field_cfg["ngram_range"])
    token_pattern = field_cfg["token_pattern"]

    tokenizer_name = field_cfg["tokenizer"]
    preprocessor_name = field_cfg["preprocessor"]

    # Arguments shared by every supported vectorizer class.
    # NOTE(review): with tokenizer == identity the vectorizer is handed the
    # space-joined string from "to_text" as its token sequence — confirm that
    # this yields the intended features rather than per-character ones.
    shared_kwargs = dict(
        lowercase=lowercase,
        ngram_range=ngram_range,
        preprocessor=PREPROCESSORS[preprocessor_name],
        tokenizer=TOKENIZERS[tokenizer_name],
        token_pattern=token_pattern,
    )

    supported = (
        ("CountVectorizer", CountVectorizer),
        ("HashingVectorizer", HashingVectorizer),
        ("TfidfVectorizer", TfidfVectorizer),
    )
    match = next((pair for pair in supported if mode == pair[0]), None)
    if match is None:
        raise ValueError(f"Unsupported mode: {mode}")
    mode_key, vect_cls = match

    # Mode-specific kwargs are copied so the caller's config is never mutated.
    extra_kwargs = dict(field_cfg[mode_key])
    if vect_cls is HashingVectorizer and naive_bayes_compatible:
        extra_kwargs["alternate_sign"] = False  # avoid negative feature values
    vect = vect_cls(**shared_kwargs, **extra_kwargs)

    steps = []
    if tokenizer_name == "identity" or preprocessor_name == "identity" or token_pattern is None:
        # Column holds token collections: join them into text first.
        steps.append(("to_text", FunctionTransformer(_tokens_col_to_text, validate=False)))
    steps.append(("vect", vect))
    return Pipeline(steps)


def build_numeric_dict(field_cfg : dict) -> Pipeline:
    """Build the pipeline that vectorizes and scales the numeric-context field.

    Args:
        field_cfg: Field configuration. Keys read here: "mode", "scale",
            ``["vectorizers"]["DictVectorizer"]`` (kwargs for DictVectorizer)
            and ``["scaling"]["StandardScaler"]`` (kwargs for StandardScaler).

    Returns:
        A Pipeline with the steps "select" -> "dv" -> "scale".

    Raises:
        KeyError: a required config key is missing.
        ValueError: ``mode`` is not "DictVectorizer" or ``scale`` is not
            "StandardScaler".
    """
    mode = field_cfg["mode"]
    scale = field_cfg["scale"]

    if mode == "DictVectorizer":
        dv_params = dict(field_cfg["vectorizers"]["DictVectorizer"])
        dv = DictVectorizer(**dv_params)
    else:
        raise ValueError(f"Unsupported mode: {mode}")

    # NOTE(review): DictVectorizer yields sparse output unless configured
    # otherwise, and StandardScaler cannot centre sparse input — confirm the
    # config sets sparse=False or with_mean=False.
    if scale == "StandardScaler":
        sc_params = dict(field_cfg["scaling"]["StandardScaler"])
        sc = StandardScaler(**sc_params)
    else:
        raise ValueError(f"Unsupported scaling: {scale}")

    ctx_vec = Pipeline([
            # The builtin `list` does the same job as `lambda s: list(s)` but,
            # unlike a lambda, keeps the fitted pipeline picklable.
            # Not needed if the input is already a list[dict].
            ("select", FunctionTransformer(list, validate=False)),
            ("dv", dv),
            ("scale", sc),
        ])
    return ctx_vec

In [None]:
from sklearn.compose import ColumnTransformer


def build_vectorizer_from_cfg(cfg : dict, *, col_map : dict[str, str] | None = None) -> ColumnTransformer:
    """Assemble the ColumnTransformer described by ``cfg["features"]``.

    Args:
        cfg: Loaded configuration. Keys read here, all under "features":
            "lowercase", "naive_bayes_compatible", "for_ctx_tokens",
            "for_error_text" and "for_ctx_numeric".
        col_map: Optional mapping from feature key ("for_ctx_numeric",
            "for_ctx_tokens", "for_error_text") to the DataFrame column name.
            Missing keys fall back to the defaults below.

    Returns:
        A ColumnTransformer with the transformers "source_code", "error_text"
        and "ctx_numeric"; all other columns are dropped.

    Raises:
        KeyError: a required config key is missing.
        ValueError: propagated from the field builders on unsupported modes.
    """
    features = cfg["features"]
    lowercase = bool(features["lowercase"])
    nb_ok = bool(features["naive_bayes_compatible"])

    # Default column names, matching the SQL SELECT used to load the data.
    default_cols = {
        "for_ctx_numeric": "ctx_numeric",
        "for_ctx_tokens": "ctx_tokens",
        "for_error_text": "error_text_tokens",
    }
    # Merge so a partial override keeps the remaining defaults.
    col_map = {**default_cols, **(col_map or {})}

    # Every entry must be a (name, transformer, columns) triple.
    tr = [
        (
            "source_code",
            build_text_tokens(features["for_ctx_tokens"], lowercase=lowercase, naive_bayes_compatible=nb_ok),
            col_map["for_ctx_tokens"],
        ),
        (
            "error_text",
            build_text_tokens(features["for_error_text"], lowercase=lowercase, naive_bayes_compatible=nb_ok),
            col_map["for_error_text"],
        ),
        (
            "ctx_numeric",
            build_numeric_dict(features["for_ctx_numeric"]),
            # The original tuple omitted this column selector.
            col_map["for_ctx_numeric"],
        ),
    ]

    return ColumnTransformer(transformers=tr, remainder="drop")

In [None]:
import yaml, sqlite3, pandas as pd
from contextlib import closing


# NOTE(review): this cell rebinds `cfg` and `df` from hard-coded relative
# paths, while earlier cells read BASE / "default.yaml" and DB_PATH —
# confirm which locations are intended.
with open("configs/default.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# A sqlite3 connection used directly as a context manager only commits or
# rolls back; closing() makes sure the connection itself is released.
with closing(sqlite3.connect("db/app.db")) as conn:
    df = pd.read_sql(cfg["data"]["sql"], conn)

# Integer target vector taken from the "label" column.
y = df["label"].astype(int).to_numpy()

# Build the feature transformer from the config and fit it on the full frame.
feat = build_vectorizer_from_cfg(cfg)
X = feat.fit_transform(df)

print(X.shape, y.shape)