In [None]:
import os
import ast
from collections import Counter
from datetime import timedelta
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, f1_score, classification_report, roc_curve
from catboost import CatBoostClassifier

In [None]:
def safe_read_csv(path):
    """Load a CSV file into a DataFrame, failing early when the file is absent.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv(path)`` produces with default options.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``path``.
    """
    if os.path.exists(path):
        return pd.read_csv(path)
    raise FileNotFoundError(f"File not found: {path}")

In [None]:
def parse_categories_raw(cat_value):
    """Normalise a raw category value into a list of non-empty string tokens.

    Handled inputs, checked in this order:

    * ``list`` / ``tuple`` / ``set`` -- every element is stringified and stripped;
    * missing scalar (``None`` / ``NaN`` / ``NaT``) -- returns ``[]``;
    * a string shaped like a Python list literal (``"['4', '28']"``) -- parsed
      with ``ast.literal_eval``;
    * a string containing ``|``, ``,`` or ``;`` -- split on the first of these
      separators (in that priority order) that occurs in the string;
    * anything else -- a single-token list, or ``[]`` if it is blank.

    Parameters
    ----------
    cat_value : object
        Raw cell value from the categories column.

    Returns
    -------
    list of str
        Stripped, non-empty tokens in their original order.
    """
    def _clean(items):
        # Stringify + strip once per element, then drop blanks.
        stripped = (str(item).strip() for item in items)
        return [token for token in stripped if token != '']

    # Containers must be handled BEFORE the missing-value check: pd.isna() on
    # a list returns an element-wise array, and using that array in an `if`
    # raises "truth value of an array is ambiguous".
    if isinstance(cat_value, (list, tuple, set)):
        return _clean(cat_value)
    if pd.isna(cat_value):
        return []

    s = str(cat_value).strip()
    if s.startswith('[') and s.endswith(']'):
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal: deliberately fall through to separator
            # splitting below instead of failing.
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            return _clean(parsed)

    for sep in ('|', ',', ';'):
        if sep in s:
            return [part.strip() for part in s.split(sep) if part.strip() != '']
    return [s] if s != '' else []

In [None]:
def build_purchase_aggregates(purchases_df, snapshot_date=None):
    """Collapse raw purchase rows into one feature row per ``client_id``.

    Parameters
    ----------
    purchases_df : pandas.DataFrame
        Must contain the columns ``date``, ``price``, ``quantity``,
        ``category_ids`` and ``client_id``. The input frame is not modified.
    snapshot_date : date-like, optional
        Reference date for the recency / age features. When ``None`` it is
        set to one day after the latest parsed purchase date.

    Returns
    -------
    pandas.DataFrame
        One row per client with the columns ``client_id``, ``total_amount``,
        ``total_quantity``, ``avg_price``, ``first_purchase``,
        ``last_purchase``, ``n_transactions`` (number of distinct purchase
        dates), ``recency_days``, ``customer_age_days``,
        ``avg_amount_per_tx``, ``unique_category_count``, ``top_category``,
        ``top_category_count`` and ``top_3_categories`` (``|``-joined).

    Raises
    ------
    KeyError
        If any required column is missing from ``purchases_df``.
    """
    for col in ['date', 'price', 'quantity', 'category_ids', 'client_id']:
        if col not in purchases_df.columns:
            raise KeyError(f"Purchases missing expected column: {col}")

    df = purchases_df.copy()
    # Unparseable dates become NaT; unparseable prices/quantities count as 0.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['price'] = pd.to_numeric(df['price'], errors='coerce').fillna(0.0)
    df['quantity'] = pd.to_numeric(df['quantity'], errors='coerce').fillna(0.0)
    df['amount'] = df['price'] * df['quantity']

    if snapshot_date is None:
        snapshot_date = df['date'].max() + pd.Timedelta(days=1)
    else:
        snapshot_date = pd.to_datetime(snapshot_date)

    agg = df.groupby('client_id').agg(
        total_amount=('amount', 'sum'),
        total_quantity=('quantity', 'sum'),
        avg_price=('price', 'mean'),
        first_purchase=('date', 'min'),
        last_purchase=('date', 'max'),
        n_transactions=('date', 'nunique')
    ).reset_index()

    agg['recency_days'] = (snapshot_date - agg['last_purchase']).dt.days
    agg['customer_age_days'] = (snapshot_date - agg['first_purchase']).dt.days
    # A client whose dates are all NaT has n_transactions == 0; avoid dividing by zero.
    agg['avg_amount_per_tx'] = agg['total_amount'] / agg['n_transactions'].replace(0, np.nan)

    # Per-client category statistics.
    df['category_tokens'] = df['category_ids'].apply(parse_categories_raw)
    tokens_by_client = df.groupby('client_id')['category_tokens'].agg(list)

    cat_feature_cols = ['unique_category_count', 'top_category', 'top_category_count', 'top_3_categories']
    rows = []
    for token_lists in tokens_by_client:
        cnt = Counter()
        for tokens in token_lists:
            cnt.update(tokens)
        # Ties keep first-seen order, so ranked[0] is the same element that
        # most_common(1) would return.
        ranked = cnt.most_common(3)
        if ranked:
            top_cat, top_cnt = ranked[0]
        else:
            top_cat, top_cnt = None, 0
        rows.append({
            'unique_category_count': len(cnt),
            'top_category': top_cat,
            'top_category_count': top_cnt,
            'top_3_categories': '|'.join(cat for cat, _ in ranked)
        })
    # Explicit columns + the grouped index keep the frame mergeable even when
    # there are no client groups at all (a bare pd.DataFrame([]) has no
    # 'client_id' column and the merge below would raise KeyError).
    cat_df = pd.DataFrame(rows, columns=cat_feature_cols, index=tokens_by_client.index).reset_index()

    agg = agg.merge(cat_df, on='client_id', how='left')

    agg['unique_category_count'] = agg['unique_category_count'].fillna(0).astype(int)
    agg['avg_price'] = agg['avg_price'].fillna(0.0)
    agg['avg_amount_per_tx'] = agg['avg_amount_per_tx'].fillna(0.0)
    # Clients without a usable purchase date get a recency one day longer
    # than the full span covered by the data.
    agg['recency_days'] = agg['recency_days'].fillna((snapshot_date - df['date'].min()).days + 1)

    return agg

In [None]:
def assemble_features(purchase_agg, message_agg, target_df):
    """Join target rows with purchase and message aggregates and derive extras.

    Both aggregate frames are left-joined onto ``target_df`` by ``client_id``,
    so every target row is kept. Three columns are then added:

    * ``avg_amount_per_cat`` -- ``avg_amount_per_tx`` divided by
      ``unique_category_count`` (a count of 0 is treated as 1);
    * ``frequency`` -- ``n_transactions`` with missing values as 0;
    * ``ever_bought`` -- 1 when ``first_purchase`` is present, else 0.

    ``top_category`` is turned into a string column with ``'missing'`` for
    absent values (and created as all-``'missing'`` if the column does not
    exist). Finally every numeric column has its NaNs replaced by 0.0.

    Returns
    -------
    pandas.DataFrame
        The merged and enriched feature table.
    """
    merged = (
        target_df
        .merge(purchase_agg, on='client_id', how='left')
        .merge(message_agg, on='client_id', how='left')
    )

    category_divisor = merged['unique_category_count'].replace(0, 1)
    merged['avg_amount_per_cat'] = merged['avg_amount_per_tx'] / category_divisor
    merged['frequency'] = merged['n_transactions'].fillna(0)
    merged['ever_bought'] = merged['first_purchase'].notna().astype(int)

    if 'top_category' not in merged.columns:
        merged['top_category'] = 'missing'
    else:
        merged['top_category'] = merged['top_category'].fillna('missing').astype(str)

    numeric_columns = list(merged.select_dtypes(include=[np.number]).columns)
    merged[numeric_columns] = merged[numeric_columns].fillna(0.0)

    return merged

In [None]:
def preprocess_for_model(X_train, X_test):
    """Impute numeric columns and normalise categorical columns for modelling.

    Column roles are decided from ``X_train`` only: numeric dtypes form the
    numeric set, ``object`` / ``category`` dtypes form the categorical set.
    Columns of any other dtype are not carried into the output.

    Numeric columns are median-imputed with a ``SimpleImputer`` fitted on the
    training split and applied to both splits. Categorical columns get
    ``'missing'`` for absent values and are cast to ``str``.

    Returns
    -------
    tuple
        ``(X_train_prep, X_test_prep, num_cols, cat_cols, num_imp)`` where the
        prepared frames hold the numeric columns followed by the categorical
        ones and ``num_imp`` is the fitted imputer.
    """
    num_cols = list(X_train.select_dtypes(include=[np.number]).columns)
    cat_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

    # Fit on train only, then reuse the learned medians for the test split.
    num_imp = SimpleImputer(strategy='median')
    train_numeric = num_imp.fit_transform(X_train[num_cols])
    test_numeric = num_imp.transform(X_test[num_cols])

    prepared = []
    for source, numeric_values in ((X_train, train_numeric), (X_test, test_numeric)):
        numeric_part = pd.DataFrame(numeric_values, columns=num_cols, index=source.index)
        categorical_part = source[cat_cols].fillna('missing').astype(str).copy()
        prepared.append(pd.concat([numeric_part, categorical_part], axis=1))

    X_train_prep, X_test_prep = prepared
    return X_train_prep, X_test_prep, num_cols, cat_cols, num_imp

In [None]:
def find_best_threshold(y_true, y_proba):
    """Scan probability cut-offs and return the one with the highest F1.

    99 evenly spaced thresholds from 0.01 to 0.99 are tried; a sample is
    labelled positive when ``y_proba >= threshold``. On ties the lowest
    threshold wins because only a strictly better score replaces the best.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels.
    y_proba : numpy.ndarray or pandas.Series
        Predicted positive-class probabilities (must support ``>=`` against a
        scalar and ``.astype``).

    Returns
    -------
    tuple
        ``(best_threshold, best_f1)``.
    """
    best_t, best_f1 = 0.5, -1
    for candidate in np.linspace(0.01, 0.99, 99):
        predicted = (y_proba >= candidate).astype(int)
        score = f1_score(y_true, predicted)
        if not score > best_f1:
            continue
        best_t, best_f1 = candidate, score
    return best_t, best_f1

In [None]:
def train_and_evaluate(purchases_path, messages_path, target_path, outdir='output', snapshot_date=None):
    """Run the whole pipeline: load data, build features, tune CatBoost, evaluate, save.

    Steps:

    1. Write ``README.md`` and ``requirements.txt`` into ``outdir``.
    2. Read the three CSV files and build purchase / message aggregates.
    3. Assemble the feature table and make a stratified 80/20 train/test split.
    4. Grid-search a ``CatBoostClassifier`` (3-fold stratified CV, ROC AUC).
    5. Score the best model on the test split, pick an F1-optimal threshold
       and write the artifacts ``model_catboost.joblib``,
       ``num_imputer.joblib``, ``meta.joblib``, ``feature_importances.csv``
       (best effort), ``metrics.txt`` and ``roc_curve.png``.

    Parameters
    ----------
    purchases_path, messages_path, target_path : str
        Paths of the input CSV files.
    outdir : str, default 'output'
        Directory for all artifacts; created if missing.
    snapshot_date : date-like, optional
        Forwarded to ``build_purchase_aggregates``.

    Returns
    -------
    tuple
        ``(best_estimator, meta)`` where ``meta`` holds the column lists, the
        feature order, the best grid parameters and the chosen threshold.

    Raises
    ------
    FileNotFoundError
        If any input CSV is missing.
    ValueError
        If the training split contains no positive examples.
    """
    os.makedirs(outdir, exist_ok=True)
    readme = """# Проект: предсказание покупки в течение 90 дней

Инструкция:
1. Подготовьте CSV-файлы apparel-purchases.csv, apparel-messages.csv, apparel-target_binary.csv
2. Запустите: `python apparel_purchase_pipeline.py --purchases apparel-purchases.csv --messages apparel-messages.csv --target apparel-target_binary.csv`
3. Смотрите результаты в папке output: model_catboost.joblib, num_imputer.joblib, meta.joblib, feature_importances.csv, metrics.txt, roc_curve.png


"""
    # catboost is listed because the model trained below is a CatBoostClassifier.
    req = """
pandas
numpy
scikit-learn
catboost
joblib
matplotlib
"""
    with open(os.path.join(outdir, 'README.md'), 'w', encoding='utf-8') as f:
        f.write(readme)
    with open(os.path.join(outdir, 'requirements.txt'), 'w', encoding='utf-8') as f:
        f.write(req)

    purchases = safe_read_csv(purchases_path)
    messages = safe_read_csv(messages_path)
    target = safe_read_csv(target_path)

    print("Building purchase aggregates...")
    purchase_agg = build_purchase_aggregates(purchases, snapshot_date=snapshot_date)

    print("Building message aggregates...")
    # NOTE(review): build_message_aggregates must be defined in an earlier
    # cell before this function is called -- confirm it exists in the notebook.
    message_agg = build_message_aggregates(messages)

    print("Assembling features...")
    df = assemble_features(purchase_agg, message_agg, target)

    # Raw timestamps are dropped; their information is already carried by the
    # recency / age features.
    drop_cols = ['first_purchase', 'last_purchase']
    X = df.drop(columns=['client_id', 'target'] + [c for c in drop_cols if c in df.columns], errors='ignore')
    y = df['target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train_prep, X_test_prep, num_cols, cat_cols, num_imputer = preprocess_for_model(X_train, X_test)

    print("Num features:", len(num_cols), "Cat features:", len(cat_cols))

    # Weight the positive class by the (rounded) negative/positive ratio.
    pos = y_train.sum()
    neg = len(y_train) - pos
    if pos == 0:
        raise ValueError("No positive examples in training set!")
    weight_ratio = max(1.0, int(round(neg / pos)))
    class_weights = [1, weight_ratio]
    print("Using class_weights:", class_weights)

    cbc = CatBoostClassifier(
        eval_metric='AUC',
        loss_function='Logloss',
        random_seed=42,
        verbose=0,
        class_weights=class_weights
    )

    param_grid = {
        'depth': [4, 6],
        'learning_rate': [0.03, 0.05],
        'iterations': [300, 500]
    }
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    grid = GridSearchCV(estimator=cbc, param_grid=param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)

    # The model is fitted on a plain array, so categorical columns are
    # identified by position rather than by name.
    feature_names = X_train_prep.columns.tolist()
    cat_indices = [feature_names.index(c) for c in cat_cols]
    fit_params = {'cat_features': cat_indices} if cat_indices else {}
    grid.fit(X_train_prep.values, y_train.values, **fit_params)

    best = grid.best_estimator_
    print("Best params:", grid.best_params_)

    y_proba = best.predict_proba(X_test_prep.values)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    # NOTE(review): the threshold is tuned on the same test split it is then
    # reported on, so the F1 / classification report below are optimistic.
    best_t, best_f1 = find_best_threshold(y_test.values, y_proba)
    y_pred = (y_proba >= best_t).astype(int)
    report = classification_report(y_test, y_pred)

    print(f"Test ROC AUC: {auc:.4f}")
    print("Best threshold (by F1 on test):", best_t, "best_f1:", best_f1)
    print("Classification report (threshold chosen):\n", report)

    joblib.dump(best, os.path.join(outdir, 'model_catboost.joblib'))
    joblib.dump(num_imputer, os.path.join(outdir, 'num_imputer.joblib'))

    meta = {
        'num_cols': num_cols,
        'cat_cols': cat_cols,
        'features': feature_names,
        'best_params': grid.best_params_,
        'best_threshold': float(best_t)
    }
    joblib.dump(meta, os.path.join(outdir, 'meta.joblib'))

    try:
        fi = best.get_feature_importance(prettified=False)
        fi_df = pd.DataFrame({'feature': feature_names, 'importance': fi})
        fi_df.sort_values('importance', ascending=False).to_csv(os.path.join(outdir, 'feature_importances.csv'), index=False)
    except Exception as exc:
        # Feature importances are a best-effort extra: keep the run alive,
        # but make the failure visible instead of swallowing it silently.
        print(f"Warning: could not save feature importances: {exc}")

    with open(os.path.join(outdir, 'metrics.txt'), 'w', encoding='utf-8') as f:
        f.write(f"roc_auc: {auc}\n")
        f.write(f"best_threshold: {best_t}\n")
        f.write(f"best_f1: {best_f1}\n\n")
        f.write("classification_report:\n")
        f.write(report)

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    fig, ax = plt.subplots()
    try:
        ax.plot(fpr, tpr)
        ax.set_xlabel('FPR')
        ax.set_ylabel('TPR')
        ax.set_title(f'ROC AUC = {auc:.4f}')
        ax.grid(True)
        fig.savefig(os.path.join(outdir, 'roc_curve.png'))
    finally:
        # Always release the figure, even if saving fails.
        plt.close(fig)

    print("Artifacts saved to", outdir)
    return best, meta

In [None]:
if __name__ == "__main__":
    # Default input files and output folder, resolved against the current
    # working directory.
    purchases_file, messages_file, target_file = (
        "apparel-purchases.csv",
        "apparel-messages.csv",
        "apparel-target_binary.csv",
    )
    output_dir = "output"
    # None is passed through unchanged; train_and_evaluate forwards it to
    # build_purchase_aggregates.
    snapshot_date = None

    train_and_evaluate(
        purchases_path=purchases_file,
        messages_path=messages_file,
        target_path=target_file,
        outdir=output_dir,
        snapshot_date=snapshot_date,
    )