<a href="https://colab.research.google.com/github/SergeiLab/fraud-detection/blob/main/fraud-detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import average_precision_score, f1_score
import numpy as np

# Load the labelled training data and the public test set
train = pd.read_csv('train.csv')
test = pd.read_csv('test_public.csv')

# Parse timestamps in both frames. Only the training frame is re-ordered
# chronologically; the test frame keeps its original row order.
train = (
    train
    .assign(tx_datetime=pd.to_datetime(train['tx_datetime']))
    .sort_values('tx_datetime')
    .reset_index(drop=True)
)
test = test.assign(tx_datetime=pd.to_datetime(test['tx_datetime']))

# risk_profile -> integer codes (labels missing from the dict become NaN)
risk_map = {'low': 0, 'mid': 1, 'high': 2}
train = train.assign(risk_profile=train['risk_profile'].map(risk_map))
test = test.assign(risk_profile=test['risk_profile'].map(risk_map))

# === Leakage-safe feature generation ===
def safe_feature_engineering(df, amount_q90=None):
    """Return a copy of ``df`` with three binary (0/1) flag columns added.

    Added columns:
      * ``high_ratio_flag``  - ``ratio_to_avg`` is greater than 3.
      * ``very_high_amount`` - ``amount`` exceeds 5 times ``avg_amount_7d``.
      * ``suspicious_combo`` - ``is_night == 1`` and ``is_international == 1``
        and ``amount`` is greater than ``amount_q90``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ratio_to_avg``, ``amount``,
        ``avg_amount_7d``, ``is_night`` and ``is_international``.
        The input frame is not modified.
    amount_q90 : float, optional
        Amount cut-off used by ``suspicious_combo``. Pass the value computed
        on the training data when transforming other frames, so that every
        frame is flagged against the same threshold. When omitted, the 0.9
        quantile of ``df['amount']`` itself is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three flag columns appended.
    """
    df = df.copy()
    if amount_q90 is None:
        amount_q90 = df['amount'].quantile(0.9)

    df['high_ratio_flag'] = (df['ratio_to_avg'] > 3).astype(int)
    df['very_high_amount'] = (df['amount'] > df['avg_amount_7d'] * 5).astype(int)
    df['suspicious_combo'] = (
        (df['is_night'] == 1) &
        (df['is_international'] == 1) &
        (df['amount'] > amount_q90)
    ).astype(int)
    return df


# Compute the amount cut-off once on the training frame and reuse it for the
# test frame; otherwise the same flag would mean different things in each.
train_amount_q90 = train['amount'].quantile(0.9)
train = safe_feature_engineering(train, amount_q90=train_amount_q90)
test = safe_feature_engineering(test, amount_q90=train_amount_q90)

# === Chronological split ===
# The last 20% of rows (the most recent ones, since `train` was sorted by
# tx_datetime above) are held out for validation.
val_size = int(0.2 * len(train))
if val_size == 0:
    # With a negative-offset slice, `-0` would silently produce an empty
    # training part and put every row into validation.
    raise ValueError(
        f"Not enough rows for a 20% validation split: len(train)={len(train)}"
    )
split_idx = len(train) - val_size
train_part = train.iloc[:split_idx]
val_part = train.iloc[split_idx:]

# Feature matrices and targets
drop_cols = ['tx_id', 'customer_id', 'tx_datetime', 'fraud']
X_train = train_part.drop(columns=drop_cols)
y_train = train_part['fraud']
X_val = val_part.drop(columns=drop_cols)
y_val = val_part['fraud']
# Select test features by the training column list so that names and order
# are guaranteed to match what the model is fitted on (a missing column
# raises KeyError here instead of failing later at predict time).
X_test = test[X_train.columns]

# Columns passed to CatBoost as categorical features.
# NOTE(review): risk_profile is mapped to integer codes earlier in this cell
# but is still listed as categorical here - confirm this is intended.
cat_features = ['risk_profile', 'channel', 'mcc', 'country', 'home_country']

# Model hyper-parameters
catboost_params = dict(
    iterations=1000,
    learning_rate=0.03,
    depth=6,
    eval_metric='PRAUC',
    auto_class_weights='Balanced',
    random_seed=42,
    early_stopping_rounds=50,
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

# Training: the validation part is the eval set for early stopping and
# best-iteration selection.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    use_best_model=True,
)

# === Decision threshold (chosen on the validation part) ===
# The threshold is NOT the default 0.5: it is the given percentile of the
# scores the model assigns to true fraud rows in validation, i.e. a
# deliberately low, recall-oriented cut-off that also catches weak signals.
FRAUD_SCORE_PERCENTILE = 20  # alternatives tried by the author: 10, 15
FALLBACK_THRESHOLD = 0.1     # used when validation contains no fraud rows

y_val_proba = model.predict_proba(X_val)[:, 1]
fraud_proba = y_val_proba[y_val.to_numpy() == 1]

if len(fraud_proba) > 0:
    best_thr = np.percentile(fraud_proba, FRAUD_SCORE_PERCENTILE)
else:
    best_thr = FALLBACK_THRESHOLD

print(f"Using threshold: {best_thr:.4f}")

# Report how the chosen cut-off performs on validation, so its quality is
# known before it is applied to the test set.
if len(fraud_proba) > 0:
    y_val_pred = (y_val_proba >= best_thr).astype(int)
    print(f"Validation PR-AUC: {average_precision_score(y_val, y_val_proba):.4f}")
    print(f"Validation F1 at threshold: {f1_score(y_val, y_val_pred):.4f}")

# Prediction on the test set
y_test_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_test_proba >= best_thr).astype(int)

# Save the submission file
pd.DataFrame({'tx_id': test['tx_id'], 'fraud': y_pred}).to_csv('submission_.csv', index=False)

0:	learn: 0.7294076	test: 0.6380773	best: 0.6380773 (0)	total: 24.2ms	remaining: 24.2s
100:	learn: 0.8219848	test: 0.7384247	best: 0.7392419 (96)	total: 1.61s	remaining: 14.3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7404435149
bestIteration = 127

Shrink model to first 128 iterations.
Using threshold: 0.4329
