In [6]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.ensemble import GradientBoostingClassifier

# 1) Beolvasás
df = pd.read_csv("train.csv", low_memory=False)

# 2) Célváltozó szűrés
df = df[df['compliance'].isin([0, 1])].copy()
y = df['compliance'].astype(int)

# 3) Alap feature engineering függvény
def make_features(X):
    X = X.copy()
    # Dátumok parse
    for c in ['ticket_issued_date', 'hearing_date', 'payment_date']:
        if c in X.columns:
            X[c] = pd.to_datetime(X[c], errors='coerce')
    # Időjellemzők a kiadás és meghallgatás alapján
    X['issue_year'] = X['ticket_issued_date'].dt.year
    X['issue_month'] = X['ticket_issued_date'].dt.month
    X['issue_dow'] = X['ticket_issued_date'].dt.dayofweek
    X['hearing_year'] = X['hearing_date'].dt.year
    X['hearing_month'] = X['hearing_date'].dt.month
    X['hearing_dow'] = X['hearing_date'].dt.dayofweek
    X['days_to_hearing'] = (X['hearing_date'] - X['ticket_issued_date']).dt.days

    # Díjmezők kombinációi
    for c in ['fine_amount','admin_fee','state_fee','late_fee','discount_amount','clean_up_cost','judgment_amount']:
        if c not in X.columns:
            X[c] = np.nan

    X['fee_total'] = X[['fine_amount','admin_fee','state_fee','late_fee','clean_up_cost']].sum(axis=1, min_count=1)
    X['has_discount'] = (X['discount_amount'] > 0).astype(float)
    X['late_ratio'] = X['late_fee'] / (X['fine_amount'].replace(0, np.nan))
    X['cleanup_ratio'] = X['clean_up_cost'] / (X['fine_amount'].replace(0, np.nan))

    # Kategorikus mezők rendezése – csak a tipikusak, a tényleges oszlopokhoz igazítsd
    cat_cols = []
    for c in ['agency_name','disposition','violation_code','violation_description','state','city','zip_code']:
        if c in X.columns:
            cat_cols.append(c)

    # Potenciális szivárgó mezők eldobása
    leak_cols = []
    for c in ['payment_amount','payment_date','payment_status','balance_due','collection_status']:
        if c in X.columns:
            leak_cols.append(c)
    X = X.drop(columns=leak_cols, errors='ignore')

    # A transzformer számára csak kiválasztás – visszaadjuk a teljes mátrixot, a ColumnTransformer fog válogatni
    return X

feat_tf = FunctionTransformer(make_features)

# 4) Oszlopok típus szerinti listázása a transzformáció után (dinamikus illesztés miatt később határozzuk meg)
base_numeric = ['fine_amount','admin_fee','state_fee','late_fee','discount_amount','clean_up_cost','judgment_amount',
                'fee_total','late_ratio','cleanup_ratio','issue_year','issue_month','issue_dow',
                'hearing_year','hearing_month','hearing_dow','days_to_hearing']
base_categorical = ['agency_name','disposition','violation_code','violation_description','state','city','zip_code']

numeric_tf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_tf = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', min_frequency=20))
])

pre = ColumnTransformer(
    transformers=[
        ('num', numeric_tf, base_numeric),
        ('cat', categorical_tf, base_categorical)
    ],
    remainder='drop'
)

clf = GradientBoostingClassifier(random_state=42)

pipe = Pipeline(steps=[
    ('feat', feat_tf),
    ('pre', pre),
    ('model', clf)
])

X = df  # a pipe belsőleg hívja a feat_tf-t

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

pipe.fit(X_train, y_train)

# 5) Értékelés
p_valid = pipe.predict_proba(X_valid)[:,1]
roc = roc_auc_score(y_valid, p_valid)
pr = average_precision_score(y_valid, p_valid)
print(f"ROC AUC: {roc:.4f}")
print(f"PR AUC:  {pr:.4f}")

# 6) Fontos: ha szükséges, kimeneti valószínűségek és küszöb optimalizálás költségfüggvényre


ROC AUC: 0.8260
PR AUC:  0.4972
