In [3]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.compose import ColumnTransformer

import numpy as np
import pandas as pd
import warnings
import lightgbm as lgb
from datetime import datetime

In [4]:
def drop_ignored_columns(df, ignore_var):
    """
    Returns a DataFrame with columns from ignore_var removed (if they exist).
    
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        ignore_var (list): List of column names to ignore/remove.
    
    Returns:
        pd.DataFrame: DataFrame with ignored columns dropped.
    """
    # Keep only columns NOT in ignore_var
    filtered_cols = [col for col in df.columns if col not in ignore_var]
    return df[filtered_cols]

In [5]:
warnings.filterwarnings("ignore", message=".*load_learner.*insecure pickle.*")

# 2. Load the data
train_df = pd.read_csv('../Data/processed/0430_01/train_2025.csv') 
test_df = pd.read_csv('../Data/processed/0430_01/test_2025.csv') 

test_id = test_df['claim_number']
target = train_df['fraud']

ignore_var = ['claim_number', 'fraud', 'claim_date.month', 'claim_date.day', 'claim_date.dayofweek', 'claim_date.weekofyear']
train_df = drop_ignored_columns(train_df, ignore_var)
test_df = drop_ignored_columns(test_df, ignore_var)



In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37000 entries, 0 to 36999
Data columns (total 41 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age_of_driver             37000 non-null  int64  
 1   gender                    37000 non-null  object 
 2   marital_status            37000 non-null  int64  
 3   safty_rating              37000 non-null  int64  
 4   annual_income             37000 non-null  int64  
 5   high_education_ind        37000 non-null  int64  
 6   address_change_ind        37000 non-null  int64  
 7   living_status             37000 non-null  object 
 8   accident_site             37000 non-null  object 
 9   past_num_of_claims        37000 non-null  int64  
 10  witness_present_ind       37000 non-null  object 
 11  liab_prct                 37000 non-null  int64  
 12  channel                   37000 non-null  object 
 13  policy_report_filed_ind   37000 non-null  int64  
 14  claim_

In [4]:
other_cols = train_df.select_dtypes(include=object).columns
other_cols = [col for col in other_cols if col not in ignore_var]
numeric_cols = train_df.select_dtypes(include=np.number).columns
numeric_cols = [col for col in numeric_cols if col not in ignore_var]

all_cols = train_df.columns.tolist()
captured_cols = set(other_cols + numeric_cols)
missed_cols = [col for col in all_cols if col not in captured_cols]
print("Missed columns:", missed_cols)

Missed columns: []


In [8]:
lgb_clf = lgb.LGBMClassifier()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), other_cols)
    ]
)

In [9]:
lgb_clf = lgb.LGBMClassifier()
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('classifier', lgb_clf)
])

In [15]:
param_grid = {
    'classifier__num_leaves': [31, 50],
    'classifier__learning_rate': [0.1, 0.01],
    'classifier__n_estimators': [100, 200]
}

grid = GridSearchCV(pipeline, param_grid, cv=skf, scoring='f1')
grid.fit(train_df, target)

[LightGBM] [Info] Number of positive: 4629, number of negative: 24971
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3956
[LightGBM] [Info] Number of data points in the train set: 29600, number of used features: 71
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.156385 -> initscore=-1.685374
[LightGBM] [Info] Start training from score -1.685374
[LightGBM] [Info] Number of positive: 4629, number of negative: 24971
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3957
[LightGBM] [Info] Number of data points in the train set: 29600, number of used features: 71
[LightGBM] [Info] [b

In [16]:
print(f"Best f1: {grid.best_score_}")
print(f"Best params: {grid.best_params_}")

Best f1: 0.08356959070138079
Best params: {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 200, 'classifier__num_leaves': 50}
