In [None]:
import pandas as pd
import numpy as np

from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return c


train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train['trans_date'] = pd.to_datetime(train['trans_date'])
test['trans_date'] = pd.to_datetime(test['trans_date'])
train['trans_time'] = pd.to_datetime(train['trans_time'], format='%H:%M:%S')
test['trans_time'] = pd.to_datetime(test['trans_time'], format='%H:%M:%S')
train['dob'] = pd.to_datetime(train['dob'])
test['dob'] = pd.to_datetime(test['dob'])
train['year'] = train['trans_date'].dt.year
test['year'] = test['trans_date'].dt.year
train['month'] = train['trans_date'].dt.month
test['month'] = test['trans_date'].dt.month
train['day'] = train['trans_date'].dt.day
test['day'] = test['trans_date'].dt.day
train['dow'] = train['trans_date'].dt.dayofweek
test['dow'] = test['trans_date'].dt.dayofweek
train['hour'] = train['trans_time'].dt.hour
test['hour'] = test['trans_time'].dt.hour
train['age'] = (train['trans_date'] - train['dob']).dt.days // 12
test['age'] = (test['trans_date'] - test['dob']).dt.days // 12

train['distance'] = haversine(train['lat'], train['long'], train['merch_lat'], train['merch_long'])
test['distance'] = haversine(test['lat'], test['long'], test['merch_lat'], test['merch_long'])

categorical_cols = ['category', 'gender', 'job', 'merchant', 'city', 'state', 'month', 'hour', 'dow']
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

train['avg_fraud_per_cc_num'] = train.groupby('cc_num')['is_fraud'].transform('mean')
test = test.merge(train[['cc_num', 'avg_fraud_per_cc_num']].drop_duplicates(), on='cc_num', how='left')

train['std_fraud_per_cc_num'] = train.groupby('cc_num')['is_fraud'].transform('std')
test = test.merge(train[['cc_num', 'std_fraud_per_cc_num']].drop_duplicates(), on='cc_num', how='left')

# average fraud value per First Last name
train['avg_fraud_per_name'] = train.groupby(['first', 'last'])['is_fraud'].transform('mean')
test = test.merge(train[['first', 'last', 'avg_fraud_per_name']].drop_duplicates(), on=['first', 'last'], how='left')

train['std_fraud_per_name'] = train.groupby(['first', 'last'])['is_fraud'].transform('std')
test = test.merge(train[['first', 'last', 'std_fraud_per_name']].drop_duplicates(), on=['first', 'last'], how='left')

# cc num transactions count
train['cc_num_count'] = train.groupby('cc_num')['cc_num'].transform('count')
test = test.merge(train[['cc_num', 'cc_num_count']].drop_duplicates(), on='cc_num', how='left')

# get some fraud statistics for some columns: count, mean, std
# category
train['category_count'] = train.groupby('category')['category'].transform('count')
test = test.merge(train[['category', 'category_count']].drop_duplicates(), on='category', how='left')

train['category_mean'] = train.groupby('category')['is_fraud'].transform('mean')
test = test.merge(train[['category', 'category_mean']].drop_duplicates(), on='category', how='left')

train['category_std'] = train.groupby('category')['is_fraud'].transform('std')
test = test.merge(train[['category', 'category_std']].drop_duplicates(), on='category', how='left')

# merchant
train['merchant_count'] = train.groupby('merchant')['merchant'].transform('count')
test = test.merge(train[['merchant', 'merchant_count']].drop_duplicates(), on='merchant', how='left')

train['merchant_mean'] = train.groupby('merchant')['is_fraud'].transform('mean')
test = test.merge(train[['merchant', 'merchant_mean']].drop_duplicates(), on='merchant', how='left')

train['merchant_std'] = train.groupby('merchant')['is_fraud'].transform('std')
test = test.merge(train[['merchant', 'merchant_std']].drop_duplicates(), on='merchant', how='left')

# city
train['city_count'] = train.groupby('city')['city'].transform('count')
test = test.merge(train[['city', 'city_count']].drop_duplicates(), on='city', how='left')

train['city_mean'] = train.groupby('city')['is_fraud'].transform('mean')
test = test.merge(train[['city', 'city_mean']].drop_duplicates(), on='city', how='left')

train['city_std'] = train.groupby('city')['is_fraud'].transform('std')
test = test.merge(train[['city', 'city_std']].drop_duplicates(), on='city', how='left')

# state
train['state_count'] = train.groupby('state')['state'].transform('count')
test = test.merge(train[['state', 'state_count']].drop_duplicates(), on='state', how='left')

train['state_mean'] = train.groupby('state')['is_fraud'].transform('mean')
test = test.merge(train[['state', 'state_mean']].drop_duplicates(), on='state', how='left')

train['state_std'] = train.groupby('state')['is_fraud'].transform('std')
test = test.merge(train[['state', 'state_std']].drop_duplicates(), on='state', how='left')

# job
train['job_count'] = train.groupby('job')['job'].transform('count')
test = test.merge(train[['job', 'job_count']].drop_duplicates(), on='job', how='left')

train['job_mean'] = train.groupby('job')['is_fraud'].transform('mean')
test = test.merge(train[['job', 'job_mean']].drop_duplicates(), on='job', how='left')

train['job_std'] = train.groupby('job')['is_fraud'].transform('std')
test = test.merge(train[['job', 'job_std']].drop_duplicates(), on='job', how='left')


# fill nan with -1
train.fillna(-1, inplace=True)
test.fillna(-1, inplace=True)

drop_cols = ['trans_num', 'trans_date', 'trans_time', 'unix_time', 'first', 'last', 'street', 'zip', 'dob', 'lat', 'long', 'merch_lat', 'merch_long']
train.drop(columns=drop_cols, inplace=True)
test.drop(columns=drop_cols, inplace=True)

X = train.drop(columns=['is_fraud'])
y = train['is_fraud']

# Define the model
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)

# Smote pipeline + classifier
pipeline = Pipeline(steps=[
    ('smote', SMOTETomek(random_state=42, n_jobs=-1)),
    ('classifier', model)
])

# Train test split for evaluation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
pipeline.fit(X_train, y_train)

# Evaluate the model
y_val_pred = pipeline.predict(X_val)
print(f"Validation F1-Score: {f1_score(y_val, y_val_pred):.5f}")

# Train on full dataset and predict on test set
y_test_pred = pipeline.predict(test)

# Save predictions to submission file
submission = test[['id']].copy()
submission['is_fraud'] = y_test_pred
submission.to_csv("submissionval.csv", index=False)


# Train on full dataset and predict on test set
pipeline.fit(X, y)
y_test_pred = pipeline.predict(test)

# Save predictions to submission file
submission = test[['id']].copy()
submission['is_fraud'] = y_test_pred
submission.to_csv("submissionfinal.csv", index=False)

In [5]:
X_train_resampled, y_train_resampled = SMOTETomek(random_state=42, n_jobs=-1).fit_resample(X, y)

In [8]:
# fit resample and use flaml automl
from flaml import AutoML
automl = AutoML()
settings = {
    "time_budget": 120,  # total running time in seconds
    "metric": 'f1',
    "task": 'classification',
}

# automl.fit(X_train_resampled, y_train_resampled, **settings)
# use test data for eval
automl.fit(X_train_resampled, y_train_resampled, X_val, y_val, **settings)


[flaml.automl.logger: 12-08 22:45:54] {1728} INFO - task = classification
[flaml.automl.logger: 12-08 22:45:54] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 12-08 22:45:54] {1838} INFO - Minimizing error metric: 1-f1
[flaml.automl.logger: 12-08 22:45:54] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 12-08 22:45:54] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-08 22:45:55] {2393} INFO - Estimated sufficient time budget=44703s. Estimated necessary time budget=1035s.
[flaml.automl.logger: 12-08 22:45:55] {2442} INFO -  at 4.3s,	estimator lgbm's best error=0.1447,	best estimator lgbm's best error=0.1447
[flaml.automl.logger: 12-08 22:45:55] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-08 22:45:55] {2442} INFO -  at 4.3s,	estimator lgbm's best error=0.1348,	best estimator lgbm's best error=0.1348
[flaml.automl.logger: 12-

In [9]:
# generate predictions for submission
y_test_pred = automl.predict(test)
submission = test[['id']].copy()
submission['is_fraud'] = y_test_pred
submission.to_csv("submissionflaml.csv", index=False)

In [16]:
# automl on base dataset with class weighting
bautoml = AutoML()
settingss = {
    "time_budget": 120,  # total running time in seconds
    "metric": 'f1',
    "task": 'classification',
    'early_stop': True,
    'sample_weight': y.value_counts(normalize=True).to_dict()
}

# automl.fit(X_train_resampled, y_train_resampled, **settings)
# use test data for eval
bautoml.fit(X, y, **settings)

[flaml.automl.logger: 12-08 22:55:24] {1728} INFO - task = classification
[flaml.automl.logger: 12-08 22:55:24] {1739} INFO - Evaluation method: holdout
[flaml.automl.logger: 12-08 22:55:25] {1838} INFO - Minimizing error metric: 1-f1
[flaml.automl.logger: 12-08 22:55:25] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'lrl1']
[flaml.automl.logger: 12-08 22:55:25] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-08 22:55:25] {2393} INFO - Estimated sufficient time budget=21372s. Estimated necessary time budget=495s.
[flaml.automl.logger: 12-08 22:55:25] {2442} INFO -  at 2.1s,	estimator lgbm's best error=1.0000,	best estimator lgbm's best error=1.0000
[flaml.automl.logger: 12-08 22:55:25] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-08 22:55:25] {2442} INFO -  at 2.2s,	estimator lgbm's best error=1.0000,	best estimator lgbm's best error=1.0000
[flaml.automl.logger: 12-0

In [17]:
# generate predictions for submission
y_test_pred = bautoml.predict(test)
submission = test[['id']].copy()
submission['is_fraud'] = y_test_pred
submission.to_csv("submissionflamlbalance.csv", index=False)