In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the datasets
train_path = 'train.csv'
test_path = 'test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
sample_submission = pd.read_csv('sample_submission.csv')

# Display basic information
print('Train Shape:', train.shape)
print('Test Shape:', test.shape)
train.head(), test.head()

Train Shape: (370703, 24)
Test Shape: (92676, 23)


(        id                         trans_num  trans_date trans_time  \
 0   308467  26ad750c2ff71f32631b58913582d70a  2024-01-10   06:49:39   
 1   261578  fea9c1efe3f2b97f27ad0ab5409ec861  2024-01-06   02:37:50   
 2      341  2ae350b982be840f3666273e0c2f3a05  2024-01-18   21:40:21   
 3  1147639  bbdd8adfc0a34ed0e817f809193c85c0  2024-01-21   16:20:15   
 4   314152  fc7756004dc2a9bc450eb894a670b804  2024-01-21   19:36:26   
 
     unix_time        category     amt            cc_num    first     last  \
 0  1704887379        misc_pos  188.38      676355457570   Andrea  Johnson   
 1  1704526670     grocery_pos  102.63   377178373574671   Rhonda   Chavez   
 2  1705632021   entertainment    1.62  3599292013370451  Stephen     Khan   
 3  1705872015  health_fitness    5.64  3594292572430345   Justin   Reilly   
 4  1705883786  health_fitness   97.09  4867547663675548    Alice   Duarte   
 
    ...    zip      lat      long city_pop                          job  \
 0  ...  62220  38.51

In [3]:
# Feature Engineering

# Derive age from dob
train['age'] = pd.to_datetime('2024-01-01') - pd.to_datetime(train['dob'])
train['age'] = train['age'].dt.days // 365
test['age'] = pd.to_datetime('2024-01-01') - pd.to_datetime(test['dob'])
test['age'] = test['age'].dt.days // 365

# Transaction velocity
train['trans_count'] = train.groupby('cc_num')['trans_num'].transform('count')
test['trans_count'] = test.groupby('cc_num')['trans_num'].transform('count')

# Geospatial distance
def calculate_distance(lat1, lon1, lat2, lon2):
    return np.sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2)

train['geo_distance'] = calculate_distance(train['lat'], train['long'], train['merch_lat'], train['merch_long'])
test['geo_distance'] = calculate_distance(test['lat'], test['long'], test['merch_lat'], test['merch_long'])

# Encode categorical variables
cat_columns = ['category', 'job', 'state', 'gender']
encoder = LabelEncoder()
for col in cat_columns:
    train[col] = encoder.fit_transform(train[col].astype(str))
    test[col] = encoder.transform(test[col].astype(str))

# Drop unnecessary columns
drop_cols = ['trans_num', 'trans_date', 'trans_time', 'unix_time', 'cc_num', 'dob', 'first', 'last', 'street', 'city', 'zip', 'merchant']
train = train.drop(columns=drop_cols)
test = test.drop(columns=drop_cols)

train.head(), test.head()

(        id  category     amt  gender  state      lat      long  city_pop  job  \
 0   308467         9  188.38       0     14  38.5127  -89.9847     95666    6   
 1   261578         4  102.63       0     20  39.4567  -76.9696     37941  160   
 2      341         0    1.62       1     22  45.0125  -84.6723     19515   80   
 3  1147639         5    5.64       1     35  41.1404  -81.8584     62039  377   
 4   314152         5   97.09       0      4  34.1862 -118.3009    106841  451   
 
    merch_lat  merch_long  is_fraud  age  trans_count  geo_distance  
 0  39.268874  -89.273447         0   40           37      1.038114  
 1  39.961495  -76.707640         0   47           61      0.568719  
 2  44.393561  -85.342323         0   24           24      0.912149  
 3  40.283764  -81.639361         0   93           36      0.884196  
 4  35.149704 -118.087440         0   72           29      0.986866  ,
        id  category     amt  gender  state      lat      long  city_pop  job  \
 0  

In [4]:
# Prepare Data
X = train.drop('is_fraud', axis=1)
y = train['is_fraud']

# Split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Train XGBoost Model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1, random_state=42)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Evaluate
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print('ROC AUC:', roc_auc_score(y_val, model.predict_proba(X_val)[:, 1]))

[0]	validation_0-logloss:0.25701
[1]	validation_0-logloss:0.21418
[2]	validation_0-logloss:0.18577
[3]	validation_0-logloss:0.16723
[4]	validation_0-logloss:0.15406
[5]	validation_0-logloss:0.14487
[6]	validation_0-logloss:0.13816
[7]	validation_0-logloss:0.13250
[8]	validation_0-logloss:0.12892
[9]	validation_0-logloss:0.12585
[10]	validation_0-logloss:0.12296
[11]	validation_0-logloss:0.12141
[12]	validation_0-logloss:0.12015
[13]	validation_0-logloss:0.11872
[14]	validation_0-logloss:0.11648
[15]	validation_0-logloss:0.11534
[16]	validation_0-logloss:0.11288
[17]	validation_0-logloss:0.11262
[18]	validation_0-logloss:0.11227
[19]	validation_0-logloss:0.11056
[20]	validation_0-logloss:0.11047
[21]	validation_0-logloss:0.10929
[22]	validation_0-logloss:0.10870
[23]	validation_0-logloss:0.10851
[24]	validation_0-logloss:0.10829
[25]	validation_0-logloss:0.10814
[26]	validation_0-logloss:0.10726
[27]	validation_0-logloss:0.10699
[28]	validation_0-logloss:0.10692
[29]	validation_0-loglos

In [5]:
# Generate predictions for the test dataset
# Ensure the test dataset matches the training dataset's columns
test_features = test.drop(columns=['id'], errors='ignore')  # Drop 'id' column if present
missing_cols = [col for col in X.columns if col not in test_features.columns]  # Check for missing columns

# Add any missing columns with default values
for col in missing_cols:
    test_features[col] = 0  # Use default value (0)

# Ensure the column order matches the training dataset
test_features = test_features[X.columns]

# Make predictions
test_predictions = model.predict(test_features)

# Prepare the submission file
submission = sample_submission.copy()
submission['is_fraud'] = test_predictions
submission.to_csv('submission.csv', index=False)

print("Submission file saved as 'submission.csv'")


Submission file saved as 'submission.csv'
