In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid')

In [2]:
# Load the datasets
train_path = 'train.csv'
test_path = 'test.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Preview the datasets
print('Train Dataset Sample:')
print(train.head(20))
print('\nTest Dataset Sample:')
print(test.head(20))


Train Dataset Sample:
         id                         trans_num  trans_date trans_time  \
0    308467  26ad750c2ff71f32631b58913582d70a  2024-01-10   06:49:39   
1    261578  fea9c1efe3f2b97f27ad0ab5409ec861  2024-01-06   02:37:50   
2       341  2ae350b982be840f3666273e0c2f3a05  2024-01-18   21:40:21   
3   1147639  bbdd8adfc0a34ed0e817f809193c85c0  2024-01-21   16:20:15   
4    314152  fc7756004dc2a9bc450eb894a670b804  2024-01-21   19:36:26   
5     60254  5b7e5611a6e23c43cbd94fb5f1c9116c  2024-01-05   01:31:55   
6    257224  bd42a07e6f28580f596a1d57cb4f7111  2024-01-19   14:19:39   
7     76231  23d8fb3f871e608636c6dc2d7a8c8666  2024-01-06   00:38:33   
8   1086828  f933358719a54034633ddf90b9f73bd2  2024-01-12   20:46:25   
9    515538  1a4f51b2f2e3fa4c07764136a044deff  2024-01-06   03:03:30   
10   208778  91420c07859f46ac2dce3148cc318cc3  2024-01-07   15:40:07   
11   136171  88940a0cd3dbcbaf8e075dddb063043b  2024-01-05   01:39:38   
12   415403  20f9ad95aa85884302c786d609d9d

In [3]:
# Dimensions of datasets
print(f'Train Shape: {train.shape}')
print(f'Test Shape: {test.shape}')

# Check for missing values
print('Missing Values in Train:')
print(train.isnull().sum())
print('\nMissing Values in Test:')
print(test.isnull().sum())


Train Shape: (370703, 24)
Test Shape: (92676, 23)
Missing Values in Train:
id            0
trans_num     0
trans_date    0
trans_time    0
unix_time     0
category      0
amt           0
cc_num        0
first         0
last          0
gender        0
street        0
city          0
state         0
zip           0
lat           0
long          0
city_pop      0
job           0
dob           0
merchant      0
merch_lat     0
merch_long    0
is_fraud      0
dtype: int64

Missing Values in Test:
id            0
trans_num     0
trans_date    0
trans_time    0
unix_time     0
category      0
amt           0
cc_num        0
first         0
last          0
gender        0
street        0
city          0
state         0
zip           0
lat           0
long          0
city_pop      0
job           0
dob           0
merchant      0
merch_lat     0
merch_long    0
dtype: int64


In [4]:
# Convert `trans_time` to seconds since midnight
train['trans_time'] = train['trans_time'].apply(lambda x: int(x.split(':')[0]) * 3600 + int(x.split(':')[1]) * 60 + int(x.split(':')[2]))
test['trans_time'] = test['trans_time'].apply(lambda x: int(x.split(':')[0]) * 3600 + int(x.split(':')[1]) * 60 + int(x.split(':')[2]))
# Feature Engineering
# Convert date and time to separate features
train['trans_date'] = pd.to_datetime(train['trans_date'])
train['year'] = train['trans_date'].dt.year
train['month'] = train['trans_date'].dt.month
train['day'] = train['trans_date'].dt.day
train['day_of_week'] = train['trans_date'].dt.dayofweek

test['trans_date'] = pd.to_datetime(test['trans_date'])
test['year'] = test['trans_date'].dt.year
test['month'] = test['trans_date'].dt.month
test['day'] = test['trans_date'].dt.day
test['day_of_week'] = test['trans_date'].dt.dayofweek

# Geographical features
train['distance'] = np.sqrt((train['lat'] - train['merch_lat'])**2 + (train['long'] - train['merch_long'])**2)
test['distance'] = np.sqrt((test['lat'] - test['merch_lat'])**2 + (test['long'] - test['merch_long'])**2)

# Encoding categorical features
encoder = LabelEncoder()
for col in ['category', 'gender', 'state', 'job', 'merchant']:
    train[col] = encoder.fit_transform(train[col].astype(str))
    test[col] = encoder.transform(test[col].astype(str))


In [5]:
# Define features and target
X = train.drop(columns=['is_fraud', 'trans_num', 'trans_date', 'first', 'last', 'street', 'city', 'zip', 'dob'])
y = train['is_fraud']

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1, enable_categorical=True)
# Train XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
xgb_model.fit(X_train, y_train)

# Validate model
y_pred = xgb_model.predict(X_val)
print('Accuracy:', accuracy_score(y_val, y_pred))
print('Precision:', precision_score(y_val, y_pred))
print('Recall:', recall_score(y_val, y_pred))
print('F1 Score:', f1_score(y_val, y_pred))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9922445070878462
Precision: 0.9817544707588207
Recall: 0.9504035559714586
F1 Score: 0.9658246656760773


In [6]:
# Prepare test data
X_test = test.drop(columns=['trans_num', 'trans_date', 'first', 'last', 'street', 'city', 'zip', 'dob'])

# Make predictions
test['is_fraud'] = xgb_model.predict(X_test)

# Create submission file
submission = test[['id', 'is_fraud']]
submission.to_csv('submission.csv', index=False)
print('Submission file created: submission.csv')

Submission file created: submission.csv
