In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
import xgboost as xgb

In [2]:
# Section 1: Load datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Section 2: Explore the dataset
# print("Train Data Overview:")
# print(train_data.info())
# print(train_data.head())
# print(train_data.describe())

# # Check for missing values
# print("Missing Values:")
# print(train_data.isnull().sum())

# Section 3: Preprocessing
# Haversine function to calculate distance between two geo-coordinates
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Combine preprocessing for both train and test data
def preprocess_data(data, is_train=True, fraud_counts=None):
    if is_train:
        # Group by `cc_num` and calculate fraud counts
        fraud_counts = data.groupby(['cc_num', 'is_fraud']).size().unstack(fill_value=0).reset_index()
        fraud_counts.columns = ['cc_num', 'is_fraud_0_count', 'is_fraud_1_count']
        # Add a new column for fraud_score
        fraud_counts['fraud_score'] = (fraud_counts['is_fraud_0_count'] * 10) - (fraud_counts['is_fraud_1_count'] * 50)

    # Merge fraud counts into the data
    data = data.merge(fraud_counts, on='cc_num', how='left')


    # Convert dates to datetime
    data['trans_date'] = pd.to_datetime(data['trans_date'], errors='coerce')
    data['dob'] = pd.to_datetime(data['dob'], errors='coerce')
    data['trans_time'] = pd.to_datetime(data['trans_time'], format='%H:%M:%S')

    # Feature engineering:
    data['age'] = (pd.Timestamp.now() - data['dob']).dt.days // 366
    data['trans_hour'] = data['trans_time'].dt.hour
    data['trans_minute'] = data['trans_time'].dt.minute
    data['trans_second'] = data['trans_time'].dt.second
    data['trans_time_seconds'] = data['trans_time'].dt.hour * 3600 + data['trans_time'].dt.minute * 60 + data['trans_time'].dt.second
    data['seconds_from_midnight'] = 43200 - abs(43200 - data['trans_time_seconds'])
    data['hours_from_midnight'] = 12 - abs(12 - data['trans_hour'])
    data['day_of_week'] = data['trans_date'].dt.dayofweek

    # Feature engineering: Calculate distance between cardholder and merchant
    data['haversine_distance'] = haversine(
        data['lat'], data['long'], data['merch_lat'], data['merch_long']
    )

    # Drop unnecessary columns, including raw datetime and location fields
    columns_to_drop = ['id', 'trans_num', 'cc_num', 'first', 'last', 'street', 'dob', 'trans_date', 'city', 'zip', 'city_pop']
    columns_to_drop += ['lat', 'long', 'merch_lat', 'merch_long']
    columns_to_drop += ['trans_time', 'trans_hour', 'trans_minute', 'trans_second', 'trans_time_seconds', 'hours_from_midnight']
    # columns_to_drop += ['fraud_score', 'is_fraud_0_count', 'is_fraud_1_count']
    columns_to_drop += ['fraud_score']
    # columns_to_drop += ['category', 'gender', 'state', 'job', 'merchant']
    columns_to_drop += ['haversine_distance']
    data = data.drop(columns=columns_to_drop, axis=1)

    # Convert categorical columns to dummy variables
    categorical_cols = ['category', 'gender', 'state', 'job', 'merchant']
    # categorical_cols = ['category', 'gender', 'state', 'job']
    data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

    # Ensure all remaining columns are numeric
    data = data.apply(pd.to_numeric, errors='coerce')

    if is_train:
        return data, fraud_counts
    else:
        return data

# Preprocess train data
train_data, fraud_counts = preprocess_data(train_data, is_train=True)

# Separate features and target
X = train_data.drop('is_fraud', axis=1)
y = train_data['is_fraud']

# Preprocess test data
test_data = preprocess_data(test_data, is_train=False, fraud_counts=fraud_counts)

# Ensure the test data has the same columns as training data
missing_cols = set(X.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[X.columns]


In [None]:
print(train_data.columns)
print(test_data.columns)
# print(train_data.head())

# print(test_data.head())

In [None]:
def AnalyzeFeature(df, feature):
    # Group by 'Score' and calculate the average of the specified feature for each score
    avg_feature_by_score = df.groupby('is_fraud')[feature].mean()

    print(f"Average {feature} by Score:")
    print(avg_feature_by_score)

    # Correlation between the specified feature and 'Score'
    correlation = df[[feature, 'is_fraud']].corr()

    print(f"Correlation between {feature} and Score:")
    print(correlation)
    
Features = []
for ft in Features:
  AnalyzeFeature(train_data, ft)

In [4]:
# Section 4: Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=4)

In [5]:
m2 = model = xgb.XGBClassifier(eval_metric='logloss', random_state=0)
m2.fit(X_train, y_train)
y2 = m2.predict(X_val)
print("Classification Report:")
print(classification_report(y_val, y2))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y2))

print("F1 Score:", f1_score(y_val, y2))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     65840
           1       0.98      0.96      0.97      8301

    accuracy                           0.99     74141
   macro avg       0.99      0.98      0.98     74141
weighted avg       0.99      0.99      0.99     74141

Confusion Matrix:
[[65713   127]
 [  336  7965]]
F1 Score: 0.9717562374184103


In [None]:
# Section 7: Make predictions on the test dataset
test_predictions = m2.predict(test_data)

# Section 8: Create a submission file
submission = pd.DataFrame({'id': sample_submission['id'], 'is_fraud': test_predictions})
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")