In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score
import xgboost as xgb

In [None]:
# Section 1: Load datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# Section 2: Explore the dataset
# print("Train Data Overview:")
# print(train_data.info())
# print(train_data.head())
# print(train_data.describe())

# # Check for missing values
# print("Missing Values:")
# print(train_data.isnull().sum())

# Section 3: Preprocessing
# Haversine function to calculate distance between two geo-coordinates
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the Earth in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# Combine preprocessing for both train and test data
def preprocess_data(data, is_train=True, fraud_counts=None):
    if is_train:
        # Group by `cc_num` and calculate fraud counts
        fraud_counts = data.groupby(['cc_num', 'is_fraud']).size().unstack(fill_value=0).reset_index()
        fraud_counts.columns = ['cc_num', 'is_fraud_0_count', 'is_fraud_1_count']
        # Add a new column for fraud_score
        fraud_counts['fraud_score'] = (fraud_counts['is_fraud_0_count'] * 10) - (fraud_counts['is_fraud_1_count'] * 50)

    # Merge fraud counts into the data
    data = data.merge(fraud_counts, on='cc_num', how='left')

    data['trans_datetime'] = pd.to_datetime(data['trans_date'] + ' ' + data['trans_time'])
    data['dob'] = pd.to_datetime(data['dob'], errors='coerce')
    data['age'] = (data['trans_datetime'] - data['dob']).dt.days / 365.25
    data['second'] = data['trans_datetime'].dt.hour
    data['minute'] = data['trans_datetime'].dt.hour
    data['hour'] = data['trans_datetime'].dt.hour
    data['day'] = data['trans_datetime'].dt.day
    data['month'] = data['trans_datetime'].dt.month
    data['weekday'] = data['trans_datetime'].dt.weekday
    
    data['trans_time_seconds'] = data['trans_datetime'].dt.hour * 3600 + data['trans_datetime'].dt.minute * 60 + data['trans_datetime'].dt.second
    data['seconds_from_midnight'] = 43200 - abs(43200 - data['trans_time_seconds'])

    # Feature engineering: Calculate distance between cardholder and merchant
    data['haversine_distance'] = haversine(
        data['lat'], data['long'], data['merch_lat'], data['merch_long']
    )

    features = [
    'amt', 'gender', 'category', 'job', 'state','city_pop',
    'hour', 'day', 'month', 'weekday',
    'is_fraud_0_count', 'is_fraud_1_count', 'seconds_from_midnight'
    'age', 'haversine_distance'
    ]
    
    if(is_train):
        features += ['is_fraud']
        
    data = data[features]

    # Convert categorical columns to dummy variables
    categorical_cols = ['category', 'state', 'job']
    gender_map = {'F': 0, 'M': 1}
    data['gender'] = data['gender'].map(gender_map)
    
    for col in categorical_cols:
        le = LabelEncoder()
        combined_data = pd.concat([data[col]], axis=0).astype(str)
        le.fit(combined_data)
        data[col] = le.transform(data[col].astype(str))

    imputer = SimpleImputer(strategy='median')
    data = pd.DataFrame(imputer.fit_transform(data), columns=features)

    if is_train:
        return data, fraud_counts
    else:
        return data

# Preprocess train data
train_data, fraud_counts = preprocess_data(train_data, is_train=True)

# Separate features and target
X = train_data.drop('is_fraud', axis=1)
y = train_data['is_fraud']

# Preprocess test data
test_data = preprocess_data(test_data, is_train=False, fraud_counts=fraud_counts)

# Ensure the test data has the same columns as training data
missing_cols = set(X.columns) - set(test_data.columns)
for col in missing_cols:
    test_data[col] = 0
test_data = test_data[X.columns]


In [53]:
print(train_data.columns)
print(train_data.head())

Index(['amt', 'gender', 'category', 'job', 'state', 'city_pop', 'hour', 'day',
       'month', 'weekday', 'is_fraud_0_count', 'is_fraud_1_count', 'age',
       'haversine_distance', 'is_fraud'],
      dtype='object')
      amt  gender  category    job  state  city_pop  hour   day  month  \
0  188.38     0.0       9.0    6.0   14.0   95666.0   6.0  10.0    1.0   
1  102.63     0.0       4.0  160.0   20.0   37941.0   2.0   6.0    1.0   
2    1.62     1.0       0.0   80.0   22.0   19515.0  21.0  18.0    1.0   
3    5.64     1.0       5.0  377.0   35.0   62039.0  16.0  21.0    1.0   
4   97.09     0.0       5.0  451.0    4.0  106841.0  19.0  21.0    1.0   

   weekday  is_fraud_0_count  is_fraud_1_count        age  haversine_distance  \
0      2.0              37.0               0.0  40.626968          104.206730   
1      5.0              51.0              10.0  47.091034           60.438265   
2      3.0              21.0               3.0  24.402464           86.836599   
3      6.0    

In [55]:
# Section 4: Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=4)

In [56]:
m2 = model = xgb.XGBClassifier(random_state=4, eval_metric='logloss')
m2.fit(X_train, y_train)
y2 = m2.predict(X_val)
print("Classification Report:")
print(classification_report(y_val, y2))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y2))

print("F1 Score:", f1_score(y_val, y2))

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     65840
         1.0       0.99      0.97      0.98      8301

    accuracy                           1.00     74141
   macro avg       0.99      0.99      0.99     74141
weighted avg       1.00      1.00      1.00     74141

Confusion Matrix:
[[65734   106]
 [  218  8083]]
F1 Score: 0.980351728320194


In [51]:
# Section 7: Make predictions on the test dataset
test_predictions = m2.predict(test_data)

# Section 8: Create a submission file
submission = pd.DataFrame({'id': sample_submission['id'], 'is_fraud': test_predictions})
submission.to_csv('submission.csv', index=False)

print("Submission file created: submission.csv")

Submission file created: submission.csv
