In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report

# Load both Kaggle splits; they are stacked into a single frame below.
train = pd.read_csv('fraudTrain.csv')
test = pd.read_csv('fraudTest.csv')

# One frame with a fresh 0..n-1 index.
df = pd.concat([train, test], axis=0, ignore_index=True)
print(df.head())

# Columns removed before feature engineering; errors="ignore" tolerates any
# that are absent from the CSVs.
drop_cols = ["Unnamed: 0", "trans_num", "first", "last", "street", "cc_num", "unix_time"]

# Parse the two date columns, derive calendar features and a year-difference
# age from them, then discard the raw date columns.
df = (
    df.drop(columns=drop_cols, errors="ignore")
    .assign(
        trans_date_trans_time=lambda d: pd.to_datetime(d["trans_date_trans_time"]),
        dob=lambda d: pd.to_datetime(d["dob"]),
        hour=lambda d: d["trans_date_trans_time"].dt.hour,
        day=lambda d: d["trans_date_trans_time"].dt.day,
        weekday=lambda d: d["trans_date_trans_time"].dt.weekday,
        month=lambda d: d["trans_date_trans_time"].dt.month,
        # Difference of calendar years only (does not check whether the
        # birthday has already passed in the transaction year).
        age=lambda d: d["trans_date_trans_time"].dt.year - d["dob"].dt.year,
    )
    .drop(columns=["dob", "trans_date_trans_time"])
)

# distance between transaction location and merchant location
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula with an Earth radius of 6371 km.

    Parameters
    ----------
    lat1, lon1 : scalar or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : scalar or array-like
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    Distance(s) in kilometres; array-like inputs are processed element-wise.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a
    # hair above 1 for (near-)antipodal points, which would make sqrt(1 - a)
    # return NaN. Clamp to the valid range before taking square roots.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# Add the haversine distance between the transaction and merchant coordinates,
# and log1p-transform the amount and city population in place of the raw values.
# The four coordinate columns are kept in the frame.
df = df.assign(
    distance=haversine(df["lat"], df["long"], df["merch_lat"], df["merch_long"]),
    amt=np.log1p(df["amt"]),
    city_pop=np.log1p(df["city_pop"]),
)


# Categorical Encoding Strategy
categorical_cols = ["merchant", "category", "gender", "state", "job", "city"]

# Define encoding methods per feature
one_hot_cols = ["category", "gender", "state"]
target_cols = ["merchant", "job"]
freq_cols = ["city"]

target = 'is_fraud'

# Decide which rows belong to train / validation / test BEFORE fitting any
# encoder or scaler. Fitting on the full frame (as was done previously) lets
# the labels and statistics of validation/test rows leak into the features,
# most seriously through the per-category fraud rate used for target encoding.
# Positional indices are split so the same row assignment can be reused for
# fitting and for slicing the final matrices.
df = df.reset_index(drop=True)
all_pos = np.arange(len(df))

# 80-20 split: 20% of all rows are held out as the test set.
train_val_pos, test_pos = train_test_split(
    all_pos, test_size=0.2, stratify=df[target], random_state=42
)

# 0.1875 of the remaining 80% = 15% of all rows for validation,
# leaving 65% of all rows for training.
train_pos, val_pos = train_test_split(
    train_val_pos,
    test_size=0.1875,
    stratify=df[target].iloc[train_val_pos],
    random_state=42,
)

# One-Hot Encoding (categories learned from training rows only; categories
# that appear only in validation/test rows encode as all zeros because of
# handle_unknown="ignore").
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe.fit(df[one_hot_cols].iloc[train_pos])
encoded_array = ohe.transform(df[one_hot_cols])
encoded_df = pd.DataFrame(encoded_array, columns=ohe.get_feature_names_out(one_hot_cols))

# Both frames carry a 0..n-1 index, so the column-wise concat aligns row by row.
df = pd.concat([df.drop(columns=one_hot_cols), encoded_df], axis=1)

# Target Encoding (Fraud Rate Per Category), estimated on training rows only.
# NOTE(review): a training row's own label still contributes to its encoded
# value; cross-fitted target encoding would remove that remaining optimism.
encoders = {}
global_fraud_rate = df[target].iloc[train_pos].mean()
for col in target_cols:
    train_rows = df[[col, target]].iloc[train_pos]
    encoders[col] = train_rows.groupby(col)[target].mean()
    # Categories never seen in training get the training-set global fraud rate.
    df[col] = df[col].map(encoders[col]).fillna(global_fraud_rate)

# Frequency Encoding, with relative frequencies measured on training rows only.
for col in freq_cols:
    encoders[col] = df[col].iloc[train_pos].value_counts(normalize=True)
    # Categories never seen in training get frequency 0.
    df[col] = df[col].map(encoders[col]).fillna(0)


num_features = ['amt', 'city_pop', 'age', 'hour', 'day', 'month', 'weekday','distance', 'lat' , 'long', 'merch_lat', 'merch_long']

# Fit the scaler on the training rows only, then apply it to every row so that
# validation/test data are transformed with training statistics.
scaler = StandardScaler()
scaler.fit(df[num_features].iloc[train_pos])
df[num_features] = scaler.transform(df[num_features])

# Separate the target variable from the features.
X = df.drop(columns=[target])
y = df[target]

# Materialise the splits from the row assignment chosen above.
X_train_val, y_train_val = X.iloc[train_val_pos], y.iloc[train_val_pos]
X_test, y_test = X.iloc[test_pos], y.iloc[test_pos]
X_train, y_train = X.iloc[train_pos], y.iloc[train_pos]
X_val, y_val = X.iloc[val_pos], y.iloc[val_pos]

import os

import joblib

# Directory that receives every artefact written below.
output_dir = 'data_withoutembeddings'

# Neither DataFrame.to_csv nor joblib.dump creates missing parent directories,
# so make sure the target directory exists before writing anything.
os.makedirs(output_dir, exist_ok=True)

# Save the processed splits to CSV files (one file per split, no index column).
splits_to_save = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_val,
    'y_val': y_val,
    'X_test': X_test,
    'y_test': y_test,
}
for split_name, split_data in splits_to_save.items():
    split_data.to_csv(os.path.join(output_dir, f'{split_name}.csv'), index=False)

# Also, save the scaler object for later transformation during inference.
joblib.dump(scaler, os.path.join(output_dir, 'standard_scaler.pkl'))

print("Processed data and scaler saved successfully.")

   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2019-01-01 00:00:18  2703186189652095   
1           1   2019-01-01 00:00:44      630423337322   
2           2   2019-01-01 00:00:51    38859492057661   
3           3   2019-01-01 00:01:16  3534093764340240   
4           4   2019-01-01 00:03:06   375534208663984   

                             merchant       category     amt      first  \
0          fraud_Rippin, Kub and Mann       misc_net    4.97   Jennifer   
1     fraud_Heller, Gutmann and Zieme    grocery_pos  107.23  Stephanie   
2                fraud_Lind-Buckridge  entertainment  220.11     Edward   
3  fraud_Kutch, Hermiston and Farrell  gas_transport   45.00     Jeremy   
4                 fraud_Keeling-Crist       misc_pos   41.96      Tyler   

      last gender                        street  ...      lat      long  \
0    Banks      F                561 Perry Cove  ...  36.0788  -81.1781   
1     Gill      F  43039 Riley Greens Suite 393  ...  48

In [1]:
# Preview the first rows of the training feature matrix. X_train is created by
# the preprocessing cell earlier in this notebook, so that cell must have been
# executed in the current kernel first; the NameError recorded below shows it
# had not been when this cell was run.
X_train.head()

NameError: name 'X_train' is not defined