In [16]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report

In [17]:
file_path = "fraudTrain.csv"  
df = pd.read_csv(file_path)

In [18]:
# drop cols
df.drop(columns=["Unnamed: 0", "trans_num", "first", "last", "street"], inplace=True)

# convert time features
df["trans_date_trans_time"] = pd.to_datetime(df["trans_date_trans_time"])
df["hour"] = df["trans_date_trans_time"].dt.hour
df["day"] = df["trans_date_trans_time"].dt.day
df["weekday"] = df["trans_date_trans_time"].dt.weekday
df["month"] = df["trans_date_trans_time"].dt.month

# age
df["dob"] = pd.to_datetime(df["dob"])
df["age"] = df["trans_date_trans_time"].dt.year - df["dob"].dt.year

# drop unneccessary cols
df.drop(columns=["dob"], inplace=True)
df.drop(columns=["trans_date_trans_time"], inplace=True)
df.drop(columns=["cc_num"], inplace=True)
df.drop(columns=["unix_time"], inplace=True)

In [19]:
# distance between transaction location and merchant location
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

df["distance"] = haversine(df["lat"], df["long"], df["merch_lat"], df["merch_long"])
df.drop(columns=["lat", "long", "merch_lat", "merch_long"], inplace=True)

In [20]:

# encode
categorical_cols = ["merchant", "category", "gender", "state", "job", "city"]
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col in categorical_cols:
    df[col] = label_encoders[col].fit_transform(df[col])

In [21]:
X = df.drop(columns=["is_fraud"])
y = df["is_fraud"]

# split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# normalize
scaler = StandardScaler()
num_cols = ["amt", "city_pop", "distance", "age"]
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])

In [None]:
# train
xgb_model = XGBClassifier(
    objective="binary:logistic",
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42,
    tree_method="hist",  # Default method
    device="cuda"  # Enable GPU
)


In [None]:
# hyperparameters
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.1, 0.2],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

grid_search = GridSearchCV(xgb_model, param_grid, scoring="f1", cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

Fitting 3 folds for each of 72 candidates, totalling 216 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [None]:
# evaluate on validation set
y_pred = best_model.predict(X_val)
print("Classification Report:\n", classification_report(y_val, y_pred))


print("Best Hyperparameters:", best_params)

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.98      0.86      0.92      1501

    accuracy                           1.00    259335
   macro avg       0.99      0.93      0.96    259335
weighted avg       1.00      1.00      1.00    259335

Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [None]:
X.head()

Unnamed: 0,merchant,category,amt,gender,city,state,zip,city_pop,job,hour,day,weekday,month,age,distance
0,514,8,4.97,0,526,27,28654,3495,370,0,1,1,1,31,78.597568
1,241,4,107.23,0,612,47,99160,149,428,0,1,1,1,41,30.212176
2,390,0,220.11,1,468,13,83252,4154,307,0,1,1,1,57,108.206083
3,360,2,45.0,1,84,26,59632,1939,328,0,1,1,1,52,95.673231
4,297,9,41.96,1,216,45,24433,99,116,0,1,1,1,33,77.556744


In [None]:
# Load test dataset in chunks
file_path_test = "fraudTest.csv"
chunk_size = 10000  # Process 10,000 rows at a time
results = []

In [None]:
# Preprocessing function
def preprocess_data(chunk, label_encoders, scaler, categorical_cols, num_cols):
    chunk.drop(columns=["Unnamed: 0", "trans_num", "first", "last", "street"], inplace=True)
    chunk["trans_date_trans_time"] = pd.to_datetime(chunk["trans_date_trans_time"])
    chunk["hour"] = chunk["trans_date_trans_time"].dt.hour
    chunk["day"] = chunk["trans_date_trans_time"].dt.day
    chunk["weekday"] = chunk["trans_date_trans_time"].dt.weekday
    chunk["month"] = chunk["trans_date_trans_time"].dt.month
    chunk["dob"] = pd.to_datetime(chunk["dob"])
    chunk["age"] = chunk["trans_date_trans_time"].dt.year - chunk["dob"].dt.year
    chunk.drop(columns=["dob", "trans_date_trans_time, cc_num, unix_time"], inplace=True)
    
    def haversine(lat1, lon1, lat2, lon2):
        R = 6371
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        return R * c
    
    chunk["distance"] = haversine(chunk["lat"], chunk["long"], chunk["merch_lat"], chunk["merch_long"])
    chunk.drop(columns=["lat", "long", "merch_lat", "merch_long"], inplace=True)
    
    for col in categorical_cols:
        chunk[col] = label_encoders[col].transform(chunk[col])
    
    chunk[num_cols] = scaler.transform(chunk[num_cols])
    return chunk

# Process test data in chunks
for chunk in pd.read_csv(file_path_test, chunksize=chunk_size):
    chunk = preprocess_data(chunk, label_encoders, scaler, categorical_cols, num_cols)
    X_test_chunk = chunk.drop(columns=["is_fraud"])
    y_test_chunk = chunk["is_fraud"]
    y_pred_chunk = best_model.predict(X_test_chunk)
    results.append(classification_report(y_test_chunk, y_pred_chunk, output_dict=True))

# Aggregate results
print("Final Test Set Performance:")
print(final_report.T)

KeyError: "['trans_date_trans_time, cc_num, unix_time'] not found in axis"

In [None]:
import xgboost as xgb
import numpy as np

# Dummy dataset
X_sample = np.random.rand(1000, 10)
y_sample = np.random.randint(0, 2, 1000)
dtrain = xgb.DMatrix(X_sample, label=y_sample)

# Set parameters for GPU training
params = {
    "objective": "binary:logistic",
    "tree_method": "hist",  # Default
    "device": "cuda"  # Enable GPU acceleration
}

# Train model
bst = xgb.train(params, dtrain, num_boost_round=10)

# ✅ If this runs without errors, GPU is working fine!
print("GPU training successful!")


GPU training successful!
