## Creating a Model

In [1]:
import pandas as pd

# Input feature table. Judging by the path this is the scaled strategy-2 dataset.
# Alternative input (PCA-reduced variant): "../0 - Data/5 - pcas/ft_strategy_2_pca.pq"
scaled_features_path = "../0 - Data/4 - scaled/ft_strategy_2_scaled.pq"
df = pd.read_parquet(scaled_features_path)

#### Split the Data into Training and Test Sets

In [2]:
# Parse the timestamp column so it can be compared against the cutoff below
df['Datetime'] = pd.to_datetime(df['Datetime'])

# Chronological cutoff: rows strictly before it are used for training,
# rows at or after it are held out for testing
split_date = '2019-10-01 00:00:00'

# Two independent masks (rather than one mask and its negation) so that rows
# with a missing timestamp fall into neither partition
is_before_split = df['Datetime'] < split_date
is_on_or_after_split = df['Datetime'] >= split_date

train_data = df.loc[is_before_split]
test_data = df.loc[is_on_or_after_split]

#### Drop the Datetime Column

In [3]:
# The timestamp was only needed to perform the split; remove it from both partitions
train_data = train_data.drop(columns=["Datetime"])
test_data = test_data.drop(columns=["Datetime"])

#### Create X and y Vectors

In [4]:
target_column = 'Is Fraud'


def _split_features_target(frame, target):
    """Return ``(features, labels)`` for ``frame``.

    ``features`` is ``frame`` without the ``target`` column and ``labels`` is
    the ``target`` column itself.
    """
    return frame.drop(columns=target), frame[target]


# Apply the same feature/label separation to the training and test partitions
X_train, y_train = _split_features_target(train_data, target_column)
X_test, y_test = _split_features_target(test_data, target_column)

#### Train the Model

In [5]:
import xgboost as xgb

# Convert the training set to DMatrix, XGBoost's internal data structure
dtrain = xgb.DMatrix(X_train, label=y_train)

# Parameters for a gradient-boosted tree classifier trained on the GPU.
# NOTE: this is ordinary boosting, not XGBoost's random-forest mode. Random-forest
# mode would need num_parallel_tree > 1 with row/column subsampling and a single
# boosting round, none of which is configured here.
params = {
    'objective': 'binary:logistic',     # Binary classification with a logistic objective
    'tree_method': 'hist',              # Use histogram-based algorithm
    'max_depth': 6,                     # Depth of trees
    'device': 'cuda'                    # Select the GPU
}

# Train the booster for 100 boosting rounds
model = xgb.train(params, dtrain, num_boost_round=100)

#### Convert the Test Data into a DMatrix and Make Predictions

In [6]:
# Wrap the held-out features in a DMatrix so the trained booster can score them
dtest = xgb.DMatrix(X_test)

# Score every test row; the values are treated as probabilities below
y_pred_proba = model.predict(dtest)

# Label a row 1 only when its score is strictly above 0.5, otherwise 0.
# .tolist() keeps the result a plain list of Python ints.
y_pred = (y_pred_proba > 0.5).astype(int).tolist()

#### Test the Predictions

In [7]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9995753118090632


In [8]:
from sklearn.metrics import classification_report

from pathlib import Path

data_label = "ft-tstrategy-2"  # Update this label for each dataset

# Per-class precision / recall / F1 as a nested dict keyed by the class label string
report = classification_report(y_test, y_pred, output_dict=True)

# One-row record with the metrics for class 0 and class 1
metrics = {
    'data_label': [data_label],
    'precision_0': [report['0']['precision']],
    'recall_0': [report['0']['recall']],
    'f1_score_0': [report['0']['f1-score']],
    'precision_1': [report['1']['precision']],
    'recall_1': [report['1']['recall']],
    'f1_score_1': [report['1']['f1-score']]
}

# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics)

# Append the row to the CSV, creating the file if needed (mode='a' does both).
# The header is written only when the file is missing or empty, so repeated runs
# do not insert duplicate header lines between data rows. Deciding this from the
# file size also avoids re-reading the whole CSV and the EmptyDataError that
# pd.read_csv raises on a zero-byte file.
output_file = "XGBoost_metrics.csv"
output_path = Path(output_file)
write_header = (not output_path.exists()) or output_path.stat().st_size == 0
metrics_df.to_csv(output_path, mode='a', index=False, header=write_header)

print(f"Metrics for {data_label} saved to {output_file}.")

Metrics for ft-tstrategy-2-pca-dim-30 saved to XGBoost_metrics.csv.
