## Creating a Model

In [1]:
import pandas as pd

# df = pd.read_parquet("../0 - Data/5 - pcas/ft_strategy_1_pca.pq")
df = pd.read_parquet("../0 - Data/4 - scaled/ft_strategy_2_scaled.pq")

df.head()

Unnamed: 0,Amount,MCC,Has Chip,Cards Issued,Credit Limit,Current Age,Retirement Age,Latitude,Longitude,Per Capita Income - Zipcode,...,Use Chip_Online Transaction,Use Chip_Swipe Transaction,Card Type_Debit,Card Type_Debit (Prepaid),Gender_Male,Age Group_26-35,Age Group_36-45,Age Group_46-60,Age Group_60+,Is Fraud
6780,1.025128,-0.028966,0.337145,0.963448,0.738314,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,...,False,True,True,False,False,False,False,True,False,0
6781,1.025128,-0.028966,0.337145,0.963448,0.551106,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,...,False,True,True,False,False,False,False,True,False,0
6782,1.025128,-0.028966,0.337145,0.963448,2.517794,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,...,False,True,True,False,False,False,False,True,False,0
6783,1.025128,-0.028966,-2.966079,-0.967922,-0.218642,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,...,False,True,False,False,False,False,False,True,False,0
6784,1.025128,-0.028966,0.337145,-0.967922,-1.213973,-0.09811,-0.185577,-0.631234,-1.563208,0.447753,...,False,True,False,True,False,False,False,True,False,0


#### Split the data into training and test

In [2]:
df['Datetime'] = pd.to_datetime(df['Datetime'])

split_date = '2019-10-01 00:00:00'

train_data = df[df['Datetime'] < split_date]
test_data = df[df['Datetime'] >= split_date]

#### Drop the Datetime Column

In [3]:
train_data = train_data.drop(["Datetime"], axis=1)
test_data = test_data.drop(["Datetime"], axis=1)

#### Create X and y Vectors

In [4]:
target_column = 'Is Fraud'

# Separate features and target for the training set
X_train = train_data.drop(target_column, axis=1)
y_train = train_data[target_column]

# Separate features and target for the test set
X_test = test_data.drop(target_column, axis=1)
y_test = test_data[target_column]

#### Train the Model

In [5]:
from catboost import CatBoostClassifier


params = {
    "iterations": 2000,  # Increased for better exploration of patterns
    "learning_rate": 0.03,  # Reduced for finer adjustments per iteration
    "depth": 8,  # Keep moderate depth, adjust based on tests
    "loss_function": 'Logloss',
    "eval_metric": 'AUC',
    "verbose": 100,  # Print progress for better tracking
    "random_state": 42,
    "task_type": 'GPU',
    "devices": '0',
    "class_weights": [1, 10],  # Adjusted if true ratio differs
    "early_stopping_rounds": 100,  # Stop early if no improvement
    "l2_leaf_reg": 3,  # Regularization to prevent overfitting
}

# Initialize the CatBoost model
model = CatBoostClassifier(**params)

# Train the model
model.fit(X_train, y_train)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 202ms	remaining: 6m 44s
100:	total: 17.6s	remaining: 5m 31s
200:	total: 34.7s	remaining: 5m 10s
300:	total: 51.9s	remaining: 4m 53s
400:	total: 1m 9s	remaining: 4m 35s
500:	total: 1m 26s	remaining: 4m 18s
600:	total: 1m 43s	remaining: 4m 1s
700:	total: 2m 1s	remaining: 3m 44s
800:	total: 2m 18s	remaining: 3m 27s
900:	total: 2m 35s	remaining: 3m 10s
1000:	total: 2m 53s	remaining: 2m 53s
1100:	total: 3m 11s	remaining: 2m 36s
1200:	total: 3m 29s	remaining: 2m 19s
1300:	total: 3m 47s	remaining: 2m 2s
1400:	total: 4m 5s	remaining: 1m 44s
1500:	total: 4m 23s	remaining: 1m 27s
1600:	total: 4m 41s	remaining: 1m 10s
1700:	total: 4m 59s	remaining: 52.7s
1800:	total: 5m 18s	remaining: 35.2s
1900:	total: 5m 36s	remaining: 17.5s
1999:	total: 5m 55s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x2531ae79a30>

#### Model Predictions

In [6]:
# Make probability predictions
y_pred_proba = model.predict(X_test)

# Convert probabilities to binary predictions (0 or 1) based on a threshold, usually 0.5
y_pred = [1 if prob > 0.5 else 0 for prob in y_pred_proba]

#### Test the Predictions

In [7]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9992621276541609


In [8]:
from sklearn.metrics import classification_report

data_label = "ft-tstrategy-2"  # Update this label for each dataset

# Calculate the classification report
report = classification_report(y_test, y_pred, output_dict=True)

# Extract metrics for class 0 and class 1
metrics = {
    'data_label': [data_label],
    'precision_0': [report['0']['precision']],
    'recall_0': [report['0']['recall']],
    'f1_score_0': [report['0']['f1-score']],
    'precision_1': [report['1']['precision']],
    'recall_1': [report['1']['recall']],
    'f1_score_1': [report['1']['f1-score']]
}

# Convert metrics to DataFrame
metrics_df = pd.DataFrame(metrics)

# Append to CSV file (or create if it doesn't exist)
output_file = "CATBoost_metrics.csv"
try:
    # Append if file exists
    metrics_df.to_csv(output_file, mode='a', index=False, header=not pd.read_csv(output_file).empty)
except FileNotFoundError:
    # Create file if it doesn't exist
    metrics_df.to_csv(output_file, index=False)

print(f"Metrics for {data_label} saved to {output_file}.")

Metrics for ft-tstrategy-2 saved to CATBoost_metrics.csv.
