In [6]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# --- Configuration ---
# FILE_PATH: CSV file read in the loading step below.
# RANDOM_SEED: fixed seed so that repeated runs are reproducible.
FILE_PATH = 'index_1.csv'
RANDOM_SEED = 42

# --- 1. Data Loading ---
try:
    # Load the transactional coffee data
    df = pd.read_csv(FILE_PATH)
except FileNotFoundError:
    print(f"ERROR: File '{FILE_PATH}' not found.")
    # Abort with a non-zero status. The bare exit() helper is injected by the
    # `site` module for interactive sessions: it is not guaranteed to exist
    # (e.g. under `python -S`) and, called without arguments, reports success
    # (status 0) even though loading failed. Raising SystemExit is the
    # documented programmatic way to stop and always halts execution here.
    raise SystemExit(1)

# Display basic info
print(f"--- Coffee Transaction Data Loaded: {FILE_PATH} ---")
print(f"Dataset shape: {df.shape}")

# --- 2. Feature Engineering and Cleaning ---

# Parse the raw 'datetime' column so the `.dt` accessor can be used below
df['datetime'] = pd.to_datetime(df['datetime'])

# 2a. Target Selection: We are predicting the transaction amount ('money')
TARGET_COL = 'money'
X = df.drop(TARGET_COL, axis=1)  # every column except the target
y = df[TARGET_COL]

# 2b. Feature Engineering: Create time-based features from 'datetime'
X['transaction_hour'] = X['datetime'].dt.hour
X['day_of_week'] = X['datetime'].dt.day_name()  # Categorical: Monday, Tuesday, etc.
# Binary weekend flag: pandas numbers Monday as 0, so Saturday/Sunday are 5/6.
# A vectorized comparison replaces the per-row `.apply(lambda ...)`; it yields
# the same 0/1 values (missing timestamps still map to 0) without calling a
# Python function once per row. The explicit 'int64' keeps the dtype that the
# original apply-based version produced.
X['is_weekend'] = (X['datetime'].dt.dayofweek >= 5).astype('int64')

# 2c. Drop non-predictive/redundant columns
# 'datetime' is now redundant after feature extraction
# 'date' is redundant
# 'card' (Customer ID) is high-cardinality, might be noise, but CatBoost handles it well, so we'll keep it as a category.
X = X.drop(['datetime', 'date'], axis=1)

# --- 3. Identify and Clean Categorical Features ---

# Every object/category-typed column of X is treated as a categorical feature.
categorical_view = X.select_dtypes(include=['object', 'category'])
cat_features = categorical_view.columns.tolist()

# Normalise each categorical column: missing entries become an explicit
# string marker, then the whole column is cast to str.
for col in cat_features:
    marked = X[col].fillna('MISSING_CAT')
    X[col] = marked.astype(str)

numeric_view = X.select_dtypes(include=['number'])

print(f"\nCategorical features used: {cat_features}")
print(f"Numerical features used: {numeric_view.columns.tolist()}")
print(f"Target variable: '{TARGET_COL}' (Continuous Regression)")

# --- 4. Data Splitting ---
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)
X_train, X_test, y_train, y_test = split_parts

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")


# --- 5. Model Initialization and Training (CatBoost Regressor) ---

# A regressor is required because the goal is an R² comparison on a
# continuous target. All hyper-parameters are gathered in one mapping and
# unpacked into the constructor.
catboost_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'loss_function': 'RMSE',  # optimised internally; evaluation below uses R²
    'cat_features': cat_features,
    'verbose': 0,  # hide per-iteration training output
    'random_state': RANDOM_SEED,
    'allow_writing_files': False,
}
model = CatBoostRegressor(**catboost_params)

print("\nStarting CatBoost Regressor Training...")
# Fit on the training split only; no evaluation set is supplied.
model.fit(X_train, y_train)
print("Training finished.")


# --- 6. Prediction and Evaluation ---

# Continuous predictions of the 'money' amount for the held-out rows
y_pred = model.predict(X_test)

# Regression metrics on the test split: R² (the score intended for the DOFEN
# comparison) and MSE, computed with the same (y_true, y_pred) arguments.
r2_score_value, mse_value = (
    metric(y_test, y_pred) for metric in (r2_score, mean_squared_error)
)

print("\n--- Model Performance Metrics (Predicting Transaction Amount) ---")
print(f"R-squared (R² Score): {r2_score_value:.4f}")
print(f"Mean Squared Error (MSE): {mse_value:.4f}")

# --- 7. Feature Importance ---

# Pair each feature column with the importance reported by the fitted model,
# then rank from most to least important.
importance_columns = {
    'feature': X.columns,
    'importance': model.feature_importances_,
}
feature_importance = pd.DataFrame(importance_columns).sort_values(
    by='importance', ascending=False
)
top_features = feature_importance.head(10)

print("\nTop 10 Most Important Features for Predicting Transaction Amount:")
print(top_features.to_markdown(index=False))

# Closing guidance for the DOFEN comparison, emitted one message per line.
print(
    "\n--- Next Step for DOFEN Comparison ---",
    "This R² score is now a valid, continuous regression metric.",
    "The only thing that can lead to 'off' R² values now is poor model fit,",
    "which often happens when the features do not explain the target (e.g., price is fixed for each 'coffee_name').",
    "You must ensure your DOFEN model is also predicting the 'money' column.",
    sep="\n",
)

--- Coffee Transaction Data Loaded: index_1.csv ---
Dataset shape: (3636, 6)

Categorical features used: ['cash_type', 'card', 'coffee_name', 'day_of_week']
Numerical features used: ['transaction_hour', 'is_weekend']
Target variable: 'money' (Continuous Regression)

Training set size: 2908 samples
Test set size: 728 samples

Starting CatBoost Regressor Training...
Training finished.

--- Model Performance Metrics (Predicting Transaction Amount) ---
R-squared (R² Score): 0.8835
Mean Squared Error (MSE): 2.8585

Top 10 Most Important Features for Predicting Transaction Amount:
| feature          |   importance |
|:-----------------|-------------:|
| coffee_name      |    77.8317   |
| day_of_week      |     6.79824  |
| card             |     5.76379  |
| transaction_hour |     5.18435  |
| cash_type        |     4.19021  |
| is_weekend       |     0.231697 |

--- Next Step for DOFEN Comparison ---
This R² score is now a valid, continuous regression metric.
The only thing that can lead t