In [None]:
# Run this cell to download the dataset
!gdown 18eEEtko63fth0HEIFdxEdbjR4DfXiPa7

Downloading...
From: https://drive.google.com/uc?id=18eEEtko63fth0HEIFdxEdbjR4DfXiPa7
To: /content/processed_purchase_propensity_data_v1.parquet
100% 58.0M/58.0M [00:00<00:00, 197MB/s]


In [None]:
import pandas as pd

In [None]:
# Load the parquet file into a DataFrame for interactive inspection.
df = pd.read_parquet("./processed_purchase_propensity_data_v1.parquet")

In [None]:
# Print the column names, dtypes and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2933439 entries, 0 to 2933438
Data columns (total 11 columns):
 #   Column                Dtype         
---  ------                -----         
 0   user_id               int64         
 1   product_id            int64         
 2   event_timestamp       datetime64[ns]
 3   created_timestamp     datetime64[us]
 4   category_code_level1  object        
 5   category_code_level2  object        
 6   brand                 object        
 7   event_weekday         int64         
 8   price                 float64       
 9   activity_count        int64         
 10  is_purchased          int64         
dtypes: datetime64[ns](1), datetime64[us](1), float64(1), int64(5), object(3)
memory usage: 246.2+ MB


In [None]:
# Display the frame; the notebook renders a truncated preview (first and last rows).
df

Unnamed: 0,user_id,product_id,event_timestamp,created_timestamp,category_code_level1,category_code_level2,brand,event_weekday,price,activity_count,is_purchased
0,515903856,2601552,2019-11-17 00:11:39,2026-01-18 22:17:22.150556,unknown,unknown,gorenje,6,486.24,6,0
1,516301799,12702930,2019-11-12 15:40:15,2026-01-18 22:17:22.150556,unknown,unknown,cordiant,1,35.78,2,0
2,516301799,12702930,2019-11-12 15:41:46,2026-01-18 22:17:22.150556,unknown,unknown,cordiant,1,35.78,6,0
3,516301799,12702930,2019-11-12 15:42:05,2026-01-18 22:17:22.150556,unknown,unknown,cordiant,1,35.78,8,0
4,561066382,3800966,2019-11-15 23:36:25,2026-01-18 22:17:22.150556,appliances,iron,elenberg,4,20.57,2,0
...,...,...,...,...,...,...,...,...,...,...,...
2933434,569255486,3300427,2019-11-21 14:21:54,2026-01-18 22:17:22.150556,unknown,unknown,redmond,3,142.84,27,0
2933435,569255486,3300427,2019-11-21 14:30:14,2026-01-18 22:17:22.150556,unknown,unknown,redmond,3,142.84,29,0
2933436,569255486,3300427,2019-11-21 14:30:53,2026-01-18 22:17:22.150556,unknown,unknown,redmond,3,142.84,31,0
2933437,569255486,3300427,2019-11-21 14:32:14,2026-01-18 22:17:22.150556,unknown,unknown,redmond,3,142.84,35,1


In [None]:
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, precision_recall_curve
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from typing import Tuple, Dict, Any

### GLOBAL CONFIGURATION


In [None]:
# Central settings for the run: input data, label, output artifacts,
# train/test split and RandomForest hyper-parameters.
CONFIG = {
    # Input parquet file and the name of the label column.
    "file_path": "/content/processed_purchase_propensity_data_v1.parquet",
    "target_column": "is_purchased",
    # Where the metrics report (JSON) and the fitted model (joblib) are written.
    "output_file": "RandomForest_final_result.json",
    "model_path": "RandomForest_model_final.pkl",
    # Hold-out fraction and seed used for the train/test split.
    "test_size": 0.2,
    "random_state": 89,
    # Columns listed for removal.
    # NOTE(review): 'event_id' is not among the columns reported by df.info();
    # confirm how this list is meant to be consumed.
    "drop_columns": ["user_id", "product_id", "event_id"],
    # Keyword arguments passed to RandomForestClassifier.
    "model_params": {
        "n_estimators": 200,
        "max_depth": 7,
        "n_jobs": -1,
        "max_features": 0.8,
        "max_samples": 0.8,
        "bootstrap": True,
        "criterion": "gini",
        "random_state": 89,
    },
}

### Data Loading

In [None]:
def load_data(file_path: str) -> pd.DataFrame:
  """
  Reads a parquet file into a DataFrame.

  Args:
    file_path: Location of the parquet file.

  Returns:
    The DataFrame produced by pd.read_parquet.

  Raises:
    FileNotFoundError: Re-raised with an "[ERROR]"-prefixed message when the
      read fails because the file is missing.
  """
  print(f"[INFO] Loading data from: {file_path}")
  try:
    return pd.read_parquet(file_path)
  except FileNotFoundError:
    raise FileNotFoundError(f"[ERROR] File not found at {file_path}")

### Feature Engineering

In [None]:
def final_FE(df: pd.DataFrame) -> pd.DataFrame:
  """
  Adds the engineered feature columns to a copy of the input frame.

  Columns appended, in order: price_log, activity_log, event_ts, event_hour,
  is_weekend, user_total_events, user_purchase_rate, product_total_events,
  product_purchase_rate, user_total_log, product_total_log,
  hours_since_event, hours_log.

  Reads the columns price, activity_count, event_timestamp, user_id,
  product_id and is_purchased. The input frame itself is not modified.

  NOTE(review): user_purchase_rate and product_purchase_rate are means of
  is_purchased over every row passed in, including the row being described.
  main() calls this before the train/test split, so labels of held-out rows
  flow into these features. This looks like target leakage; consider
  computing these statistics on the training split only.
  """

  def _with_purchase_stats(frame: pd.DataFrame, key: str, prefix: str) -> pd.DataFrame:
    # Per-key count and mean of the label, joined back onto every row.
    stats = (
        frame.groupby(key)['is_purchased']
        .agg(['count', 'mean'])
        .rename(columns={
            'count': f'{prefix}_total_events',
            'mean': f'{prefix}_purchase_rate',
        })
        .reset_index()
    )
    return frame.merge(stats, on=key, how='left')

  out = df.copy()

  # Compress the skewed magnitudes with log(1 + x).
  for source_col, log_col in (('price', 'price_log'), ('activity_count', 'activity_log')):
    out[log_col] = np.log1p(out[source_col])

  # Calendar features derived from the event timestamp.
  out['event_ts'] = pd.to_datetime(out['event_timestamp'])
  out['event_hour'] = out['event_ts'].dt.hour
  out['is_weekend'] = (out['event_ts'].dt.weekday >= 5).astype(int)

  # Aggregates per user first, then per product.
  out = _with_purchase_stats(out, 'user_id', 'user')
  out = _with_purchase_stats(out, 'product_id', 'product')

  # Log-scaled versions of the two event counts.
  for prefix in ('user', 'product'):
    out[f'{prefix}_total_log'] = np.log1p(out[f'{prefix}_total_events'])

  # Recency: hours between each event and the latest event in the frame.
  latest_event = out['event_ts'].max()
  out['hours_since_event'] = (latest_event - out['event_ts']).dt.total_seconds() / 3600
  out['hours_log'] = np.log1p(out['hours_since_event'])

  return out


In [None]:
def drop_non_numeric_columns(df: pd.DataFrame) -> pd.DataFrame:
  """
  Keeps only the columns selected by the 'int', 'float' and 'bool' dtypes.

  Other columns (for example object or datetime columns) are left out of the
  returned frame. Integer-typed identifier columns are kept.
  """
  kept_dtypes = ['int', 'float', 'bool']
  numeric_only = df.select_dtypes(include=kept_dtypes)
  return numeric_only


### Pipeline Construction

In [None]:
def build_pipeline(numeric_cols: list, categorical_cols: list, model_params: Dict) -> ImbPipeline:
  """
  Builds an imblearn Pipeline whose single step is a RandomForestClassifier.

  Args:
    numeric_cols: Accepted but not used; no preprocessing step is added.
    categorical_cols: Accepted but not used; no preprocessing step is added.
    model_params: Keyword arguments for RandomForestClassifier.

  Returns:
    An unfitted pipeline with one step named 'classifier'.
  """
  forest = RandomForestClassifier(**model_params)
  return ImbPipeline(steps=[('classifier', forest)])


### Training Execution

In [None]:
def train_model(df: pd.DataFrame, target_col: str) -> Tuple[Any, Any, Any]:
  """
  Splits data, builds pipeline, and fits the model.

  The feature matrix is df without the target column and without the columns
  listed in CONFIG["drop_columns"], so identifier columns are not used as
  features. The split is stratified on the target and uses
  CONFIG["test_size"] and CONFIG["random_state"].

  Args:
    df: Frame holding the features and the target column.
    target_col: Name of the label column in df.

  Returns:
    Tuple of (fitted pipeline, X_test, y_test).

  Raises:
    KeyError: If target_col is not a column of df.
  """
  print("[PROCESS] Splitting data and training model...")

  # Read the label first so a missing target still raises KeyError.
  y = df[target_col]

  # Drop the label plus the configured columns. errors="ignore" tolerates
  # configured names that are absent from df.
  excluded_cols = [target_col, *CONFIG.get("drop_columns", [])]
  X = df.drop(columns=excluded_cols, errors="ignore")

  # Split Data
  X_train, X_test, y_train, y_test = train_test_split(
      X, y,
      test_size=CONFIG["test_size"],
      random_state=CONFIG["random_state"],
      stratify=y
  )

  # Identify Column Types
  num_cols = X.select_dtypes(include=['number']).columns.tolist()
  cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

  print(f"[INFO] Numeric Features: {len(num_cols)}")
  print(f"[INFO] Categorical Features: {len(cat_cols)}")

  # Build and Fit Pipeline
  pipeline = build_pipeline(num_cols, cat_cols, CONFIG["model_params"])
  pipeline.fit(X_train, y_train)

  return pipeline, X_test, y_test

### Optimization and Evaluation

In [None]:
def find_optimal_threshold(pipeline, X_test, y_test) -> float:
  """
  Finds the threshold that maximizes F1-Score.

  Args:
    pipeline: Fitted estimator exposing predict_proba.
    X_test: Features to score.
    y_test: True binary labels for X_test.

  Returns:
    The decision threshold (as a Python float) with the highest F1 on the
    given data. When every F1 is zero, the first threshold is returned.

  NOTE(review): main() tunes this threshold on the same X_test/y_test that
  calculate_metrics later scores, so the reported metrics are optimistic.
  Consider tuning on a separate validation split.
  """
  print("[PROCESS] Tuning decision threshold...")
  probs = pipeline.predict_proba(X_test)[:, 1]
  precisions, recalls, thresholds = precision_recall_curve(y_test, probs)

  # precision_recall_curve returns one more precision/recall value than
  # thresholds: the final (precision=1, recall=0) point has no threshold.
  # Drop it so the F1 array and the thresholds line up index for index.
  precisions, recalls = precisions[:-1], recalls[:-1]

  # Guarded division: where precision + recall == 0 the F1 is defined as 0,
  # which avoids 0/0 RuntimeWarnings and NaN clean-up afterwards.
  denominator = precisions + recalls
  f1_scores = np.divide(
      2 * precisions * recalls,
      denominator,
      out=np.zeros_like(denominator, dtype=float),
      where=denominator > 0,
  )

  best_idx = int(np.argmax(f1_scores))
  best_thresh = float(thresholds[best_idx])

  print(f"[INFO] Optimal Threshold found: {best_thresh:.4f} (Max F1: {f1_scores[best_idx]:.4f})")
  return best_thresh

### Calculate Metrics

In [None]:
def calculate_metrics(pipeline, X_test, y_test, threshold: float) -> Dict[str, float]:
  """
  Scores the pipeline on the given data at a fixed decision threshold.

  Args:
    pipeline: Fitted estimator exposing predict_proba.
    X_test: Features to score.
    y_test: True labels for X_test.
    threshold: Probabilities at or above this value are predicted as 1.

  Returns:
    Dict with keys accuracy, f1, recall, precision and roc_auc, in that order.
  """
  positive_scores = pipeline.predict_proba(X_test)[:, 1]
  hard_labels = (positive_scores >= threshold).astype(int)

  # These four metrics depend on the thresholded labels.
  label_scorers = {
      "accuracy": accuracy_score,
      "f1": f1_score,
      "recall": recall_score,
      "precision": precision_score,
  }
  results = {name: scorer(y_test, hard_labels) for name, scorer in label_scorers.items()}

  # ROC AUC is computed from the raw probabilities, not the hard labels.
  results["roc_auc"] = roc_auc_score(y_test, positive_scores)
  return results


### Save File

In [None]:

def save_artifacts(pipeline, metrics: Dict, params: Dict):
  """
  Persists the fitted pipeline and a JSON report.

  The pipeline is written with joblib to CONFIG["model_path"]. The report is
  written to CONFIG["output_file"] and holds the metrics, the params, and
  every CONFIG entry except "model_params".

  Args:
    pipeline: Object to serialise with joblib.
    metrics: Metric name -> value mapping stored under "metrics".
    params: Parameter mapping stored under "params".
  """
  # Model first, then the report.
  joblib.dump(pipeline, CONFIG["model_path"])
  print(f"[SAVE] Model saved to: {CONFIG['model_path']}")

  # model_params is left out of the config snapshot; the caller supplies the
  # parameters separately through `params`.
  config_snapshot = {}
  for key, value in CONFIG.items():
    if key != "model_params":
      config_snapshot[key] = value

  report = {"metrics": metrics, "params": params, "config": config_snapshot}
  with open(CONFIG["output_file"], "w", encoding="utf-8") as handle:
    json.dump(report, handle, indent=4, ensure_ascii=False)
  print(f"[SAVE] Metrics saved to: {CONFIG['output_file']}")

### Main Controller

In [None]:
def main():
  """
  Runs the workflow end to end: load, engineer features, keep numeric
  columns, train, tune the threshold, evaluate, print and save.
  """
  # Load -> feature engineering -> numeric-only frame
  features = drop_non_numeric_columns(final_FE(load_data(CONFIG["file_path"])))

  # Fit the model, then pick the threshold and score on the held-out split
  pipeline, X_test, y_test = train_model(features, CONFIG["target_column"])
  best_threshold = find_optimal_threshold(pipeline, X_test, y_test)
  metrics = calculate_metrics(pipeline, X_test, y_test, best_threshold)

  # Console report
  rule = "=" * 40
  print("\n" + rule)
  print("FINAL RESULTS (Optimized)")
  print("-" * 40)
  for k, v in metrics.items():
    print(f"{k.ljust(15)}: {v:.4f}")
  print(rule)

  # Store the tuned threshold next to the hyper-parameters (copy, so CONFIG
  # itself is left untouched).
  final_params = {**CONFIG["model_params"], 'optimal_threshold': float(best_threshold)}
  save_artifacts(pipeline, metrics, final_params)

if __name__ == "__main__":
  main()

[INFO] Loading data from: /content/processed_purchase_propensity_data_v1.parquet
[PROCESS] Splitting data and training model...
[INFO] Numeric Features: 17
[INFO] Categorical Features: 0
[PROCESS] Tuning decision threshold...
[INFO] Optimal Threshold found: 0.3808 (Max F1: 0.7379)

FINAL RESULTS (Optimized)
----------------------------------------
accuracy       : 0.8487
f1             : 0.7379
recall         : 0.8184
precision      : 0.6718
roc_auc        : 0.9316
[SAVE] Model saved to: RandomForest_model_final.pkl
[SAVE] Metrics saved to: RandomForest_final_result.json


### Read back the saved JSON results and joblib model

In [None]:
import json
import joblib

# Read back the JSON report (metrics, params, config) and pretty-print it
json_file_path = "RandomForest_final_result.json"
with open(json_file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)
print(f"Content of {json_file_path}:\n{json.dumps(json_data, indent=4, ensure_ascii=False)}")

# Visual separator between the two outputs
print("\n" + "="*50 + "\n")

# Load the serialised model with joblib and print its repr.
# joblib.load unpickles the file, so only load files from a trusted source.
pkl_file_path = "RandomForest_model_final.pkl"
loaded_model = joblib.load(pkl_file_path)
print(f"Successfully loaded model from {pkl_file_path}:\n{loaded_model}")

Content of RandomForest_final_result.json:
{
    "metrics": {
        "accuracy": 0.8486844796552853,
        "f1": 0.7378647083661833,
        "recall": 0.8184021432267615,
        "precision": 0.6717582287600675,
        "roc_auc": 0.9316426728477253
    },
    "params": {
        "n_estimators": 200,
        "max_depth": 7,
        "n_jobs": -1,
        "max_features": 0.8,
        "max_samples": 0.8,
        "bootstrap": true,
        "criterion": "gini",
        "random_state": 89,
        "optimal_threshold": 0.38082441268314793
    },
    "config": {
        "file_path": "/content/processed_purchase_propensity_data_v1.parquet",
        "target_column": "is_purchased",
        "output_file": "RandomForest_final_result.json",
        "model_path": "RandomForest_model_final.pkl",
        "test_size": 0.2,
        "random_state": 89,
        "drop_columns": [
            "user_id",
            "product_id",
            "event_id"
        ]
    }
}


Successfully loaded model from Ra