In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Location of the feature-engineered event table, relative to the notebook.
DATA_PATH = "./df_processed_fe_optimized_v2.csv"

# Read the whole CSV into memory and show (rows, columns) as the cell output.
df = pd.read_csv(DATA_PATH)
df.shape

(2929997, 31)

In [3]:
# Preview the first rows of the loaded frame to eyeball column names and values.
df.head()

Unnamed: 0,user_id,product_id,event_timestamp,created_timestamp,category_code_level1,category_code_level2,brand,event_weekday,price,activity_count,...,product_total_views,product_total_carts,product_total_purchases,product_view_to_cart_rate,product_cart_to_purchase_rate,product_unique_buyers,brand_purchase_rate,price_vs_user_avg,price_vs_category_avg,is_purchased
0,94566147,1005007,2019-11-12 15:04:08,2026-01-18 22:17:22.150556,electronics,smartphone,xiaomi,1,93.78,3,...,26505,2589,654,0.09768,0.252607,548,0.262642,1.0,0.221107,0
1,176495092,6301929,2019-11-08 14:01:42,2026-01-18 22:17:22.150556,appliances,kitchen,polaris,4,28.31,3,...,144,3,1,0.020833,0.333333,1,0.303095,1.0,0.120976,0
2,239198635,1003942,2019-11-09 15:29:59,2026-01-18 22:17:22.150556,electronics,smartphone,xiaomi,5,187.24,3,...,6618,84,22,0.012693,0.261905,19,0.262642,1.0,0.441459,0
3,239198635,1003942,2019-11-09 15:30:54,2026-01-18 22:17:22.150556,electronics,smartphone,xiaomi,5,187.24,5,...,6618,84,22,0.012693,0.261905,19,0.262642,1.0,0.441459,0
4,269003139,6000032,2019-11-26 14:38:48,2026-01-18 22:17:22.150556,auto,accessories,cenmax,1,66.39,12,...,9319,291,84,0.031227,0.28866,72,0.291525,1.0,0.443217,0


In [4]:
# Remove the identifier and timestamp columns; none of them is used as a
# model feature further down.
# `df` is rebound instead of mutated with inplace=True, and errors="ignore"
# lets this cell be re-run on the same kernel without a KeyError once the
# columns are already gone. The redundant axis=1 is dropped because
# `columns=` already selects the axis.
df = df.drop(
    columns=["user_id", "product_id", "event_timestamp", "created_timestamp"],
    errors="ignore",
)

In [5]:
# Print the column list with dtypes to confirm what is left after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2929997 entries, 0 to 2929996
Data columns (total 27 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   category_code_level1           object 
 1   category_code_level2           object 
 2   brand                          object 
 3   event_weekday                  int64  
 4   price                          float64
 5   activity_count                 int64  
 6   event_hour                     int64  
 7   user_total_events              int64  
 8   user_total_views               int64  
 9   user_total_carts               int64  
 10  user_total_purchases           int64  
 11  user_view_to_cart_rate         float64
 12  user_cart_to_purchase_rate     float64
 13  user_avg_purchase_price        float64
 14  user_unique_products           int64  
 15  user_unique_categories         int64  
 16  product_total_events           int64  
 17  product_total_views            int64  
 18  pr

In [6]:
# Summary statistics (count, mean, std, quantiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,event_weekday,price,activity_count,event_hour,user_total_events,user_total_views,user_total_carts,user_total_purchases,user_view_to_cart_rate,user_cart_to_purchase_rate,...,product_total_views,product_total_carts,product_total_purchases,product_view_to_cart_rate,product_cart_to_purchase_rate,product_unique_buyers,brand_purchase_rate,price_vs_user_avg,price_vs_category_avg,is_purchased
count,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,...,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0,2929997.0
mean,3.77729,290.4006,8.27971,13.83359,98.07697,82.70603,11.70834,3.6626,0.3009591,0.2914533,...,87416.12,10269.87,3570.683,0.07702363,0.3021128,2466.975,0.3027488,1.24451,0.8686448,0.2593491
std,1.890479,335.6997,11.47958,6.63273,143.4868,128.1553,22.86118,12.323,0.8243151,0.3547166,...,144622.9,19674.46,7211.095,0.04362758,0.1181646,4830.601,0.04534799,3.467046,0.9155397,0.4382775
min,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.000315557,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,74.39,2.0,11.0,23.0,17.0,3.0,0.0,0.07179487,0.0,...,2509.0,112.0,30.0,0.04456947,0.25,25.0,0.2707714,1.0,0.2970721,0.0
50%,4.0,168.7,5.0,15.0,53.0,43.0,6.0,1.0,0.1578947,0.1818182,...,17149.0,1072.0,320.0,0.07191781,0.307311,261.0,0.3094446,1.0,0.5340461,0.0
75%,5.0,349.64,9.0,19.0,117.0,98.0,12.0,3.0,0.3333333,0.5,...,104987.0,9035.0,2600.0,0.1045576,0.3502199,1912.0,0.3349302,1.0,1.147641,1.0
max,6.0,2574.07,418.0,23.0,22929.0,22926.0,719.0,519.0,73.0,27.0,...,583010.0,81205.0,32321.0,2.116279,22.0,21469.0,3.0,745.0127,22.4903,1.0


In [7]:
# Summary of the object-dtype columns: count, number of unique values,
# most frequent value and its frequency.
df.describe(include='object')

Unnamed: 0,category_code_level1,category_code_level2,brand
count,2929997,2929997,2929997
unique,14,58,3058
top,electronics,smartphone,samsung
freq,1473816,1119378,572279


In [8]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [9]:
# Keep only the first occurrence of rows that are identical across all columns.
df = df.drop_duplicates()

In [10]:
# Feature configuration: the numerical columns are declared group by group and
# then concatenated, so the final order matches the listing printed below.

# Original event-level columns
_BASE_FEATURES = ["price", "activity_count", "event_weekday"]

# Hour of the event
_HOUR_FEATURES = ["event_hour"]

# Per-user aggregates
_USER_FEATURES = [
    "user_total_events",
    "user_total_views",
    "user_total_carts",
    "user_total_purchases",
    "user_view_to_cart_rate",
    "user_cart_to_purchase_rate",
    "user_avg_purchase_price",
    "user_unique_products",
    "user_unique_categories",
]

# Per-product aggregates
_PRODUCT_FEATURES = [
    "product_total_events",
    "product_total_views",
    "product_total_carts",
    "product_total_purchases",
    "product_view_to_cart_rate",
    "product_cart_to_purchase_rate",
    "product_unique_buyers",
]

# Brand rate and relative-price columns
_BRAND_PRICE_FEATURES = [
    "brand_purchase_rate",
    "price_vs_user_avg",
    "price_vs_category_avg",
]

NUMERICAL_FEATURES = [
    *_BASE_FEATURES,
    *_HOUR_FEATURES,
    *_USER_FEATURES,
    *_PRODUCT_FEATURES,
    *_BRAND_PRICE_FEATURES,
]
CATEGORICAL_FEATURES = ["brand", "category_code_level1", "category_code_level2"]
TARGET = "is_purchased"

# Model input columns: numerical first, categorical last.
ALL_FEATURES = [*NUMERICAL_FEATURES, *CATEGORICAL_FEATURES]

# Report how many features of each kind were configured.
print(f"Numerical features: {len(NUMERICAL_FEATURES)}")
print(f"Categorical features: {len(CATEGORICAL_FEATURES)}")
print(f"Total features: {len(ALL_FEATURES)}")

# Numbered listing (1-based) of the numerical features.
print("\nNumerical features list:")
for position, feature in enumerate(NUMERICAL_FEATURES, start=1):
    print(f"  {position}. {feature}")

Numerical features: 23
Categorical features: 3
Total features: 26

Numerical features list:
  1. price
  2. activity_count
  3. event_weekday
  4. event_hour
  5. user_total_events
  6. user_total_views
  7. user_total_carts
  8. user_total_purchases
  9. user_view_to_cart_rate
  10. user_cart_to_purchase_rate
  11. user_avg_purchase_price
  12. user_unique_products
  13. user_unique_categories
  14. product_total_events
  15. product_total_views
  16. product_total_carts
  17. product_total_purchases
  18. product_view_to_cart_rate
  19. product_cart_to_purchase_rate
  20. product_unique_buyers
  21. brand_purchase_rate
  22. price_vs_user_avg
  23. price_vs_category_avg


In [11]:
# Split the cleaned frame into model inputs (X) and the label (y).
# Copies are taken so later edits to X / y never touch `df`.
X = df.loc[:, ALL_FEATURES].copy()
y = df[TARGET].copy()

# Cast every categorical column to str in one pass.
X = X.astype({column: str for column in CATEGORICAL_FEATURES})

# Replace any null that is still present with 0.
# NOTE(review): rows containing nulls are dropped from `df` in an earlier cell,
# so this is presumably a no-op safeguard - confirm before relying on it.
X = X.fillna(0)

# Class balance of the target, as absolute counts and percentages.
not_purchased = y == 0
purchased = y == 1

print("\nTarget distribution:")
print(f"  Class 0 (Not Purchased): {not_purchased.sum():,} ({not_purchased.mean() * 100:.2f}%)")
print(f"  Class 1 (Purchased):     {purchased.sum():,} ({purchased.mean() * 100:.2f}%)")


Target distribution:
  Class 0 (Not Purchased): 2,137,553 (73.95%)
  Class 1 (Purchased):     753,162 (26.05%)


In [12]:
# Build the preprocessing step: scale the numerical columns, one-hot encode
# the categorical ones, and drop every column not listed in either group.

# Standardises each numerical column.
numeric_scaler = StandardScaler()

# Dense one-hot encoding. Categories unseen during fit are ignored at transform
# time, and max_categories=100 limits how many output columns a single
# categorical feature can produce.
categorical_encoder = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    max_categories=100,
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_scaler, NUMERICAL_FEATURES),
        ("cat", categorical_encoder, CATEGORICAL_FEATURES),
    ],
    remainder="drop",
)

In [13]:
# Hold out part of the data for evaluation.
# `train_test_split` is imported in the notebook's top import cell so that all
# dependencies are visible in one place.
TEST_SIZE = 0.2      # fraction of rows reserved for the test split
RANDOM_STATE = 42    # fixed seed so the split is reproducible across runs

# stratify=y keeps the class proportions of the target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# Sanity check: features and labels must stay row-aligned within each split,
# because later cells join them back together by index.
assert X_train.index.equals(y_train.index)
assert X_test.index.equals(y_test.index)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(2312572, 26)
(2312572,)
(578143, 26)
(578143,)


In [14]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
X_train_arr = preprocessor.fit_transform(X_train)
X_test_arr = preprocessor.transform(X_test)

# Column names of the transformed matrix
feature_names = preprocessor.get_feature_names_out()

# Wrap the arrays in DataFrames, keeping the original row index so the
# concatenation below lines up row by row
X_train_df, X_test_df = (
    pd.DataFrame(matrix, columns=feature_names, index=source.index)
    for matrix, source in ((X_train_arr, X_train), (X_test_arr, X_test))
)

# Labels as Series named "target", carrying the same index as the features
y_train_s, y_test_s = (
    pd.Series(labels, name="target", index=source.index)
    for labels, source in ((y_train, X_train), (y_test, X_test))
)

# Final tables: transformed features followed by the target column
train_df = pd.concat([X_train_df, y_train_s], axis=1)
test_df = pd.concat([X_test_df, y_test_s], axis=1)

In [15]:
# Write both splits to CSV without the index column, then report their shapes.
for csv_path, split_frame in (("train.csv", train_df), ("test.csv", test_df)):
    split_frame.to_csv(csv_path, index=False)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (2312572, 196)
Test shape: (578143, 196)
