In [None]:
import sys
import os
# Make the project's ../scripts directory importable. The path is resolved
# against the current working directory, so this assumes the kernel was
# started from the notebook's own folder.
sys.path.append(os.path.join(os.getcwd(), '../scripts'))
# Project-local imports, found through the sys.path entry added above.
from utils.load_env import PATH_DATA
from feature_engineering import FeatureEngineering

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import polars as pl # Polars for memory-efficient data processing
import gc

pl.enable_string_cache() # Enable string cache to handle categorical comparisons

# Show every column when a pandas DataFrame is displayed (no "..." truncation).
pd.set_option('display.max_columns', None)

def show_df(df, polar=True):
    """Print the shape of ``df`` and display a three-row preview.

    The preview is the first two rows followed by the last row, rendered as a
    single pandas DataFrame.

    Parameters
    ----------
    df : polars or pandas DataFrame
        Frame to preview.
    polar : bool, default True
        True when ``df`` is a Polars frame (the slices are converted with
        ``to_pandas()`` before display); False when it is already pandas.
    """
    print(df.shape)
    top_rows, last_row = df.head(2), df.tail(1)
    if polar:
        # Convert only the tiny slices, never the full frame.
        top_rows, last_row = top_rows.to_pandas(), last_row.to_pandas()
    display(pd.concat([top_rows, last_row]))

## Description des données

| Column | Description |
|:-:|-|
| event_time | Time when event happened at (in UTC). |
| event_type | Type of event. |
| product_id | ID of a product. |
| category_id | Product's category ID |
| category_code | Product's category taxonomy (code name) if it was possible to make it. Usually present for meaningful categories and skipped for different kinds of accessories. Can be missing. |
| brand | Downcased string of brand name. Can be missing. |
| price | Float price of the product. |
| user_id | Permanent user ID. |
| user_session | Temporary user's session ID. Same for each user's session. It changes every time the user comes back to the online store after a long pause. |

In [None]:
# Read a one-million-row sample of the October 2019 export.
# All id/text columns are loaded as Categorical and price as Float32;
# event_time is kept as a raw string here and parsed in the next step.
df = pl.read_csv(
    f"{PATH_DATA}/2019-Oct.csv.gz",
    n_rows=1_000_000,
    infer_schema_length=1000,
    try_parse_dates=False,
    schema_overrides={
        'user_id': pl.Categorical,
        'product_id': pl.Categorical,
        'category_id': pl.Categorical,
        'price': pl.Float32,
        'event_type': pl.Categorical,
        'category_code': pl.Categorical,
        'brand': pl.Categorical,
        'user_session': pl.Categorical,
        'event_time': pl.Utf8,
    },
)

# Parse the timestamp strings in place (the expression keeps the column name).
# strict=False: values that do not match the format become null instead of raising.
df = df.with_columns(
    pl.col("event_time").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S %Z", strict=False)
)

show_df(df)

In [None]:
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
# NOTE(review): this silences every Python warning for the rest of the kernel
# session, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# Load all months data
def load_all_data(months=None):
    """Load the monthly event exports and stack them into one Polars frame.

    Parameters
    ----------
    months : sequence of str, optional
        File stems to read; each one is loaded from
        ``{PATH_DATA}/{month}.csv.gz``. When omitted, the seven months from
        '2019-Oct' through '2020-Apr' are used.

    Returns
    -------
    pl.DataFrame
        The successfully loaded months concatenated in the order given, with
        ``event_time`` parsed to a datetime and an extra ``month`` column
        holding the file stem. If no month could be loaded, the notebook-level
        sample frame ``df`` is returned instead, tagged with month "2019-Oct".

    Notes
    -----
    Loading is best-effort: a month that fails (missing file, read error, ...)
    is reported on stdout and skipped rather than aborting the whole load.
    The fallback relies on the global ``df`` created by an earlier cell.
    """
    if months is None:
        months = ['2019-Oct', '2019-Nov', '2019-Dec', '2020-Jan', '2020-Feb', '2020-Mar', '2020-Apr']

    # Same column types for every monthly file, so build the mapping once.
    schema_overrides = {
        'user_id': pl.Categorical,
        'product_id': pl.Categorical,
        'category_id': pl.Categorical,
        'price': pl.Float32,
        'event_type': pl.Categorical,
        'category_code': pl.Categorical,
        'brand': pl.Categorical,
        'user_session': pl.Categorical,
        'event_time': pl.Utf8,  # read as string first, parsed below
    }

    dataframes = []
    for month in months:
        file_path = f"{PATH_DATA}/{month}.csv.gz"
        print(f"Loading {month}...")

        try:
            df_month = pl.read_csv(
                file_path,
                schema_overrides=schema_overrides,
                try_parse_dates=False,
                infer_schema_length=1000
            ).with_columns([
                # strict=False: unparsable timestamps become null instead of raising
                pl.col("event_time").str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S %Z", strict=False).alias("event_time"),
                pl.lit(month).alias("month")
            ])
        except Exception as e:
            # Deliberately broad: any failure on one month must not stop the others.
            print(f"  - {month}: File not found or error - {e}")
            continue

        print(f"  - {month}: {df_month.shape[0]:,} rows")
        dataframes.append(df_month)

    if not dataframes:
        print("No data files found, using sample data")
        return df.with_columns([pl.lit("2019-Oct").alias("month")])

    combined_df = pl.concat(dataframes)
    print(f"\nTotal combined data: {combined_df.shape[0]:,} rows, {combined_df.shape[1]} columns")
    return combined_df

# Load every available month into a single frame, then print its shape and
# preview its first two rows and last row.
df_all = load_all_data()
show_df(df_all)

## Filtrage des utilisateurs

In [None]:
# Count events per user efficiently with Polars
user_counts_df = df_all.group_by("user_id").len().sort("len", descending=True)
user_counts_series = user_counts_df.to_pandas().set_index('user_id')['len']

# Calculate retention metrics
# event_distribution[k] = number of users with exactly k events
event_distribution = user_counts_series.value_counts().sort_index()
max_events = event_distribution.index.max()
users_total = len(user_counts_series)
# Reverse cumulative sum: for each observed count k, users with at least k events
users_with_at_least_n = event_distribution[::-1].cumsum()[::-1]
n_events = np.arange(1, max_events + 1)
# Fill thresholds that no user hits exactly from the NEXT observed count:
# "at least n events" equals "at least k events" for the smallest observed k >= n.
# (A forward fill would reuse the previous, smaller threshold and overstate
# retention, or yield 0 retained users at n=1 when nobody has exactly 1 event.)
users_retained = users_with_at_least_n.reindex(n_events, method='bfill').fillna(0)
# Share of users dropped when requiring at least n events, indexed by n
percent_users_lost = 1 - (users_retained / users_total)

print(f"Total users: {users_total:,}")

In [None]:
# Largest minimum-event threshold shown on the chart
max_n = 10

# Thresholds to label and the matching loss percentages
n_display = np.arange(1, max_n + 1)
percent_display = percent_users_lost.reindex(n_display, method='ffill').fillna(0) * 100

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(n_events, percent_users_lost*100, marker='o', linestyle='-')
# The full curve is drawn; the x-limits zoom in on thresholds 1..max_n
ax.set_xlim(0.95, max_n + 0.5)
ax.set_xticks(n_display)
ax.set_xlabel("Minimum # of Events (n)")
ax.set_ylabel("% of Users Lost")
ax.set_title("Users Lost vs. Minimum Number of Events")
ax.grid(True)

# Print the percentage just above each displayed point
for x, y in zip(n_display, percent_display):
    ax.text(x, y + 1, f"{y:.1f}%", ha='center', va='bottom', fontsize=9)

plt.show()



In [None]:
# Minimum number of events a user needs in order to be kept
# (4 or 5 look like the most relevant thresholds).
min_nb_events = 4

# One row per user that reaches the threshold
has_enough_events = pl.col("len") >= min_nb_events
valid_users_df = df_all.group_by("user_id").len().filter(has_enough_events).select("user_id")

print(f"Users with >= {min_nb_events} events: {valid_users_df.height:,}")

# The inner join keeps only the events that belong to those users
df_filtered = df_all.join(valid_users_df, how="inner", on="user_id")

print(f"Original shape: {df_all.shape}")
print(f"Filtered shape: {df_filtered.shape}")

## Feature Engineering

In [None]:
import importlib
import feature_engineering

# Pick up edits made to feature_engineering.py without restarting the kernel.
importlib.reload(feature_engineering)
# reload() re-executes the module and creates a new class object, but the name
# bound earlier by `from feature_engineering import FeatureEngineering` still
# points at the old one. Rebind it so later cells use the reloaded class.
FeatureEngineering = feature_engineering.FeatureEngineering

In [None]:
fe = FeatureEngineering(df_filtered)

engagement_features = fe.calculate_engagement_features(),
purchase_features = fe.calculate_purchase_features(),
product_preference_features = fe.calculate_product_preference_features(),
temporal_features = fe.calculate_temporal_features(),
rfm_features = fe.calculate_rfm_features(),
behavioral_features = fe.calculate_behavioral_features()

In [None]:
feature_dfs = [
    engagement_features[0],
    purchase_features[0],
    product_preference_features[0],
    temporal_features[0],
    rfm_features[0],
    behavioral_features
]

master_features = fe.all_users
for feature_df in feature_dfs:
    master_features = master_features.join(feature_df, on="user_id", how="left")
            
show_df(master_features)

## Analyse de la distribution des features

In [None]:
def visualize_features_distribution(df):
    """Plot a boxplot and a histogram for every numeric feature in ``df``.

    The figure has one row per feature: the boxplot (annotated with mean,
    median and standard deviation) on the left, and the histogram (annotated
    with skewness, kurtosis and the non-null count) on the right.

    Parameters
    ----------
    df : object exposing ``to_pandas()``
        Feature table containing a ``user_id`` column plus feature columns
        (a Polars DataFrame in this notebook). ``user_id`` is dropped; missing
        values are removed per feature before plotting.

    Returns
    -------
    None
        The figure is shown and then closed. Non-numeric feature columns are
        reported on stdout and skipped; if no numeric feature remains, a
        message is printed and nothing is drawn.
    """
    features = df.to_pandas().drop(columns="user_id")

    # Boxplots, histograms and moment statistics need numeric data.
    numeric_features = features.select_dtypes(include="number")
    skipped = [col for col in features.columns if col not in numeric_features.columns]
    if skipped:
        print(f"Skipping non-numeric features: {skipped}")

    n_features = numeric_features.shape[1]
    if n_features == 0:
        # plt.subplots() cannot create a grid with zero rows.
        print("No numeric features to plot")
        return

    # squeeze=False keeps `axes` two-dimensional even for a single feature.
    fig, axes = plt.subplots(n_features, 2, figsize=(12, 3*n_features), squeeze=False)
    fig.suptitle('Feature Distribution Analysis', fontsize=16)

    for (ax_box, ax_hist), (feature, column) in zip(axes, numeric_features.items()):
        feature_data = column.dropna()

        # Boxplot (left subplot)
        box_plot = ax_box.boxplot(feature_data, vert=True, patch_artist=True)
        box_plot['boxes'][0].set_facecolor('lightblue')
        box_plot['boxes'][0].set_alpha(0.7)
        ax_box.set_title(f'{feature} - Boxplot', fontsize=10)
        ax_box.set_ylabel('Value')
        ax_box.grid(True, alpha=0.3)

        # Summary statistics in the top-left corner of the boxplot
        stats_text = f'Mean: {feature_data.mean():.2f}\nMedian: {feature_data.median():.2f}\nStd: {feature_data.std():.2f}'
        ax_box.text(0.02, 0.98, stats_text, transform=ax_box.transAxes,
                    verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
                    fontsize=8)

        # Histogram (right subplot); bin count is sqrt(N) clamped to [10, 50]
        n_bins = min(50, max(10, int(np.sqrt(len(feature_data)))))
        ax_hist.hist(feature_data, bins=n_bins, alpha=0.7, color='skyblue', edgecolor='black', linewidth=0.5)
        ax_hist.set_title(f'{feature} - Histogram', fontsize=10)
        ax_hist.set_xlabel('Value')
        ax_hist.set_ylabel('Frequency')
        ax_hist.grid(True, alpha=0.3)

        # Shape statistics in the top-right corner of the histogram
        skewness = feature_data.skew()
        kurtosis = feature_data.kurtosis()
        dist_text = f'Skew: {skewness:.2f}\nKurtosis: {kurtosis:.2f}\nN: {len(feature_data):,}'
        ax_hist.text(0.98, 0.98, dist_text, transform=ax_hist.transAxes,
                     verticalalignment='top', horizontalalignment='right',
                     bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
                     fontsize=8)

    plt.tight_layout()
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# Draw one boxplot/histogram figure per feature table in feature_dfs.
for feature_df in feature_dfs:
    visualize_features_distribution(feature_df)