## 1. Import Required Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
import sys
import os

# Apply seaborn's "whitegrid" theme to the plots drawn later in the notebook.
sns.set_theme(style="whitegrid")
# IPython/Jupyter magic (not plain Python): selects matplotlib's inline backend.
%matplotlib inline

# Make the sibling ``src`` directory importable so ``data_loader`` can be found.
# The path is only added once, so re-running the cell does not grow sys.path.
_src_dir = os.path.abspath(os.path.join('..', 'src'))
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

try:
    from data_loader import FitFamDataLoader
except ImportError:
    print("ERREUR : Le fichier data_loader.py n'est pas trouvé dans le dossier.")
    # Re-raise: continuing would only fail a line later with a misleading
    # NameError on FitFamDataLoader, hiding the real cause.
    raise
else:
    print("Module data_loader importé avec succès.")

# Load the unified dataset that the analysis cells below read.
loader = FitFamDataLoader()
unified_df = loader.get_unified_data()

print("Données chargées.")

## 2. Analyse par Année

In [None]:
def _category_diversity(categories):
    """Return distinct categories divided by number of sessions (NaN if empty)."""
    total_sessions = len(categories)
    if total_sessions == 0:
        return np.nan
    return categories.nunique() / total_sessions


def _temporal_balance(dates):
    """Return |share of Monday-Friday sessions - 0.5| (NaN if empty)."""
    total_sessions = len(dates)
    if total_sessions == 0:
        return np.nan
    # weekday < 5 means Monday..Friday in pandas' 0=Monday convention.
    weekday_share = dates.dt.weekday.lt(5).sum() / total_sessions
    return abs(weekday_share - 0.5)


def _report_mann_whitney(h1_data, column, title):
    """Print a two-sided Mann-Whitney U p-value for retained vs churned users.

    Rows where ``column`` is NaN are dropped first. Returns the p-value, or
    NaN (after printing a notice) when either group is empty, since the test
    cannot be computed in that case.
    """
    from scipy.stats import mannwhitneyu

    valid = h1_data.dropna(subset=[column])
    retained = valid.loc[valid['is_retained_3m'] == 1, column]
    churned = valid.loc[valid['is_retained_3m'] == 0, column]

    print(title)
    if retained.empty or churned.empty:
        print("Test non réalisable : le groupe des retenus ou des perdus est vide.")
        return np.nan

    _, p_value = mannwhitneyu(retained, churned, alternative='two-sided')
    print(f"P-value = {p_value:.5f}")
    return p_value


def analyze_year(year):
    """Run the early-activity vs 3-month-retention analysis for one year.

    Reads the module-level ``unified_df`` (columns used: ``start_time``,
    ``user_id``, ``category_name``), keeps the rows whose ``start_time`` falls
    in ``year`` and, for users whose first session of that year is at least
    90 days before the last session of that year, computes per user:

    * ``frequency_14d``: sessions with ``days_since_start <= 14``;
    * ``is_retained_3m``: 1 if the user has a session more than 90 days
      after their first one, else 0;
    * ``category_regularity_14d`` and ``temporal_regularity_14d``: see
      ``_category_diversity`` and ``_temporal_balance``.

    Prints summary statistics and Mann-Whitney U p-values, displays the head
    of the result table, and writes it to ``h1_exploratory_results_{year}.csv``.
    Returns ``None``. If the year has no rows, the analysis is skipped and no
    file is written.

    NOTE(review): ``start_date`` is the first session *within* ``year``, not
    necessarily the user's first-ever session -- confirm this is intended.
    """
    print(f"\n--- Analyse pour l'année {year} ---")
    in_year = unified_df['start_time'].dt.year == year
    # ``assign`` returns a new frame, so the ``date`` column is never written
    # onto a slice of ``unified_df`` (avoids SettingWithCopyWarning).
    yearly_df = (
        unified_df[in_year]
        .assign(date=lambda frame: frame['start_time'])
        .sort_values(['user_id', 'date'])
    )

    print(f"Total lignes : {len(yearly_df)}")
    print(f"Total utilisateurs : {yearly_df['user_id'].nunique()}")
    if yearly_df.empty:
        # Nothing to analyse: every downstream step would operate on NaT/empty data.
        print(f"Aucune donnée pour l'année {year} : analyse ignorée.")
        return
    print(f"Période : du {yearly_df['date'].min()} au {yearly_df['date'].max()}")

    # Feature engineering: days elapsed since each user's first session of the year.
    user_start = yearly_df.groupby('user_id')['date'].min().reset_index()
    user_start.columns = ['user_id', 'start_date']
    df = yearly_df.merge(user_start, on='user_id')
    df['days_since_start'] = (df['date'] - df['start_date']).dt.days

    # Keep only users observed long enough for the 90-day retention flag to be meaningful.
    cutoff_date = df['date'].max() - timedelta(days=90)
    valid_users = user_start.loc[user_start['start_date'] <= cutoff_date, 'user_id']
    df_cohort = df[df['user_id'].isin(valid_users)].copy()

    print(f"Utilisateurs analysables (>90 jours d'ancienneté) : {df_cohort['user_id'].nunique()}")

    early_activity = df_cohort[df_cohort['days_since_start'] <= 14]
    early_by_user = early_activity.groupby('user_id')

    h1_data = early_by_user.size().reset_index(name='frequency_14d')

    active_after_90d = df_cohort.loc[df_cohort['days_since_start'] > 90, 'user_id'].unique()
    h1_data['is_retained_3m'] = h1_data['user_id'].isin(active_after_90d).astype(int)

    # Regularity metrics over the same early-activity window.
    category_reg = (
        early_by_user['category_name']
        .agg(_category_diversity)
        .reset_index(name='category_regularity_14d')
    )
    temporal_reg = (
        early_by_user['date']
        .agg(_temporal_balance)
        .reset_index(name='temporal_regularity_14d')
    )

    h1_data = h1_data.merge(category_reg, on='user_id', how='left')
    h1_data = h1_data.merge(temporal_reg, on='user_id', how='left')

    print("Nouvelles métriques de régularité calculées.")
    display(h1_data.head())

    # Statistical analysis: retained vs churned users on each regularity metric.
    _report_mann_whitney(
        h1_data,
        'category_regularity_14d',
        "Test sur la régularité par catégorie (diversité) :",
    )
    _report_mann_whitney(
        h1_data,
        'temporal_regularity_14d',
        "Test sur la régularité temporelle :",
    )

    # Save results.
    h1_data.to_csv(f'h1_exploratory_results_{year}.csv', index=False)
    print(f"Résultats sauvegardés dans h1_exploratory_results_{year}.csv")

# Run the analysis once for each of the hard-coded years, in order.
for year in (2023, 2024, 2025):
    analyze_year(year)