In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print
# its full path, in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
"""
INF6083 - Projet P1 : T√¢che 0 - √âchantillonnage Strat√©gique
BLOC 1 : Configuration et Imports
"""

import pandas as pd
import numpy as np
from datetime import datetime
import json
import gzip
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import warnings
import os
import pickle
warnings.filterwarnings('ignore')

# Global setup: seed NumPy's legacy global RNG and pick the seaborn theme.
np.random.seed(42)
sns.set_style("whitegrid")

_BANNER = "=" * 70
print(_BANNER)
print("√âCHANTILLONNAGE STRAT√âGIQUE - AMAZON BOOKS 2023")
print(_BANNER)

# ============================================================================
# PARAMETER CONFIGURATION
# ============================================================================
# Both raw files are resolved from the same Hugging Face dataset location.
_HF_RAW_BASE = "https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw"
REVIEWS_URL = f"{_HF_RAW_BASE}/review_categories/Books.jsonl"
METADATA_URL = f"{_HF_RAW_BASE}/meta_categories/meta_Books.jsonl"

# Sampling parameters
MIN_REVIEWS_PER_USER = 20
TARGET_USERS = 50000
TARGET_REVIEWS_MIN = 500000
TARGET_REVIEWS_MAX = 2000000

# Temporal parameters (both years are inclusive in the later filter)
START_YEAR = 2020
END_YEAR = 2023

# Directory holding the intermediate checkpoint files
CHECKPOINT_DIR = "checkpoints"
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Echo the effective configuration, one line per setting.
for summary_line in (
    "\n‚úÖ Configuration charg√©e avec succ√®s!",
    f"   ‚Ä¢ MIN_REVIEWS_PER_USER: {MIN_REVIEWS_PER_USER}",
    f"   ‚Ä¢ TARGET_USERS: {TARGET_USERS:,}",
    f"   ‚Ä¢ P√âRIODE TEMPORELLE: {START_YEAR}-{END_YEAR}",
    f"   ‚Ä¢ CHECKPOINTS: {CHECKPOINT_DIR}/",
):
    print(summary_line)

In [None]:
"""
BLOC 2 : Fonction de chargement et analyse de l'√©chantillon initial
"""

import requests

def load_jsonl_streaming(url, max_lines=None, show_progress=True, timeout=(10, 300)):
    """Stream a remote JSON Lines file over HTTP into a DataFrame.

    Args:
        url: HTTP(S) address of the JSONL file.
        max_lines: Stop after this many successfully parsed records.
            ``None`` (or 0) reads the whole file.
        show_progress: When False, the tqdm progress bar is disabled.
        timeout: Passed to ``requests.get`` as (connect, read) seconds so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        pandas.DataFrame with one row per parsed JSON object. Blank lines and
        lines that are not valid JSON are skipped silently.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    print(f"\nüì• Chargement depuis: {url}")

    data = []

    # The context manager closes the streamed response (and releases the
    # connection) even when we stop early at max_lines or an error occurs.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # JSON is UTF-8; do not rely on the encoding requests guesses from the
        # headers (ISO-8859-1 for text/* without a charset).
        response.encoding = "utf-8"

        lines = response.iter_lines(decode_unicode=True)
        for line in tqdm(lines, desc="Lecture", disable=not show_progress):
            if not line:
                continue

            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Best-effort load: a malformed line is ignored, not fatal.
                continue

            if max_lines and len(data) >= max_lines:
                break

    print(f"‚úÖ {len(data):,} lignes charg√©es")
    return pd.DataFrame(data)

# ============================================================================
# INITIAL SAMPLE LOADING (skipped when a checkpoint already exists)
# ============================================================================
SAMPLE_FILE = f"{CHECKPOINT_DIR}/sample_initial.pkl"

if not os.path.exists(SAMPLE_FILE):
    # No checkpoint yet: stream the first 100,000 reviews and cache them.
    print("\n" + "="*70)
    print("PHASE 1: CHARGEMENT DE L'√âCHANTILLON INITIAL")
    print("="*70)

    sample_df = load_jsonl_streaming(REVIEWS_URL, max_lines=100000)

    sample_df.to_pickle(SAMPLE_FILE)
    print(f"\nüíæ Checkpoint sauvegard√©: {SAMPLE_FILE}")
else:
    # Reuse the pickled sample written by a previous run.
    print("\n‚ôªÔ∏è  √âchantillon initial d√©tect√© - Chargement depuis le checkpoint...")
    sample_df = pd.read_pickle(SAMPLE_FILE)
    print(f"‚úÖ {len(sample_df):,} lignes charg√©es depuis {SAMPLE_FILE}")

# Quick look at the structure of the sample
print("\nüìã Structure des donn√©es:")
print(sample_df.head(3))
print(f"\nüìè Colonnes: {list(sample_df.columns)}")
print(f"üìè Lignes: {len(sample_df):,}")

# Guess the timestamp unit from the first row: values above 1e12 are treated
# as milliseconds, anything else as seconds.
sample_timestamp = sample_df['timestamp'].iloc[0]
print(f"\nüïê Format timestamp:")
if sample_timestamp > 1e12:
    unit_label, example_label, example_seconds = "millisecondes", "Exemple converti", sample_timestamp/1000
else:
    unit_label, example_label, example_seconds = "secondes", "Exemple", sample_timestamp
print(f"   ‚Ä¢ Format: {unit_label}")
print(f"   ‚Ä¢ {example_label}: {datetime.fromtimestamp(example_seconds).strftime('%Y-%m-%d')}")

# Preliminary per-user statistics on the sample
user_counts = sample_df['user_id'].value_counts()
n_users_at_threshold = (user_counts >= MIN_REVIEWS_PER_USER).sum()
print(f"\nüìä Statistiques (√©chantillon):")
print(f"   ‚Ä¢ Utilisateurs uniques: {len(user_counts):,}")
print(f"   ‚Ä¢ Moyenne reviews/user: {user_counts.mean():.2f}")
print(f"   ‚Ä¢ M√©diane: {user_counts.median():.0f}")
print(f"   ‚Ä¢ Utilisateurs ‚â•{MIN_REVIEWS_PER_USER} reviews: {n_users_at_threshold:,}")

print("\n‚úÖ BLOC 2 TERMIN√â")

In [None]:
"""
BLOC 3 : PASSE 1 - Comptage des √©valuations par utilisateur
‚è±Ô∏è Dur√©e estim√©e: ~15 minutes
"""

USER_COUNTS_FILE = f"{CHECKPOINT_DIR}/user_counts.pkl"
SELECTED_USERS_FILE = f"{CHECKPOINT_DIR}/selected_users.pkl"

# NOTE(review): pickle.load can execute arbitrary code. It is used here only
# on checkpoint files written by this notebook; never point these paths at
# files from an untrusted source.
if os.path.exists(SELECTED_USERS_FILE):
    print("\n‚ôªÔ∏è  Utilisateurs d√©j√† s√©lectionn√©s - Skip PASSE 1")
    with open(SELECTED_USERS_FILE, 'rb') as f:
        selected_users = pickle.load(f)
    print(f"‚úÖ {len(selected_users):,} utilisateurs charg√©s depuis {SELECTED_USERS_FILE}")

else:
    print("\n" + "="*70)
    print("PHASE 3: PASSE 1 - COMPTAGE DES UTILISATEURS")
    print("="*70)
    print(f"\nüéØ Objectif: Identifier utilisateurs avec ‚â•{MIN_REVIEWS_PER_USER} √©valuations")

    # Reuse a previous count if one was checkpointed
    if os.path.exists(USER_COUNTS_FILE):
        print("\n‚ôªÔ∏è  Comptage existant d√©tect√© - Chargement...")
        with open(USER_COUNTS_FILE, 'rb') as f:
            user_review_counts = pickle.load(f)
        print(f"‚úÖ {len(user_review_counts):,} utilisateurs charg√©s")
    else:
        print("\nüì• Comptage en cours (peut prendre ~15 minutes)...")
        user_review_counts = Counter()

        # Close the stream when done, bound the wait on a stalled server and
        # fail fast on an HTTP error instead of "parsing" the error page.
        with requests.get(REVIEWS_URL, stream=True, timeout=(10, 300)) as response:
            response.raise_for_status()
            response.encoding = "utf-8"

            for line in tqdm(response.iter_lines(decode_unicode=True), desc="Comptage"):
                if not line:
                    continue
                try:
                    review = json.loads(line)
                    user_review_counts[review['user_id']] += 1
                except (json.JSONDecodeError, KeyError):
                    # Malformed line or record without a user_id: skip it.
                    continue

        # Never checkpoint an empty count: it would be silently reused by
        # every later run and yield an empty user selection.
        if not user_review_counts:
            raise RuntimeError(f"No review could be parsed from {REVIEWS_URL}")

        # Checkpoint the count
        with open(USER_COUNTS_FILE, 'wb') as f:
            pickle.dump(user_review_counts, f)
        print(f"\nüíæ Comptage sauvegard√©: {USER_COUNTS_FILE}")
        print(f"‚úÖ {len(user_review_counts):,} utilisateurs uniques")

    # Users with at least MIN_REVIEWS_PER_USER reviews
    active_users = [user for user, count in user_review_counts.items()
                    if count >= MIN_REVIEWS_PER_USER]

    print(f"\nüìä Utilisateurs actifs (‚â•{MIN_REVIEWS_PER_USER} reviews): {len(active_users):,}")

    # Draw TARGET_USERS users without replacement (uses the global NumPy RNG)
    if len(active_users) > TARGET_USERS:
        # .tolist() turns the np.str_ items back into plain str so the pickled
        # set does not carry NumPy scalar types.
        selected_users = set(np.random.choice(active_users, size=TARGET_USERS, replace=False).tolist())
        print(f"‚úÖ {TARGET_USERS:,} utilisateurs s√©lectionn√©s al√©atoirement")
    else:
        selected_users = set(active_users)
        print(f"‚ö†Ô∏è  {len(selected_users):,} utilisateurs disponibles (< {TARGET_USERS:,})")

    # Checkpoint the selection
    with open(SELECTED_USERS_FILE, 'wb') as f:
        pickle.dump(selected_users, f)
    print(f"üíæ S√©lection sauvegard√©e: {SELECTED_USERS_FILE}")

print(f"\n‚úÖ BLOC 3 TERMIN√â - {len(selected_users):,} utilisateurs s√©lectionn√©s")

In [None]:
"""
BLOC 4 : PASSE 2 - Extraction des √©valuations des utilisateurs s√©lectionn√©s
‚è±Ô∏è Dur√©e estim√©e: ~12 minutes
"""

ACTIVE_SAMPLE_FILE = f"{CHECKPOINT_DIR}/df_active.pkl"

if os.path.exists(ACTIVE_SAMPLE_FILE):
    print("\n‚ôªÔ∏è  √âchantillon 'Utilisateurs Actifs' d√©j√† cr√©√© - Skip PASSE 2")
    df_active = pd.read_pickle(ACTIVE_SAMPLE_FILE)
    print(f"‚úÖ {len(df_active):,} reviews charg√©es depuis {ACTIVE_SAMPLE_FILE}")

else:
    print("\n" + "="*70)
    print("PHASE 4: PASSE 2 - EXTRACTION DES √âVALUATIONS")
    print("="*70)
    print(f"\nüì• Extraction des reviews de {len(selected_users):,} utilisateurs...")
    print("‚è±Ô∏è  Dur√©e estim√©e: ~12 minutes")

    sampled_reviews_active = []

    # Close the stream when done, bound the wait on a stalled server and fail
    # fast on an HTTP error instead of iterating over the error page.
    with requests.get(REVIEWS_URL, stream=True, timeout=(10, 300)) as response:
        response.raise_for_status()
        # Review text is kept, so decode as UTF-8 rather than trusting the
        # encoding requests guesses from the headers.
        response.encoding = "utf-8"

        for line in tqdm(response.iter_lines(decode_unicode=True), desc="Extraction"):
            if not line:
                continue
            try:
                review = json.loads(line)
                if review['user_id'] in selected_users:
                    sampled_reviews_active.append(review)
            except (json.JSONDecodeError, KeyError):
                # Malformed line or record without a user_id: skip it.
                continue

    if not sampled_reviews_active:
        # Previously this surfaced as an opaque KeyError on 'timestamp'.
        raise RuntimeError("No review matched the selected users; nothing to sample")

    df_active = pd.DataFrame(sampled_reviews_active)

    # Convert timestamps (milliseconds -> seconds)
    print("\nüßπ Conversion des timestamps...")
    # Decide on the unit from the median rather than the first row, so a
    # single corrupt leading value cannot flip the decision.
    if df_active['timestamp'].median() > 1e12:
        print("   ‚ÑπÔ∏è  Conversion: millisecondes ‚Üí secondes")
        df_active['timestamp'] = df_active['timestamp'] / 1000

    # Drop implausible timestamps: keep 2000-01-01 .. 2030-01-01 (UTC epoch seconds)
    min_valid_ts = 946684800
    max_valid_ts = 1893456000
    n_before = len(df_active)
    mask = (df_active['timestamp'] >= min_valid_ts) & (df_active['timestamp'] <= max_valid_ts)
    df_active = df_active[mask].copy()
    n_after = len(df_active)

    if n_before != n_after:
        print(f"‚ö†Ô∏è  {n_before - n_after:,} timestamps invalides supprim√©s ({((n_before-n_after)/n_before*100):.2f}%)")

    # Derive the calendar year (UTC) of each review
    df_active['year'] = pd.to_datetime(df_active['timestamp'], unit='s').dt.year

    # Checkpoint the sample
    df_active.to_pickle(ACTIVE_SAMPLE_FILE)
    print(f"\nüíæ √âchantillon sauvegard√©: {ACTIVE_SAMPLE_FILE}")

# Summary statistics
print(f"\n‚úÖ √âCHANTILLON 'UTILISATEURS ACTIFS':")
print(f"   ‚Ä¢ Reviews: {len(df_active):,}")
print(f"   ‚Ä¢ Utilisateurs: {df_active['user_id'].nunique():,}")
print(f"   ‚Ä¢ Livres: {df_active['parent_asin'].nunique():,}")
print(f"   ‚Ä¢ P√©riode: {df_active['year'].min()} - {df_active['year'].max()}")
print(f"   ‚Ä¢ Rating moyen: {df_active['rating'].mean():.2f}")

print("\n‚úÖ BLOC 4 TERMIN√â")

In [None]:
"""
BLOC 5 : √âchantillonnage temporel (2020-2023)
‚è±Ô∏è Dur√©e estim√©e: ~3 minutes
‚ö†Ô∏è  OPTIONNEL - Peut √™tre skipp√© si vous voulez seulement la strat√©gie 1
"""

TEMPORAL_SAMPLE_FILE = f"{CHECKPOINT_DIR}/df_temporal.pkl"

# This pass is slow and optional: set to False to skip it.
RUN_TEMPORAL = True

if not RUN_TEMPORAL:
    print("\n‚è≠Ô∏è  BLOC 5 SKIPP√â (√©chantillonnage temporel d√©sactiv√©)")
    df_temporal = pd.DataFrame()  # empty frame marks "no temporal sample"

elif os.path.exists(TEMPORAL_SAMPLE_FILE):
    print("\n‚ôªÔ∏è  √âchantillon temporel d√©j√† cr√©√© - Skip")
    df_temporal = pd.read_pickle(TEMPORAL_SAMPLE_FILE)
    print(f"‚úÖ {len(df_temporal):,} reviews charg√©es depuis {TEMPORAL_SAMPLE_FILE}")

else:
    print("\n" + "="*70)
    print("PHASE 5: √âCHANTILLONNAGE TEMPOREL")
    print("="*70)
    print(f"\nüéØ Objectif: Reviews de {START_YEAR} √† {END_YEAR}")

    # Window bounds in epoch milliseconds, [start, end). They are anchored to
    # UTC because 'year' is derived in UTC below; a naive datetime would use
    # the kernel's local timezone and shift the window by the UTC offset.
    start_ts_ms = pd.Timestamp(year=START_YEAR, month=1, day=1, tz="UTC").timestamp() * 1000
    end_ts_ms = pd.Timestamp(year=END_YEAR + 1, month=1, day=1, tz="UTC").timestamp() * 1000

    print(f"\nüì• Filtrage temporel en cours...")

    sampled_reviews_temporal = []

    # Close the stream when done, bound the wait on a stalled server and fail
    # fast on an HTTP error instead of iterating over the error page.
    with requests.get(REVIEWS_URL, stream=True, timeout=(10, 300)) as response:
        response.raise_for_status()
        # Review text is kept, so decode as UTF-8 rather than trusting the
        # encoding requests guesses from the headers.
        response.encoding = "utf-8"

        for line in tqdm(response.iter_lines(decode_unicode=True), desc="Filtrage"):
            if not line:
                continue
            try:
                review = json.loads(line)
                timestamp = review.get('timestamp', 0)
                if start_ts_ms <= timestamp < end_ts_ms:
                    sampled_reviews_temporal.append(review)
            except (json.JSONDecodeError, KeyError, TypeError, AttributeError):
                # Malformed line, non-object record, or a timestamp that is
                # not a number (e.g. null): skip the record.
                continue

    df_temporal = pd.DataFrame(sampled_reviews_temporal)

    if len(df_temporal) > 0:
        # Milliseconds -> seconds, then calendar year (UTC)
        df_temporal['timestamp'] = df_temporal['timestamp'] / 1000
        df_temporal['year'] = pd.to_datetime(df_temporal['timestamp'], unit='s').dt.year

        # Checkpoint the sample
        df_temporal.to_pickle(TEMPORAL_SAMPLE_FILE)
        print(f"\nüíæ √âchantillon temporel sauvegard√©: {TEMPORAL_SAMPLE_FILE}")
    else:
        print("\n‚ö†Ô∏è  Aucune donn√©e trouv√©e pour cette p√©riode")

# Summary statistics
if len(df_temporal) > 0:
    print(f"\n‚úÖ √âCHANTILLON TEMPOREL:")
    print(f"   ‚Ä¢ Reviews: {len(df_temporal):,}")
    print(f"   ‚Ä¢ Utilisateurs: {df_temporal['user_id'].nunique():,}")
    print(f"   ‚Ä¢ Livres: {df_temporal['parent_asin'].nunique():,}")
    print(f"   ‚Ä¢ P√©riode: {df_temporal['year'].min()} - {df_temporal['year'].max()}")
else:
    print("\n‚ö†Ô∏è  Pas d'√©chantillon temporel disponible")

print("\n‚úÖ BLOC 5 TERMIN√â")

In [None]:
"""
BLOC 6 : Comparaison des strat√©gies et recommandation
"""

def _sample_metrics(df):
    """Return the six formatted comparison values for one sample.

    The order matches the 'M√©trique' column: review count, user count, book
    count, mean reviews per user, mean reviews per book, sparsity in percent.
    A sample with no rows, no users or no books yields 'N/A' for every metric
    instead of raising ZeroDivisionError.
    """
    if len(df) == 0:
        return ['N/A'] * 6

    n_reviews = len(df)
    n_users = df['user_id'].nunique()
    n_items = df['parent_asin'].nunique()
    if n_users == 0 or n_items == 0:
        return ['N/A'] * 6

    return [
        f"{n_reviews:,}",
        f"{n_users:,}",
        f"{n_items:,}",
        f"{n_reviews / n_users:.2f}",
        f"{n_reviews / n_items:.2f}",
        # Share of empty cells in the user x book interaction matrix
        f"{(1 - n_reviews / (n_users * n_items)) * 100:.4f}",
    ]


print("\n" + "="*70)
print("PHASE 6: COMPARAISON ET RECOMMANDATION")
print("="*70)

# Comparison table: one column per sampling strategy
comparison_data = {
    'M√©trique': [
        'Nombre de reviews',
        'Nombre d\'utilisateurs',
        'Nombre de livres',
        'Reviews/utilisateur (moy)',
        'Reviews/livre (moy)',
        'Taux de sparsit√© (%)'
    ],
    'Utilisateurs Actifs': _sample_metrics(df_active),
    'Temporel': _sample_metrics(df_temporal),
}

comparison_df = pd.DataFrame(comparison_data)
print("\nüìä TABLEAU COMPARATIF:")
print(comparison_df.to_string(index=False))

# Rationale for each strategy
print("\nüìù JUSTIFICATION DES STRAT√âGIES:")
print("\n1Ô∏è‚É£ STRAT√âGIE UTILISATEURS ACTIFS:")
print("   ‚úÖ Avantages:")
print("      ‚Ä¢ Garantit des utilisateurs avec profils riches")
print("      ‚Ä¢ R√©duit la sparsit√© de la matrice")
print("      ‚Ä¢ Facilite le calcul de similarit√©s")
print("      ‚Ä¢ Volum√©trie contr√¥l√©e et pr√©visible")
print("   ‚ö†Ô∏è  Inconv√©nients:")
print("      ‚Ä¢ Biais vers les 'power users'")
print("      ‚Ä¢ Moins repr√©sentatif de l'utilisateur moyen")

print("\n2Ô∏è‚É£ STRAT√âGIE TEMPORELLE:")
print("   ‚úÖ Avantages:")
print("      ‚Ä¢ Donn√©es r√©centes et pertinentes")
print("      ‚Ä¢ Moins de biais de s√©lection")
print("      ‚Ä¢ Refl√®te les tendances actuelles")
print("   ‚ö†Ô∏è  Inconv√©nients:")
print("      ‚Ä¢ Volum√©trie variable selon la p√©riode")
print("      ‚Ä¢ Beaucoup d'utilisateurs occasionnels")

# Recommendation: prefer the active-users sample when its size is within the
# target range, otherwise the temporal one, otherwise fall back to active users.
active_in_range = TARGET_REVIEWS_MIN <= len(df_active) <= TARGET_REVIEWS_MAX
temporal_in_range = len(df_temporal) > 0 and TARGET_REVIEWS_MIN <= len(df_temporal) <= TARGET_REVIEWS_MAX

print("\nüéØ RECOMMANDATION:")
if active_in_range:
    recommended = "Utilisateurs Actifs"
    print(f"   ‚úÖ Strat√©gie recommand√©e: {recommended}")
    print(f"   ‚Ä¢ Volum√©trie dans la cible ({TARGET_REVIEWS_MIN:,} - {TARGET_REVIEWS_MAX:,})")
    final_df = df_active
    strategy_name = "active_users"
elif temporal_in_range:
    recommended = "Temporelle"
    print(f"   ‚úÖ Strat√©gie recommand√©e: {recommended}")
    final_df = df_temporal
    strategy_name = "temporal"
else:
    # Keep 'recommended' defined on every path, consistent with the fallback.
    recommended = "Utilisateurs Actifs"
    print(f"   ‚ö†Ô∏è  Aucune strat√©gie n'atteint exactement la cible")
    print(f"   ‚Ä¢ Recommandation par d√©faut: Utilisateurs Actifs")
    final_df = df_active
    strategy_name = "active_users"

print(f"\n‚úÖ Strat√©gie finale: {strategy_name.upper()}")
print("\n‚úÖ BLOC 6 TERMIN√â")

In [None]:
"""
BLOC 7 : Sauvegarde des fichiers finaux (CSV + JSON)
"""

print("\n" + "="*70)
print("PHASE 7: SAUVEGARDE DES FICHIERS FINAUX")
print("="*70)

# Write the sample CSVs
print("\nüíæ Sauvegarde des fichiers CSV...")

# Maps each CSV path written in this cell to the frame it was written from,
# so the recommended file is not serialized a second time below.
written_frames = {}

active_csv = "amazon_books_sample_active_users.csv"
df_active.to_csv(active_csv, index=False)
written_frames[active_csv] = df_active
print(f"‚úÖ amazon_books_sample_active_users.csv ({len(df_active):,} lignes)")

if len(df_temporal) > 0:
    temporal_csv = "amazon_books_sample_temporal.csv"
    df_temporal.to_csv(temporal_csv, index=False)
    written_frames[temporal_csv] = df_temporal
    print(f"‚úÖ amazon_books_sample_temporal.csv ({len(df_temporal):,} lignes)")

# Recommended file. Its name always coincides with one of the files above;
# rewrite it only if that path was not just produced from this same frame.
output_file = f"amazon_books_sample_{strategy_name}.csv"
if written_frames.get(output_file) is not final_df:
    final_df.to_csv(output_file, index=False)
print(f"‚úÖ {output_file} (recommand√©) ({len(final_df):,} lignes)")

# JSON metadata describing the run
has_temporal = len(df_temporal) > 0
metadata = {
    'date_creation': datetime.now().isoformat(),
    'strategie_recommandee': strategy_name,
    'parametres': {
        'min_reviews_per_user': MIN_REVIEWS_PER_USER,
        'target_users': TARGET_USERS,
        'target_reviews_range': [TARGET_REVIEWS_MIN, TARGET_REVIEWS_MAX],
        'temporal_range': [START_YEAR, END_YEAR]
    },
    'resultats': {
        'active_users': {
            'n_reviews': len(df_active),
            'n_users': int(df_active['user_id'].nunique()),
            'n_items': int(df_active['parent_asin'].nunique()),
            'periode': f"{df_active['year'].min()}-{df_active['year'].max()}",
            'sparsity': float((1 - len(df_active) / (df_active['user_id'].nunique() * df_active['parent_asin'].nunique())) * 100)
        },
        'temporal': {
            'n_reviews': len(df_temporal),
            'n_users': int(df_temporal['user_id'].nunique()) if has_temporal else 0,
            'n_items': int(df_temporal['parent_asin'].nunique()) if has_temporal else 0,
            'periode': f"{df_temporal['year'].min()}-{df_temporal['year'].max()}" if has_temporal else "N/A"
        }
    }
}

with open('sampling_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print("‚úÖ sampling_metadata.json")

print("\nüìÅ FICHIERS G√âN√âR√âS:")
print("   ‚Ä¢ amazon_books_sample_active_users.csv")
if len(df_temporal) > 0:
    print("   ‚Ä¢ amazon_books_sample_temporal.csv")
print(f"   ‚Ä¢ {output_file} (recommand√©)")
print("   ‚Ä¢ sampling_metadata.json")

print("\n‚úÖ BLOC 7 TERMIN√â")

In [None]:
"""
BLOC 8 : Visualisations et rapport final
"""

def _print_count_stats(counts, unit):
    """Print mean, median, min and max of a per-key review count Series."""
    print(f"   ‚Ä¢ Moyenne: {counts.mean():.2f} reviews/{unit}")
    print(f"   ‚Ä¢ M√©diane: {counts.median():.0f}")
    print(f"   ‚Ä¢ Min: {counts.min()}")
    print(f"   ‚Ä¢ Max: {counts.max()}")


print("\n" + "="*70)
print("PHASE 8: VISUALISATIONS ET RAPPORT FINAL")
print("="*70)

# One 2x2 figure; give each panel a name instead of indexing the array.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
ax_ratings, ax_users, ax_books, ax_years = axes.flat

# 1. Rating distribution
ax_ratings.hist(final_df['rating'], bins=5, edgecolor='black', alpha=0.7, color='steelblue')

# 2. Reviews per user
user_review_dist = final_df['user_id'].value_counts()
ax_users.hist(user_review_dist, bins=50, edgecolor='black', alpha=0.7, color='coral')

# 3. Reviews per book (50 most reviewed)
book_review_dist = final_df['parent_asin'].value_counts().head(50)
ax_books.bar(range(len(book_review_dist)), book_review_dist.values, alpha=0.7, color='seagreen')

# 4. Reviews per year
year_dist = final_df['year'].value_counts().sort_index()
ax_years.plot(year_dist.index, year_dist.values, marker='o', linewidth=2,
              markersize=8, color='purple')

# Axis labels, titles and grids, applied uniformly after plotting.
panel_text = (
    (ax_ratings, 'Rating', 'Fr√©quence', 'Distribution des √âvaluations', {}),
    (ax_users, 'Nombre de reviews par utilisateur', 'Fr√©quence', 'Distribution: Reviews par Utilisateur', {}),
    (ax_books, 'Livres (top 50)', 'Nombre de reviews', 'Distribution: Reviews par Livre (Top 50)', {'axis': 'y'}),
    (ax_years, 'Ann√©e', 'Nombre de reviews', 'Distribution Temporelle', {}),
)
for panel, x_text, y_text, title_text, grid_extra in panel_text:
    panel.set_xlabel(x_text, fontsize=11)
    panel.set_ylabel(y_text, fontsize=11)
    panel.set_title(title_text, fontsize=12, fontweight='bold')
    panel.grid(True, alpha=0.3, **grid_extra)
ax_years.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('sampling_analysis.png', dpi=300, bbox_inches='tight')
print("\n‚úÖ Visualisation sauvegard√©e: sampling_analysis.png")
plt.show()

# Final report
print("\n" + "="*70)
print("üìä RAPPORT FINAL")
print("="*70)

print(f"\nüéØ √âCHANTILLON FINAL ({strategy_name.upper()}):")
print(f"   ‚Ä¢ Nombre de reviews: {len(final_df):,}")
print(f"   ‚Ä¢ Utilisateurs uniques: {final_df['user_id'].nunique():,}")
print(f"   ‚Ä¢ Livres uniques: {final_df['parent_asin'].nunique():,}")
print(f"   ‚Ä¢ Taux de sparsit√©: {(1 - len(final_df) / (final_df['user_id'].nunique() * final_df['parent_asin'].nunique())) * 100:.4f}%")
print(f"   ‚Ä¢ Rating moyen: {final_df['rating'].mean():.2f}")
print(f"   ‚Ä¢ √âcart-type rating: {final_df['rating'].std():.2f}")
print(f"   ‚Ä¢ P√©riode: {final_df['year'].min()} - {final_df['year'].max()}")

print("\nüìà M√âTRIQUES PAR UTILISATEUR:")
reviews_per_user = final_df['user_id'].value_counts()
_print_count_stats(reviews_per_user, "utilisateur")

print("\nüìö M√âTRIQUES PAR LIVRE:")
reviews_per_book = final_df['parent_asin'].value_counts()
_print_count_stats(reviews_per_book, "livre")

# Count and share of each distinct rating value, in ascending order
print("\n‚≠ê DISTRIBUTION DES RATINGS:")
for star in sorted(final_df['rating'].unique()):
    n_with_star = (final_df['rating'] == star).sum()
    share = n_with_star / len(final_df) * 100
    print(f"   ‚Ä¢ {star:.0f} √©toiles: {n_with_star:,} ({share:.1f}%)")

print("\n" + "="*70)
print("‚úÖ √âCHANTILLONNAGE STRAT√âGIQUE TERMIN√â AVEC SUCC√àS!")
print("="*70)

print("\nüì¶ FICHIERS FINAUX G√âN√âR√âS:")
print("   1. amazon_books_sample_active_users.csv")
if len(df_temporal) > 0:
    print("   2. amazon_books_sample_temporal.csv")
print(f"   3. {output_file} (recommand√©)")
print("   4. sampling_metadata.json")
print("   5. sampling_analysis.png")
print(f"\nüìÅ Checkpoints interm√©diaires: {CHECKPOINT_DIR}/")

print("\n‚úÖ BLOC 8 TERMIN√â - FIN DU SCRIPT")