In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Configurer explicitement les chemins afin d'éviter les mauvaises surprises liées à l'environnement lors des réexécutions du notebook.
DATA_DIRECTORY: Path = Path("..") / "data"
RAW_DATASET_PATH: Path = DATA_DIRECTORY / "dataset.csv"
CLEANED_DATASET_PATH: Path = DATA_DIRECTORY / "dataset_cleaned.csv"
PLOTS_DISTRIBUTION_DIR: Path = Path("..") / "plots" / "distribution"

# Exploration univariée – Jeu de données Weigh Lifestyle
### Objectifs
- Quantifier la distribution des variables comportementales, physiologiques et nutritionnelles.
- Détecter les valeurs improbables avant la modélisation multivariée.
- Produire des artefacts réutilisables : jeu de données nettoyé et graphiques de distribution.

### Ressources de données
- Jeu de données brut : `../data/dataset.csv`
- Export nettoyé : `../data/dataset_cleaned.csv`
- Graphiques générés : `../plots/distribution/`



## Plan d'analyse
### Étape 1 – Charger les observations brutes
- Lire le CSV dans un DataFrame et exposer la structure pour les étapes suivantes.

### Étape 2 – Éliminer les fuites et les variables peu informatives
- Supprimer les colonnes qui divulguent la cible ou ajoutent du bruit redondant.

### Étape 3 – Harmoniser les échelles des variables
- Arrondir les métriques discrètes et limiter les mesures continues à deux décimales.
- Enregistrer le jeu de données nettoyé pour le réutiliser dans les autres notebooks.

### Étape 4 – Inspecter les distributions
- Générer des histogrammes pour chaque variable numérique informative.
- Sauvegarder les graphiques dans le dossier de distribution.

### Étape 5 – Mettre en évidence les valeurs aberrantes extrêmes
- Appliquer une règle IQR extrême pour signaler les valeurs à examiner manuellement.



In [2]:
def load_dataset(path: Path = RAW_DATASET_PATH) -> pd.DataFrame:
    """Load dataset from CSV file."""
    return pd.read_csv(path)

df = load_dataset()

In [3]:
def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Remove columns that would leak target information or add noise."""
    # Empêcher les variables morphologiques de divulguer directement la taille corporelle.
    # Retirer les proxys de résultats d'entraînement pour concentrer la modélisation sur les signaux comportementaux.
    cols_to_drop = [
        "expected_burn",
        "meal_name",  # Single-category column provides no discriminative power.
        "meal_type",  # Encodes meal ordering rather than nutritional detail.
        "Name of Exercise",  # Verbose label with minimal predictive value.
        "Sets",  # Workout log metadata noisy for univariate analysis.
        "Reps",  # Workout log metadata noisy for univariate analysis.
        "Benefit",  # Subjective textual scale unsuitable for numeric distribution analysis.
        "Target Muscle Group",  # High-cardinality categorical data without aggregation value here.
        "Equipment Needed",  # Captures equipment catalog rather than participant behaviour.
        "Difficulty Level",  # Instructor-provided rating not comparable across entries.
        "Body Part",  # Redundant with target muscle field in this context.
        "Type of Muscle",  # Biological category overlapping with target muscle group.
        "Workout",  # Broad workout taxonomy already encoded elsewhere.
        "pct_carbs",  # Redundant with carbohydrate gram feature.
        "cal_from_macros",  # Decision made based on the correlation matrix.
        "pct_HRR",  # Decision made based on the correlation matrix.
        "BMI", # Too much correlated with BMI_calc
        "Burns Calories (per 30 min)_bc", # Weird values
        "Height (m)", # Too much information with weight
        "Fat_Percentage", # Too much information with weight
        "Calories", # Too much information
        "cal_balance", # Too much information
        "BMI_calc", # Too much information
        "lean_mass_kg", # Too much information
        "protein_per_kg",
        "Calories_Burned", # Give indirect information about weight
        "Burns Calories (per 30 min)_bc", # Give indirect information about weight
        "Burns_Calories_Bin", # Give indirect information about weight
        "Burns Calories (per 30 min)", # Give indirect information about weight
        "Carbs", # Too much information with weight
        "Proteins", # Too much information with weight
        "Fats", # Too much information with weight

]
    
    # Programmer de manière défensive pour éviter les erreurs si les colonnes ont déjà été supprimées
    cols_to_drop_existing = [col for col in cols_to_drop if col in df.columns]
    df_cleaned = df.drop(columns=cols_to_drop_existing)
    
    print(f"Dropped {len(cols_to_drop_existing)} columns")
    print(f"Remaining columns: {df_cleaned.shape[1]}")
    
    return df_cleaned

df_cleaned = clean_dataset(df)


Dropped 32 columns
Remaining columns: 23


In [4]:
def round_dataset_values(df: pd.DataFrame) -> pd.DataFrame:
    """Return a rounded copy that respects discrete and continuous measurement scales."""
    df_rounded = df.copy()

    # Utiliser la partie entière pour l'âge afin de ne pas surestimer les segments démographiques.
    if 'Age' in df_rounded.columns:
        df_rounded['Age'] = np.floor(df_rounded['Age']).astype(int)
    
    # Forcer les entiers sur les mesures discrètes relevées manuellement.
    cols_to_int = [
        'Max_BPM', 'Avg_BPM', 'Resting_BPM',
        'Workout_Frequency (days/week)', 'Experience_Level',
        'Daily meals frequency', 'Physical exercise',
        'Calories', 'sodium_mg', 'cholesterol_mg',
        'prep_time_min', 'cook_time_min'
    ]
    for col in cols_to_int:
        if col in df_rounded.columns:
            df_rounded[col] = df_rounded[col].round(0).astype(int)
    
    # Limiter les mesures continues à deux décimales pour réduire le bruit tout en restant lisible.
    cols_to_2dec = [
        'Session_Duration (hours)', 'Water_Intake (liters)',
        'sugar_g', 'serving_size_g', 'rating',
        'pct_maxHR', "burns-calories-(per-30-min)-bc"
    ]
    for col in cols_to_2dec:
        if col in df_rounded.columns:
            df_rounded[col] = df_rounded[col].round(2)
    
    
    return df_rounded

df_cleaned = round_dataset_values(df_cleaned)


In [5]:
# Sauvegarder le jeu de données arrondi pour réutiliser exactement les mêmes données nettoyées.
df_cleaned.to_csv(CLEANED_DATASET_PATH, index=False)
print(f"Dataset rounded and saved to {CLEANED_DATASET_PATH}")
print(df_cleaned.head())



Dataset rounded and saved to ../data/dataset_cleaned.csv
   Age  Gender  Weight (kg)  Max_BPM  Avg_BPM  Resting_BPM  \
0   34    Male        65.27      189      158           69   
1   23  Female        56.41      179      132           73   
2   33  Female        58.98      175      124           55   
3   38  Female        93.78      191      155           50   
4   45    Male        52.42      194      153           71   

   Session_Duration (hours) Workout_Type  Water_Intake (liters)  \
0                      1.00     Strength                   1.50   
1                      1.37         HIIT                   1.90   
2                      0.91       Cardio                   1.88   
3                      1.10         HIIT                   2.50   
4                      1.08     Strength                   2.91   

   Workout_Frequency (days/week)  ...   diet_type  sugar_g  sodium_mg  \
0                              4  ...       Vegan    31.77       1730   
1                    

In [6]:
def _sanitize_filename(column_name: str) -> str:
    """Convert column name to safe filename by removing problematic characters."""
    return column_name.replace('/', '_').replace(' ', '_').replace('(', '').replace(')', '')


def _get_columns_with_variance(df: pd.DataFrame) -> list[str]:
    """Return list of numerical columns with variance > 0."""
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Les colonnes constantes (variance = 0) n'apportent aucune information de distribution.
    return [col for col in numerical_cols if df[col].var() > 0]


def plot_numerical_distributions(
    df: pd.DataFrame, output_dir: Path = PLOTS_DISTRIBUTION_DIR
) -> tuple[list[Path], dict[str, str]]:
    """Plot histograms for numerical columns with variance and record any failures.
    
    Avoids constant columns to keep the visual portfolio meaningful.
    Returns both the saved figure paths and any plotting errors for transparency.
    """
    import matplotlib
    matplotlib.use('Agg')  # Non-interactive backend to avoid Jupyter conflicts
    from matplotlib import pyplot as plt_local
    
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    cols_to_plot = _get_columns_with_variance(df)

    saved_paths: list[Path] = []
    errors: dict[str, str] = {}

    for col in cols_to_plot:
        fig = None
        try:
            # Convertir rapidement en tableau NumPy pour éviter les problèmes d'histogramme entre pandas et matplotlib.
            data = np.array(df[col].dropna().values, dtype=float)
            if len(data) == 0:
                continue

            fig, ax = plt_local.subplots(figsize=(10, 6))
            ax.hist(data, bins=30, edgecolor='black', alpha=0.7)
            ax.set_title(f'Distribution of {col}', fontsize=14, fontweight='bold')
            ax.set_xlabel(col, fontsize=12)
            ax.set_ylabel('Frequency', fontsize=12)
            ax.grid(axis='y', alpha=0.3)

            filepath = output_path / f'{_sanitize_filename(col)}_distribution.png'
            fig.savefig(filepath, dpi=150, bbox_inches='tight')
            saved_paths.append(filepath)
        except Exception as exc:
            errors[col] = str(exc)
        finally:
            if fig is not None:
                plt_local.close(fig)

    return saved_paths, errors


# Remonter l'état du traçage à l'extérieur pour garder la fonction sans effets de bord.
saved_plots, plot_errors = plot_numerical_distributions(df_cleaned)
print(f"Saved {len(saved_plots)} distribution plots to {PLOTS_DISTRIBUTION_DIR}")
if plot_errors:
    print("Plotting issues detected:")
    for column, message in plot_errors.items():
        print(f"  {column}: {message}")


Saved 19 distribution plots to ../plots/distribution


In [7]:
def detect_outliers_iqr(df: pd.DataFrame) -> dict[str, dict[str, float]]:
    """Detect extreme outliers using IQR method for all numerical columns.

    Returns a report only for columns containing outliers.
    Extreme outliers are defined as values < Q1 - 3.0*IQR or > Q3 + 3.0*IQR.
    Using factor 3.0 instead of 1.5 to detect only truly extreme values.

    Args:
        df: Input DataFrame with numerical columns

    Returns:
        Dictionary with column names as keys and outlier statistics as values.
        Statistics include: count, percentage, min_outlier, max_outlier,
        lower_bound, upper_bound.
    """
    outlier_report = {}
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    for col in numerical_cols:
        # Ignorer les colonnes sans variance car elles ne peuvent pas contenir de valeurs aberrantes.
        if df[col].var() == 0:
            continue

        data = df[col].dropna()

        # Calculer les bornes de l'IQR avec un facteur 3.0 pour ne cibler que les valeurs extrêmes.
        Q1 = data.quantile(0.25)
        Q3 = data.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 3.0 * IQR
        upper_bound = Q3 + 3.0 * IQR

        # Identifier les valeurs aberrantes
        outliers = data[(data < lower_bound) | (data > upper_bound)]

        # Ne produire un rapport que lorsqu'au moins une valeur aberrante est détectée.
        if len(outliers) > 0:
            outlier_report[col] = {
                'count': len(outliers),
                'percentage': round(len(outliers) / len(data) * 100, 2),
                'min_outlier': float(outliers.min()),
                'max_outlier': float(outliers.max()),
                'lower_bound': round(lower_bound, 2),
                'upper_bound': round(upper_bound, 2)
            }

    return outlier_report


# Détecter et signaler les valeurs aberrantes
outlier_report = detect_outliers_iqr(df_cleaned)

if outlier_report:
    print(f"Found outliers in {len(outlier_report)} column(s):\n")
    sorted_outliers = sorted(
        outlier_report.items(),
        key=lambda item: item[1]['count'],
        reverse=True
    )
    max_rows = 15
    for col, stats in sorted_outliers[:max_rows]:
        print(f"{col}:")
        print(f"  Count: {stats['count']} ({stats['percentage']}%)")
        print(f"  Range: [{stats['min_outlier']}, {stats['max_outlier']}]")
        print(f"  Valid bounds: [{stats['lower_bound']}, {stats['upper_bound']}]")
        print()
    if len(sorted_outliers) > max_rows:
        print(f"... {len(sorted_outliers) - max_rows} colonnes supplémentaires non affichées")
else:
    print("No outliers detected in any column.")


Found outliers in 1 column(s):

Physical exercise:
  Count: 4695 (23.47%)
  Range: [1.0, 4.0]
  Valid bounds: [0.0, 0.0]



Il est possible de regrouper le nombre d'exercices physiques en 0 et > 0 afin de réduire le déséquilibre entre l'absence d'activité et les autres modalités.