# Nettoyage avancé avec Pandas

## Nettoyage des données météo

### Chargement du fichier

In [48]:
import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')


# Display every column and do not cap the printed line width.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Paths (relative), used for reading the raw data and writing results.
DATA_DIR = "../data_ecf"
OUTPUT_DIR = "../output"

# Load the raw weather export; column dtypes are whatever pandas infers.
df_meteo = pd.read_csv(f"{DATA_DIR}/meteo_raw.csv")

print(f"Shape: {df_meteo.shape}")
print(f"\nColonnes: {df_meteo.columns.tolist()}")
df_meteo.info()
# The preview must be the cell's last expression: a bare `head()` in the
# middle of a cell is evaluated and discarded, so it was never displayed.
df_meteo.head(10)

Shape: (252612, 7)

Colonnes: ['commune', 'timestamp', 'temperature_c', 'humidite_pct', 'rayonnement_solaire_wm2', 'vitesse_vent_kmh', 'precipitation_mm']
<class 'pandas.DataFrame'>
RangeIndex: 252612 entries, 0 to 252611
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   commune                  252612 non-null  str    
 1   timestamp                252612 non-null  str    
 2   temperature_c            251383 non-null  str    
 3   humidite_pct             252612 non-null  float64
 4   rayonnement_solaire_wm2  252612 non-null  float64
 5   vitesse_vent_kmh         252612 non-null  float64
 6   precipitation_mm         252612 non-null  float64
dtypes: float64(4), str(3)
memory usage: 20.5 MB


### Standardiser les formats de dates

In [49]:
def parse_timestamp(ts):
    """Parse a timestamp written in one of several known formats.

    Args:
        ts: Raw timestamp value, converted with ``str()`` before parsing.
            Missing values (as detected by ``pd.isna``) are accepted.

    Returns:
        A ``datetime.datetime`` when ``ts`` matches one of the supported
        formats, otherwise ``pd.NaT``.
    """
    if pd.isna(ts):
        return pd.NaT

    # Surrounding whitespace (e.g. padded CSV fields) makes every strptime
    # attempt fail, which would silently turn a valid timestamp into NaT.
    text = str(ts).strip()

    # Tried in order; the first format that matches wins.
    formats = (
        "%Y-%m-%d %H:%M:%S",
        "%d/%m/%Y %H:%M",
        "%m/%d/%Y %H:%M:%S",
        "%Y-%m-%dT%H:%M:%S",
    )

    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            # Not this format; try the next one.
            continue

    # No known format matched.
    return pd.NaT

# Continue on a copy of the frame before overwriting its timestamp column.
df_meteo = df_meteo.copy()

print("[1/5] Parsing des timestamps...")
parsed_ts = df_meteo['timestamp'].apply(parse_timestamp)
df_meteo['timestamp'] = parsed_ts

# Values that matched none of the known formats come back as NaT.
invalid_ts = parsed_ts.isna().sum()
print(f"  Timestamps invalides: {invalid_ts}")

# Rows without a usable timestamp are removed.
df_meteo = df_meteo.dropna(subset=['timestamp'])
df_meteo.info()

[1/5] Parsing des timestamps...
  Timestamps invalides: 0
<class 'pandas.DataFrame'>
RangeIndex: 252612 entries, 0 to 252611
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   commune                  252612 non-null  str           
 1   timestamp                252612 non-null  datetime64[us]
 2   temperature_c            251383 non-null  str           
 3   humidite_pct             252612 non-null  float64       
 4   rayonnement_solaire_wm2  252612 non-null  float64       
 5   vitesse_vent_kmh         252612 non-null  float64       
 6   precipitation_mm         252612 non-null  float64       
dtypes: datetime64[us](1), float64(4), str(2)
memory usage: 16.2 MB


### Convertir les colonnes numériques en gérant les erreurs

In [50]:
print("Valeurs uniques non numeriques dans temperature_c:")
# Text view of the column, reused for both the diagnostic and the conversion.
temp_as_text = df_meteo['temperature_c'].astype(str)
# Optional minus sign, digits, then an optional '.' or ',' decimal part.
looks_numeric = temp_as_text.str.match(r'^-?[0-9]+[.,]?[0-9]*$', na=False)
temp_non_numeric = df_meteo.loc[~looks_numeric, 'temperature_c'].unique()
print(temp_non_numeric)


# Normalise decimal commas to dots, then convert; anything that still cannot
# be parsed becomes NaN instead of raising.
temp_with_dots = temp_as_text.str.replace(',', '.')
df_meteo['temperature_c'] = pd.to_numeric(temp_with_dots, errors='coerce')
df_meteo.info()
print("  Conversions effectuees.")

Valeurs uniques non numeriques dans temperature_c:
<ArrowStringArray>
[nan]
Length: 1, dtype: str
<class 'pandas.DataFrame'>
RangeIndex: 252612 entries, 0 to 252611
Data columns (total 7 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   commune                  252612 non-null  str           
 1   timestamp                252612 non-null  datetime64[us]
 2   temperature_c            251383 non-null  float64       
 3   humidite_pct             252612 non-null  float64       
 4   rayonnement_solaire_wm2  252612 non-null  float64       
 5   vitesse_vent_kmh         252612 non-null  float64       
 6   precipitation_mm         252612 non-null  float64       
dtypes: datetime64[us](1), float64(5), str(1)
memory usage: 15.2 MB
  Conversions effectuees.


### Corriger les valeurs aberrantes
#### Températures hors [-40, 50] -> NaN

In [51]:
# Flag readings outside the [-40, 50] range once, then reuse the mask for
# both the count and the replacement.
too_cold = df_meteo['temperature_c'] < -40
too_hot = df_meteo['temperature_c'] > 50
temp_out_of_range = too_cold | too_hot

temp_outliers = temp_out_of_range.sum()
df_meteo.loc[temp_out_of_range, 'temperature_c'] = np.nan
print(f"  Temperatures aberrantes -> NaN: {temp_outliers}")


  Temperatures aberrantes -> NaN: 1977


#### Humidité hors [0, 100] -> clipping

In [52]:
humidity = df_meteo['humidite_pct']

# Count the values outside [0, 100] before they are clipped.
humidity_outliers = (humidity.lt(0) | humidity.gt(100)).sum()
print(f"  Humidite clippee [0, 100]: {humidity_outliers}")

# Out-of-range values are pulled back to the nearest bound.
df_meteo['humidite_pct'] = humidity.clip(lower=0, upper=100)


  Humidite clippee [0, 100]: 1801


#### Rayonnement solaire négatif -> 0

In [53]:
solar = df_meteo['rayonnement_solaire_wm2']

# Count the negative readings, then floor the column at zero.
ray_neg = solar.lt(0).sum()
df_meteo['rayonnement_solaire_wm2'] = solar.clip(lower=0)

print(f"Rayonnement solaire - Valeurs négatives ramenées à 0 : {ray_neg}")

Rayonnement solaire - Valeurs négatives ramenées à 0 : 1275


### Traiter les valeurs manquantes

#### Interpolation linéaire pour température et humidité

In [54]:
# Interpolating in raw file order blends readings from unrelated communes and
# moments, because neighbouring rows are not consecutive observations of the
# same station. Gaps are therefore filled inside each commune's own
# time-ordered series.
chronological = df_meteo.sort_values(['commune', 'timestamp'])

for col in ['temperature_c', 'humidite_pct']:
    before_na = df_meteo[col].isna().sum()

    # limit_direction='both' also fills gaps at the start/end of a commune's
    # series using the nearest valid value.
    filled = chronological.groupby('commune')[col].transform(
        lambda s: s.interpolate(method='linear', limit_direction='both')
    )
    # Assignment aligns on the index, so df_meteo keeps its current row order.
    df_meteo[col] = filled

    after_na = df_meteo[col].isna().sum()
    print(f"  {col}: {before_na} -> {after_na} NaN (interpolés: {before_na - after_na})")

  temperature_c: 3206 -> 0 NaN (interpolés: 3206)
  humidite_pct: 0 -> 0 NaN (interpolés: 0)


#### Forward fill pour précipitations

In [65]:
# Quick inspection of the column before filling.
print(df_meteo['precipitation_mm'].dtype)

print(df_meteo['precipitation_mm'].value_counts(dropna=False).head(20))

# Treat empty strings as missing as well as real NaN.
before_na = (df_meteo['precipitation_mm'].isna() | (df_meteo['precipitation_mm'] == '')).sum()
df_meteo['precipitation_mm'] = df_meteo['precipitation_mm'].replace('', np.nan)

# A forward fill only means "carry the previous observation" when each
# commune's rows are in chronological order, so sort before filling. The
# result is assigned back by index, leaving df_meteo's row order untouched.
precip_sorted = df_meteo.sort_values(['commune', 'timestamp'])
df_meteo['precipitation_mm'] = precip_sorted.groupby('commune')['precipitation_mm'].transform(
    # bfill covers gaps at the very start of a commune's series.
    lambda x: x.ffill().bfill()
)
after_na = df_meteo['precipitation_mm'].isna().sum()
print(f"  precipitation_mm: {before_na} -> {after_na} NaN (forward filled)")

float64
precipitation_mm
0.0     189528
7.1        485
11.0       471
13.6       465
8.5        465
5.3        462
8.1        455
0.8        455
8.0        453
5.4        452
9.6        451
5.0        449
9.8        449
11.3       449
13.8       448
4.4        447
11.8       445
9.5        444
12.1       444
14.2       443
Name: count, dtype: int64
  precipitation_mm: 0 -> 0 NaN (forward filled)


### Ajouter des colonnes temporelles (jour, mois, saison, jour de semaine)

In [67]:
# Season label for each month number, built from a season -> months table.
season_by_month = {
    month_number: label
    for label, month_numbers in (
        ('Hiver', (12, 1, 2)),
        ('Printemps', (3, 4, 5)),
        ('Ete', (6, 7, 8)),
        ('Automne', (9, 10, 11)),
    )
    for month_number in month_numbers
}

# Derive the calendar components from the parsed timestamp column.
stamp = df_meteo['timestamp'].dt
df_meteo['date'] = stamp.date
df_meteo['hour'] = stamp.hour
df_meteo['day_of_week'] = stamp.dayofweek
df_meteo['month'] = stamp.month
df_meteo['season'] = df_meteo['month'].map(season_by_month)

print("  Colonnes ajoutees: date, hour, day_of_week, month, season")
df_meteo.head(10)

  Colonnes ajoutees: date, hour, day_of_week, month, season


Unnamed: 0,commune,timestamp,temperature_c,humidite_pct,rayonnement_solaire_wm2,vitesse_vent_kmh,precipitation_mm,date,hour,day_of_week,month,season
0,Saint-Etienne,2024-09-15 15:00:00,17.1,100.0,244.9,14.3,0.0,2024-09-15,15,6,9,Automne
1,Bordeaux,2023-07-21 15:00:00,19.6,50.6,414.9,3.2,0.0,2023-07-21,15,4,7,Ete
2,Montpellier,2023-09-18 20:00:00,18.3,65.7,218.4,13.6,0.0,2023-09-18,20,0,9,Automne
3,Le Havre,2024-01-03 22:00:00,3.7,94.9,6.8,18.6,11.6,2024-01-03,22,2,1,Hiver
4,Lille,2024-10-29 20:00:00,14.0,42.9,781.8,4.0,0.0,2024-10-29,20,1,10,Automne
5,Bordeaux,2023-12-22 13:00:00,4.4,36.9,796.4,6.1,0.0,2023-12-22,13,4,12,Hiver
6,Marseille,2023-09-15 21:00:00,22.5,86.8,5.8,32.6,0.0,2023-09-15,21,4,9,Automne
7,Toulouse,2023-05-30 00:00:00,8.3,66.3,26.4,31.4,7.2,2023-05-30,0,1,5,Printemps
8,Bordeaux,2024-10-05 09:00:00,11.5,69.8,71.4,34.4,0.0,2024-10-05,9,5,10,Automne
9,Toulon,2024-09-28 21:00:00,19.2,79.0,13.1,31.8,0.0,2024-09-28,21,5,9,Automne


### Dataset nettoyé

In [69]:
# to_csv does not create missing directories, so make sure the destination
# folder exists before writing.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Write the cleaned frame without the index column.
output_path = f"{OUTPUT_DIR}/meteo_clean.csv"
df_meteo.to_csv(output_path, index=False)