#### Introduction au data processing

##### Créons un jeu de données simplifié :


In [68]:
import pandas as pd
import numpy as np
np.random.seed(42)
# Données simplifiées
n_samples = 500
prix = np.random.normal(20000, 7000, n_samples)
kms = np.random.normal(90000, 30000, n_samples)
annee = np.random.randint(2012, 2023, n_samples)
marques = ['Toyota', 'Renault', 'BMW', 'Audi']
# Création du dataset
data = pd.DataFrame({
 'prix': prix,
 'kilometrage': kms,
 'annee': annee,
 'marque': np.random.choice(marques, n_samples)
})
# Ajout de valeurs manquantes et aberrantes
data.loc[np.random.choice(n_samples, 20), 'kilometrage'] =np.nan
data.loc[np.random.choice(n_samples, 10), 'prix'] = data['prix'] * 4
print(data.head())

           prix    kilometrage  annee   marque
0  23476.999071  117785.326426   2018      BMW
1  19032.149892  147282.499214   2012      BMW
2  24533.819767   48042.972785   2019     Audi
3  30661.208995  106889.077101   2022  Renault
4  18360.926377   70480.722926   2014      BMW


#### 1. Affichez les statistiques descriptives des variables.


In [81]:
print(data.describe())

               prix    kilometrage        annee  marque_Audi  marque_BMW  \
count    489.000000     489.000000   489.000000   489.000000  489.000000   
mean   20100.302583   90553.145244  2016.920245     0.255624    0.229039   
std     6636.260374   28618.493386     3.082170     0.436658    0.420645   
min     1661.784271    9093.400712  2012.000000     0.000000    0.000000   
25%    15149.632833   72790.139794  2014.000000     0.000000    0.000000   
50%    20171.571220   90708.759718  2017.000000     0.000000    0.000000   
75%    24533.819767  108654.298867  2020.000000     1.000000    0.000000   
max    39041.184166  168971.461945  2022.000000     1.000000    1.000000   

       marque_Renault  marque_Toyota  
count      489.000000     489.000000  
mean         0.259714       0.255624  
std          0.438927       0.436658  
min          0.000000       0.000000  
25%          0.000000       0.000000  
50%          0.000000       0.000000  
75%          1.000000       1.000000  
max

#### 2. Identifiez les colonnes avec des valeurs manquantes.

In [70]:
print(data.isna())

      prix  kilometrage  annee  marque
0    False        False  False   False
1    False        False  False   False
2    False        False  False   False
3    False        False  False   False
4    False        False  False   False
..     ...          ...    ...     ...
495  False        False  False   False
496  False        False  False   False
497  False        False  False   False
498  False        False  False   False
499  False        False  False   False

[500 rows x 4 columns]


In [71]:
print(data.isna().sum())

prix            0
kilometrage    18
annee           0
marque          0
dtype: int64


#### 1. Remplacez les valeurs manquantes de la colonne kilometrage par la moyenne


In [72]:
data_filled = data["kilometrage"].fillna(data["kilometrage"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_filled = data["kilometrage"].fillna(data["kilometrage"].mean(), inplace=True)


In [73]:
print(data.isna().sum())

prix           0
kilometrage    0
annee          0
marque         0
dtype: int64


#### 2. Imputez les valeurs manquantes en utilisant la médiane.


In [74]:
from sklearn.impute import SimpleImputer

# imputer = SimpleImputer(strategy= "median")
# data['kilometrage'] = imputer.fit(data[['kilometrage']])

#### 3. Gestion des valeurs aberrantes
#### 1. Détectez les valeurs aberrantes dans la colonne prix avec la méthode IQR.


In [79]:
q1 = data["prix"].quantile(0.25)
q3 = data["prix"].quantile(0.75)
IQR = q3 - q1

lower_bound = q1 - 1.5 * IQR
upper_bound = q3 + 1.5 * IQR

outliers = data[(data["prix"] < lower_bound) | (data["prix"] > upper_bound)] 

print(f"Borne inférieure : {lower_bound}, Borne supérieure : {upper_bound}")
print("---------------------------------------------------------------")

print(f"Les valeurs abérantes sont : {outliers}")
print("---------------------------------------------------------------")
print("Nombres de valeurs abérantes : ", outliers.shape[0])

Borne inférieure : 1073.3524328812764, Borne supérieure : 38610.10016699899
---------------------------------------------------------------
Les valeurs abérantes sont :              prix   kilometrage   annee   marque  marque_Audi  marque_BMW  \
179  39041.184166  128146.65285  2020.0  Renault          0.0         0.0   

     marque_Renault  marque_Toyota  
179             1.0            0.0  
---------------------------------------------------------------
Nombres de valeurs abérantes :  1


##### 2. Supprimez les valeurs aberrantes identifiées.


In [76]:
data = data[~data.index.isin(outliers.index)]
print("Nombres de valeurs sans abérantes : ", data.shape[0])


Nombres de valeurs sans abérantes :  489


#### 4. Encodage des variables catégorielles
##### 1. Encodez la colonne marque avec un encodage one-hot

In [77]:
from sklearn.preprocessing import OneHotEncoder
# Code à compléter
encoder = OneHotEncoder()
encoded_marque = encoder.fit_transform(data[['marque']]).toarray()
data_encoded = pd.DataFrame(encoded_marque, columns=encoder.get_feature_names_out(['marque']))
data = pd.concat([data, data_encoded], axis=1)


#### 1. Créez un pipeline pour appliquer toutes les étapes ci-dessus.


In [78]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_features = ["prix", "kilometrage"]
numeric_transformer = Pipeline(steps=[
 ('imputer', SimpleImputer(strategy='median')),
 ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer( 
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
data_transformed = pipeline.fit_transform(data)

print(data_transformed)


[[ 5.14803445e-01  9.63065909e-01]
 [-1.63162925e-01  2.00636439e+00]
 [ 6.75998765e-01 -1.50368208e+00]
 [ 1.61060052e+00  5.77671673e-01]
 [-2.65543669e-01 -7.10071423e-01]
 [-2.65526140e-01 -5.36566085e-01]
 [ 1.67058705e+00 -6.48264765e-01]
 [ 8.03851750e-01 -9.36451614e-01]
 [-5.16795866e-01  3.17999587e-02]
 [ 5.63753093e-01 -9.01392732e-01]
 [-5.10329136e-01  2.67291668e-01]
 [-5.12797723e-01 -7.29923393e-02]
 [ 2.42804911e-01 -2.73229283e-01]
 [-2.05834709e+00 -9.82686078e-01]
 [-1.85723258e+00 -6.31687893e-01]
 [-6.15892441e-01  7.81847412e-01]
 [-1.09693774e+00  5.11829286e-01]
 [ 3.19983657e-01 -1.05695297e+00]
 [-9.85035283e-01  8.57143149e-02]
 [-1.52345454e+00  5.38290885e-03]
 [ 1.54933474e+00 -1.79106342e+00]
 [-2.56599469e-01  5.56864845e-01]
 [ 5.65616929e-02 -7.22784463e-01]
 [-1.53674151e+00  5.38290885e-03]
 [-5.96775485e-01 -8.29566992e-01]
 [ 1.02893869e-01 -1.93481560e+00]
 [-1.24445377e+00 -1.74664346e+00]
 [ 3.85594542e-01  3.13366036e-02]
 [-6.56839968e-01  2