# Nettoyage

In [265]:
# TODO:
# Finish cleaning the Model column
# Levy?
#  Some model names are duplicated, e.g. "x-trail x-trail"
#  Georgian-alphabet text still has to be extracted from the model names
#  Drop classes that are not sufficiently represented?
#  Check the dtype of each Series

In [266]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from cleaning_func import clean_manufacturer, clean_mileage, clean_price, clean_engine_volume, clean_doors, clean_model, clean_cylinders, clean_wheel_v2, clean_leather_interior_v2

In [267]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df = pd.read_csv('./original.csv')

In [268]:
# Remove fully duplicated rows, keeping the first occurrence of each one.
df = df.drop_duplicates(keep='first')

In [269]:
# Columns removed: ID. Still to be cleaned and added back: Levy, Model.
columns_kept = [
    'Price', 'Manufacturer', 'Prod. year', 'Category', 'Leather interior',
    'Fuel type', 'Engine volume', 'Mileage', 'Cylinders',
    'Gear box type', 'Drive wheels', 'Doors', 'Wheel',
    'Color', 'Airbags',
]
df = df[columns_kept]

In [270]:
# Run every column-level cleaning helper in sequence; each call takes the
# current frame and returns the cleaned one. The trailing "=> N rows" notes
# are the author's per-step row counts (presumably rows removed — confirm
# against cleaning_func).
df = clean_manufacturer(df, ['სხვა', 'TESLA'],
                        'Manufacturer')  # Manufacturer => 3 rows
df = clean_price(df, min=500, max=1000000)  # Price => 1664 rows
df = clean_mileage(df, min=500, max=1000000)  # Mileage => 677 rows
df = clean_engine_volume(df, 0.5, 8)  # Engine volume => 28 rows
df = clean_cylinders(df, min=3.0, max=12)  # Cylinders => 37 rows
df = clean_doors(df)  # Doors => 0 rows
# df = clean_model(df)  # Model => 0 rows
df = clean_wheel_v2(df)  # Wheel => 0 rows
df = clean_leather_interior_v2(df)  # Leather interior => 0 rows

# Duplicate removal and row filtering can leave gaps in the index. The
# encoded / scaled frames built later in this notebook are created from
# NumPy arrays and therefore get a fresh 0..n-1 index; renumbering df here
# keeps them aligned row-for-row with df (e.g. with the Price target) for
# any index-based join. This is a no-op if the helpers already reset it.
df = df.reset_index(drop=True)

In [271]:
# Widen the pandas display limits so the preview below is not truncated.
for option_name, option_value in (('display.max_columns', 500),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Show the first rows of the cleaned frame.
print(df.head())

   Price Manufacturer  Prod. year   Category Leather interior Fuel type  Engine volume  Mileage  Cylinders Gear box type Drive wheels Doors        Wheel   Color  Airbags  Turbo
0  13328        LEXUS        2010       Jeep          leather    Hybrid            3.5   186005        6.0     Automatic          4x4     5   Left_wheel  Silver       12  False
1  16621    CHEVROLET        2011       Jeep       no_leather    Petrol            3.0   192000        6.0     Tiptronic          4x4     5   Left_wheel   Black        8  False
2   8467        HONDA        2006  Hatchback       no_leather    Petrol            1.3   200000        4.0      Variator        Front     5  Right_wheel   Black        2  False
3   3607         FORD        2011       Jeep          leather    Hybrid            2.5   168966        4.0     Automatic          4x4     5   Left_wheel   White        0  False
4  11726        HONDA        2014  Hatchback          leather    Petrol            1.3    91901        4.0     Auto

# Encodage

In [272]:
# Columns to one-hot encode (Model is still missing)
columns_to_one_hot_encode = [
    'Gear box type',
    'Leather interior',
    'Manufacturer',
    'Category',
    'Fuel type',
    'Drive wheels',
    'Doors',
    'Wheel',
    'Color',
    'Turbo',
]

# Chosen encoder, configured for dense output.
# Open question from the author: would sparse output work here, and would it bring any advantage?
one_hot_encoder = OneHotEncoder(sparse_output=False)

# Build the column transformation.
# NOTE: only the columns listed above are passed to fit_transform below, so
# remainder='passthrough' has nothing left to pass through in this cell.
column_transformer = ColumnTransformer(
    transformers=[('encoder', one_hot_encoder, columns_to_one_hot_encode)],
    remainder='passthrough',
)

# Fit the transformer and encode the selected columns in one step.
data_one_hot_encoded = column_transformer.fit_transform(df[columns_to_one_hot_encode])

# Wrap the encoded array in a new DataFrame labelled with the generated feature names.
columns_encoded = column_transformer.get_feature_names_out()
data_encoded = pd.DataFrame(data=data_one_hot_encoded, columns=columns_encoded)

# Print the first rows of the encoded frame as a sanity check.
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)
print(data_encoded.head())

   encoder__Gear box type_Automatic  encoder__Gear box type_Manual  encoder__Gear box type_Tiptronic  encoder__Gear box type_Variator  encoder__Leather interior_leather  encoder__Leather interior_no_leather  encoder__Manufacturer_ACURA  encoder__Manufacturer_ALFA ROMEO  encoder__Manufacturer_ASTON MARTIN  encoder__Manufacturer_AUDI  encoder__Manufacturer_BENTLEY  encoder__Manufacturer_BMW  encoder__Manufacturer_BUICK  encoder__Manufacturer_CADILLAC  encoder__Manufacturer_CHEVROLET  encoder__Manufacturer_CHRYSLER  encoder__Manufacturer_CITROEN  encoder__Manufacturer_DAEWOO  encoder__Manufacturer_DAIHATSU  encoder__Manufacturer_DODGE  encoder__Manufacturer_FERRARI  encoder__Manufacturer_FIAT  encoder__Manufacturer_FORD  encoder__Manufacturer_GAZ  encoder__Manufacturer_GMC  encoder__Manufacturer_HAVAL  encoder__Manufacturer_HONDA  encoder__Manufacturer_HUMMER  encoder__Manufacturer_HYUNDAI  encoder__Manufacturer_INFINITI  encoder__Manufacturer_ISUZU  encoder__Manufacturer_JAGUAR  encoder_

# Normalisation

##### StandardScaler()

In [273]:
# Columns to standardise (Levy is still missing)
columns_to_scale = [
    'Prod. year',
    'Engine volume',
    'Mileage',
    'Cylinders',
    'Airbags',
]

# Chosen scaler
standard_scaler = StandardScaler()

# Build the column transformation.
standard_column_transformer = ColumnTransformer(
    transformers=[('scaler', standard_scaler, columns_to_scale)],
    remainder='passthrough',
)

# Fit the transformer and scale the selected columns in one step.
scaled_values = standard_column_transformer.fit_transform(df[columns_to_scale])

# Wrap the scaled array in a new DataFrame labelled with the generated feature names.
columns_scaled = standard_column_transformer.get_feature_names_out()
data_scaled = pd.DataFrame(data=scaled_values, columns=columns_scaled)

print(data_scaled.head())

   scaler__Prod. year  scaler__Engine volume  scaler__Mileage  scaler__Cylinders  scaler__Airbags
0           -0.198149               1.461008         0.426522           1.260359         1.311633
1           -0.013191               0.858793         0.486614           1.260359         0.353711
2           -0.937981              -1.188739         0.566805          -0.485656        -1.083171
3           -0.013191               0.256578         0.255726          -0.485656        -1.562131
4            0.541684              -1.188739        -0.516757          -0.485656        -0.604210
