# Nettoyage

In [43]:
# TODO:
# Factoriser encodage et normalisation
# Finir nettoyage model
# Levy ?
#  Certains model nom doublé ex (x-trail x-trail)
#  Alphabet géorgien à extraire des modèles
#  Suppr des classes pas suffisement représentés ?
#  Vérifier le type des series

In [44]:
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from cleaning_func import clean_manufacturer, clean_mileage, clean_price, clean_engine_volume, clean_doors, clean_model, clean_cylinders, clean_wheel_v2, clean_leather_interior_v2

In [45]:
df = pd.read_csv('./original.csv')

In [46]:
# Suppression des doublons
df.drop_duplicates(keep='first', inplace=True)

In [47]:
# Colonnes enlevés: ID ; À nettoyer et ajouter : Levy, Model
df = df[['Price', 'Manufacturer', 'Prod. year', 'Category', 'Leather interior',
         'Fuel type', 'Engine volume', 'Mileage', 'Cylinders', 'Gear box type', 'Drive wheels', 'Doors', 'Wheel', 'Color','Airbags']]

In [48]:
df = clean_manufacturer(df, ['სხვა', 'TESLA'],
                        'Manufacturer')  # Manufacturer => 3 lignes
df = clean_price(df, min=500, max=1000000)  # Price => 1664 lignes
df = clean_mileage(df, min=500, max=1000000)  # Mileage => 677 lignes
df = clean_engine_volume(df, 0.5, 8)  # Engine volume => 28 lignes
df = clean_cylinders(df, min=3.0, max=12)  # Cylinders => 37 lignes
df = clean_doors(df)  # Doors => 0 lignes
# df = clean_model(df)  # Model => 0 lignes
df = clean_wheel_v2(df)  # Wheel => 0 lignes
df = clean_leather_interior_v2(df)  # Leather interior => 0 lignes

In [49]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
print(df.head())

   Price Manufacturer  Prod. year   Category Leather interior Fuel type  Engine volume  Mileage  Cylinders Gear box type Drive wheels Doors        Wheel   Color  Airbags  Turbo
0  13328        LEXUS        2010       Jeep          leather    Hybrid            3.5   186005        6.0     Automatic          4x4     5   Left_wheel  Silver       12  False
1  16621    CHEVROLET        2011       Jeep       no_leather    Petrol            3.0   192000        6.0     Tiptronic          4x4     5   Left_wheel   Black        8  False
2   8467        HONDA        2006  Hatchback       no_leather    Petrol            1.3   200000        4.0      Variator        Front     5  Right_wheel   Black        2  False
3   3607         FORD        2011       Jeep          leather    Hybrid            2.5   168966        4.0     Automatic          4x4     5   Left_wheel   White        0  False
4  11726        HONDA        2014  Hatchback          leather    Petrol            1.3    91901        4.0     Auto

In [50]:
# Correspond à mon transformer custom ??
def process(transformer , transformer_type, columns_to_process, df: pd.DataFrame):
    
    # Créer la transformation des colonnes
    column_transformer = ColumnTransformer([(transformer_type, transformer, columns_to_process)], remainder='passthrough')

    # Appliquer la transformation aux données
    data_encoded = column_transformer.fit_transform(df[columns_to_process])

    # Créer un nouveau dataframe avec les colonnes encodées
    columns_encoded = column_transformer.get_feature_names_out()
    data_encoded = pd.DataFrame(data_encoded, columns=columns_encoded)

    return column_transformer, data_encoded


# Encodage

In [51]:
# Encodage choisi
one_hot_encoder = OneHotEncoder(sparse_output=False) # Possible avec sparse_output ? avantage ?

# Colonnes à encoder (Manque model)
columns_to_one_hot_encode = ['Gear box type','Leather interior','Manufacturer',
                 'Category', 'Fuel type', 'Drive wheels', 'Doors',
                   'Wheel', 'Color', 'Turbo']

# Créer la transformation des colonnes
column_transformer = ColumnTransformer([('encoder', one_hot_encoder, columns_to_one_hot_encode)], remainder='passthrough')

# Appliquer la transformation aux données
data_one_hot_encoded = column_transformer.fit_transform(df[columns_to_one_hot_encode])

# Créer un nouveau dataframe avec les colonnes encodées
columns_encoded = column_transformer.get_feature_names_out()
data_encoded = pd.DataFrame(data_one_hot_encoded, columns=columns_encoded)

# Afficher les premières lignes du dataframe encodé pour vérification
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)
print(data_encoded.head())

   encoder__Gear box type_Automatic  encoder__Gear box type_Manual  encoder__Gear box type_Tiptronic  encoder__Gear box type_Variator  encoder__Leather interior_leather  encoder__Leather interior_no_leather  encoder__Manufacturer_ACURA  encoder__Manufacturer_ALFA ROMEO  encoder__Manufacturer_ASTON MARTIN  encoder__Manufacturer_AUDI  encoder__Manufacturer_BENTLEY  encoder__Manufacturer_BMW  encoder__Manufacturer_BUICK  encoder__Manufacturer_CADILLAC  encoder__Manufacturer_CHEVROLET  encoder__Manufacturer_CHRYSLER  encoder__Manufacturer_CITROEN  encoder__Manufacturer_DAEWOO  encoder__Manufacturer_DAIHATSU  encoder__Manufacturer_DODGE  encoder__Manufacturer_FERRARI  encoder__Manufacturer_FIAT  encoder__Manufacturer_FORD  encoder__Manufacturer_GAZ  encoder__Manufacturer_GMC  encoder__Manufacturer_HAVAL  encoder__Manufacturer_HONDA  encoder__Manufacturer_HUMMER  encoder__Manufacturer_HYUNDAI  encoder__Manufacturer_INFINITI  encoder__Manufacturer_ISUZU  encoder__Manufacturer_JAGUAR  encoder_

# Normalisation

##### StandardScaler()

In [52]:
# Normaliseur choisi
standard_scaler = StandardScaler()

# Colonne à normaliser
columns_to_scale = ['Prod. year', 'Engine volume', 'Mileage', 'Cylinders', 'Airbags']

# Créer la transformation des colonnes
standard_column_transformer = ColumnTransformer([('scaler', standard_scaler, columns_to_scale)],
                                       remainder='passthrough')

# Appliquer la transformation aux données
data_scaled = standard_column_transformer.fit_transform(df[columns_to_scale])

# Créer un nouveau dataframe avec les colonnes normalisées
columns_scaled = standard_column_transformer.get_feature_names_out()
data_scaled = pd.DataFrame(data_scaled, columns=columns_scaled)

print(data_scaled.head())

   scaler__Prod. year  scaler__Engine volume  scaler__Mileage  scaler__Cylinders  scaler__Airbags
0           -0.198149               1.461008         0.426522           1.260359         1.311633
1           -0.013191               0.858793         0.486614           1.260359         0.353711
2           -0.937981              -1.188739         0.566805          -0.485656        -1.083171
3           -0.013191               0.256578         0.255726          -0.485656        -1.562131
4            0.541684              -1.188739        -0.516757          -0.485656        -0.604210


##### RobustScaler() (Manque Levy)

In [53]:
column_transformer_robust, data_scaled_robust = process(RobustScaler(), 'scaler', ['Mileage', 'Airbags'], df)
print(data_scaled_robust.head())

   scaler__Mileage  scaler__Airbags
0         0.520801             1.00
1         0.574628             0.50
2         0.646456            -0.25
3         0.367815            -0.50
4        -0.324117             0.00


## Concaténer les transformed dataframe

In [54]:
concatened_transformed_df = pd.concat([data_scaled_robust, data_scaled, data_encoded],axis=1)
print(concatened_transformed_df.head())
print(concatened_transformed_df.info())
print(concatened_transformed_df.columns)

   scaler__Mileage  scaler__Airbags  scaler__Prod. year  scaler__Engine volume  scaler__Mileage  scaler__Cylinders  scaler__Airbags  encoder__Gear box type_Automatic  encoder__Gear box type_Manual  encoder__Gear box type_Tiptronic  encoder__Gear box type_Variator  encoder__Leather interior_leather  encoder__Leather interior_no_leather  encoder__Manufacturer_ACURA  encoder__Manufacturer_ALFA ROMEO  encoder__Manufacturer_ASTON MARTIN  encoder__Manufacturer_AUDI  encoder__Manufacturer_BENTLEY  encoder__Manufacturer_BMW  encoder__Manufacturer_BUICK  encoder__Manufacturer_CADILLAC  encoder__Manufacturer_CHEVROLET  encoder__Manufacturer_CHRYSLER  encoder__Manufacturer_CITROEN  encoder__Manufacturer_DAEWOO  encoder__Manufacturer_DAIHATSU  encoder__Manufacturer_DODGE  encoder__Manufacturer_FERRARI  encoder__Manufacturer_FIAT  encoder__Manufacturer_FORD  encoder__Manufacturer_GAZ  encoder__Manufacturer_GMC  encoder__Manufacturer_HAVAL  encoder__Manufacturer_HONDA  encoder__Manufacturer_HUMMER  

In [55]:
# concatened_transformed_df.to_csv('./cleaned_v2.csv', index=False)

In [56]:
# Mini test
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

X = concatened_transformed_df
y = df['Price']
# y = X['Price'] ?

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

model = SVR(kernel='rbf')

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(mean_squared_error(y_test, y_pred, squared=False))
print(r2_score(y_test, y_pred))

print("y_pred", y_pred)
# print("y_test",y_test)

19332.693025833672
-0.03296598686586938
y_pred [14876.46981641 15436.32821039 15395.96325884 ... 14778.8284793
 15524.08759161 15095.78156571]
