# Structure of the Data

# Libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder

## Connection to Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): presumably the data files loaded later in the notebook live
# on that drive — confirm against the data-importation section.
from google.colab import drive
drive.mount('/content/drive')

# Home-made functions and classes

## Preprocessing classes

### Preprocess *Ad_table* and *Trim*

In [None]:
class preprocess_Ad_Table_Trim():
    """Clean the advertisement table and attach per-model gas emissions.

    On construction the class:

    1. renames the ``' Genmodel_ID'`` column (leading space) to
       ``'Genmodel_ID'`` on a copy of ``Ad_table``;
    2. converts the textual measurement columns to numbers and fills their
       missing values with the mean of the same ``Genmodel_ID``
       (see ``clean_obj_and_NaN``);
    3. left-merges the mean ``Gas_emission`` per ``Genmodel_ID`` computed
       from ``Trim`` (see ``Merge_Ad_Trim``).

    Attributes:
        df: the cleaned advertisement table.
        Trim: the trim table passed to the constructor.
        merged_df: ``df`` with the extra ``Gas_emission`` column.
    """

    # Columns removed by ``final_set`` when the caller gives no explicit
    # ``columns_to_drop``. Stored as a tuple so the default can never be
    # mutated between calls.
    DEFAULT_COLUMNS_TO_DROP = (
        'Maker',
        ' Genmodel',
        'Genmodel_ID',
        'Adv_ID',
        'Adv_year',
        'Adv_month',
        'Reg_year',
        'Annual_Tax',
        'Color',
    )

    # Text marker removed from each column before the numeric conversion.
    _UNIT_MARKERS = {
        'Engin_size': 'L',
        'Top_speed': ' mph',
        'Average_mpg': ' mpg',
    }

    # Columns whose missing values are filled with the per-model mean.
    _IMPUTED_COLUMNS = ('Engine_power', 'Top_speed', 'Average_mpg', 'Engin_size')

    # Columns label-encoded by ``encoding``.
    _CATEGORICAL_COLUMNS = ('Bodytype', 'Gearbox', 'Fuel_type', 'Seat_num', 'Door_num')

    def __init__(self, Ad_table, Trim):
        # ``rename`` returns a new frame, so the caller's table is not modified.
        self.df = Ad_table.rename(columns={' Genmodel_ID' : 'Genmodel_ID'})
        self.df = self.clean_obj_and_NaN()
        self.Trim = Trim
        self.merged_df = self.Merge_Ad_Trim()

    def clean_obj_and_NaN(self):
        """Convert text columns to numbers and fill missing values.

        Step 1 strips the unit marker from ``Engin_size``, ``Top_speed`` and
        ``Average_mpg`` and converts them, together with ``Runned_Miles``,
        to numeric values; anything unparsable becomes NaN. Each column is
        checked on its own, so a column that is already numeric is left
        alone while the others are still converted.

        Step 2 fills the missing values of ``Engine_power``, ``Top_speed``,
        ``Average_mpg`` and ``Engin_size`` with the mean of the rows sharing
        the same ``Genmodel_ID``. Values that are already present are never
        overwritten, including on rows whose ``Genmodel_ID`` is missing.

        Returns:
            ``self.df``, modified in place.
        """
        df_ = self.df

        # Step 1 : remove strings in object and convert in int or float
        for column, marker in self._UNIT_MARKERS.items():
            if not pd.api.types.is_numeric_dtype(df_[column]):
                stripped = df_[column].str.replace(marker, '', regex=False)
                df_[column] = pd.to_numeric(stripped, errors='coerce')

        # Runned_Miles has no unit marker; the conversion is a no-op when
        # the column is already numeric.
        df_['Runned_Miles'] = pd.to_numeric(df_['Runned_Miles'], errors='coerce')

        # Step 2 : Fulfill Na's cells
        for column in self._IMPUTED_COLUMNS:
            imputed = df_.groupby('Genmodel_ID')[column].transform(
                lambda values: values.fillna(values.mean())
            )
            # groupby excludes rows with a missing key, so ``imputed`` has no
            # usable value for them. Filling only the gaps of the original
            # column keeps the values those rows already had.
            df_[column] = df_[column].fillna(imputed)

        return df_

    def encoding(self):
        """Label-encode the categorical columns of ``self.df`` in place.

        Each of ``Bodytype``, ``Gearbox``, ``Fuel_type``, ``Seat_num`` and
        ``Door_num`` gets its own ``LabelEncoder``.

        NOTE: ``merged_df`` is built in ``__init__`` as a separate frame, so
        calling this method afterwards does not change ``merged_df`` nor the
        output of ``get_full_data`` / ``final_set``.

        Returns:
            ``self.df`` with the categorical columns replaced by integer codes.
        """
        df_ = self.df

        # Step 3 : Encode categorical data
        for column in self._CATEGORICAL_COLUMNS:
            df_[column] = LabelEncoder().fit_transform(df_[column])

        return df_

    def Merge_Ad_Trim(self):
        """Attach the mean ``Gas_emission`` of each model to ``self.df``.

        Returns:
            A new frame: ``self.df`` left-merged on ``Genmodel_ID`` with the
            per-model mean of ``Trim['Gas_emission']``. Rows without a match
            get NaN in ``Gas_emission``.
        """
        # Step 4 : Merge data
        mean_emission = self.Trim.groupby(['Genmodel_ID'])['Gas_emission'].mean()

        return self.df.merge(mean_emission, on='Genmodel_ID', how='left')

    def select_columns(self,
                       df_,
                       columns_to_drop=None,
                       columns_to_keep=None):
        """Select the columns to keep for the output. Run before dropping NaN.

        Args:
            df_: frame to select from.
            columns_to_drop: columns to remove. Takes precedence over
                ``columns_to_keep`` when both are non-empty.
            columns_to_keep: columns to retain, used only when
                ``columns_to_drop`` is empty or ``None``.

        Returns:
            The reduced frame, or ``df_`` unchanged when both selections are
            empty or ``None``.
        """
        if columns_to_drop is not None and len(columns_to_drop) != 0:
            return df_.drop(columns=columns_to_drop)

        if columns_to_keep is not None and len(columns_to_keep) != 0:
            return df_[columns_to_keep]

        return df_

    def get_full_data(self,
                      drop_nan=False,
                      columns_to_drop=None,
                      columns_to_keep=None):
        """Prepare the merged data as a dataframe for descriptive statistics.

        Args:
            drop_nan: when true, rows containing any NaN are removed after
                the column selection.
            columns_to_drop: forwarded to ``select_columns``.
            columns_to_keep: forwarded to ``select_columns``.

        Returns:
            The selected view of ``merged_df``.
        """
        df_ = self.select_columns(self.merged_df, columns_to_drop, columns_to_keep)

        if drop_nan:
            df_ = df_.dropna()

        return df_

    def final_set(self,
                  columns_to_drop=None,
                  columns_to_keep=None):
        """Prepare the data to be trained.

        Args:
            columns_to_drop: columns to remove before training. ``None``
                (the default) means ``DEFAULT_COLUMNS_TO_DROP``.
            columns_to_keep: columns to retain; only used when
                ``columns_to_drop`` is explicitly empty, and must then
                include ``'Price'``.

        Returns:
            ``(y, X)``: the ``Price`` column and the remaining columns,
            after dropping every row that contains a NaN.
        """
        if columns_to_drop is None:
            columns_to_drop = list(self.DEFAULT_COLUMNS_TO_DROP)

        # columns_to_keep is forwarded too; it used to be accepted but ignored.
        df_ = self.select_columns(self.merged_df, columns_to_drop, columns_to_keep)

        df_ = df_.dropna()

        y = df_['Price']

        X = df_.drop(columns='Price')

        return y, X

### Preprocess *Images*

# Data importation

# Descriptive statistics

# Machine learning models