# Rihal Data Science Challenge

You are allowed to use any method in reading, loading, transforming the data. Additionally, you can use any model to predict the price of used cars. 

**Make sure** you explain your approach at each step, and to evaluate your model. Finally, you must show the Mean Asbolute Error from your model's predictions when compared with the test dataset's prices. 

---------------------------

## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

pd.options.display.max_columns = 110

## Read Data

In [2]:
df_train = pd.read_csv('data_train.csv')
df_test = pd.read_csv('data_test.csv')

In [3]:
display(df_train.head())
display(df_test.head())

Unnamed: 0,manufacturer_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_type,engine_capacity,body_type,has_warranty,ownership,type_of_drive,is_exchangeable,number_of_photos,number_of_maintenance,duration_listed,price_usd
0,Volkswagen,automatic,black,130000,2016,diesel,diesel,1.6,universal,False,owned,front,True,17,38,67,13150.0
1,Renault,manual,brown,149000,2012,gasoline,gasoline,1.6,sedan,False,owned,front,False,9,3,100,7500.0
2,Kia,automatic,brown,110000,2014,gasoline,gasoline,1.6,hatchback,False,owned,front,False,5,10,91,12200.0
3,Opel,automatic,other,255100,2007,gasoline,gasoline,1.8,hatchback,False,owned,front,False,10,4,91,4950.0
4,Mazda,manual,blue,650000,1999,gasoline,gasoline,2.0,sedan,False,owned,front,True,5,7,62,3000.0


Unnamed: 0,manufacturer_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_type,engine_capacity,body_type,has_warranty,ownership,type_of_drive,is_exchangeable,number_of_photos,number_of_maintenance,duration_listed,price_usd
0,BMW,automatic,white,115000,2012,gasoline,gasoline,4.4,sedan,False,owned,all,True,32,104,146,20450.0
1,Mercedes-Benz,manual,other,500000,1999,diesel,diesel,2.2,sedan,False,owned,rear,False,7,9,147,2600.0
2,Fiat,manual,silver,210000,2002,gasoline,gasoline,1.2,hatchback,False,owned,front,True,16,7,27,2900.0
3,Mitsubishi,automatic,violet,294000,2000,diesel,diesel,3.2,suv,False,owned,all,True,10,2,48,7500.0
4,Opel,automatic,blue,244000,1998,gasoline,gasoline,1.6,sedan,False,owned,front,False,9,10,116,2200.0


## Number of unique values for each column

In [4]:
# we need to check the number of unique values in each column
# just to have an idea of the structure of the data and to identify 
# categorical variables.
df_train.nunique()

manufacturer_name          55
transmission                2
color                      12
odometer_value           5455
year_produced              63
engine_fuel                 6
engine_type                 3
engine_capacity            61
body_type                  12
has_warranty                2
ownership                   3
type_of_drive               3
is_exchangeable             2
number_of_photos           60
number_of_maintenance     365
duration_listed           760
price_usd                2495
dtype: int64

In [5]:
df_test.nunique()

manufacturer_name          55
transmission                2
color                      12
odometer_value           3915
year_produced              63
engine_fuel                 5
engine_type                 3
engine_capacity            59
body_type                  12
has_warranty                2
ownership                   3
type_of_drive               3
is_exchangeable             2
number_of_photos           54
number_of_maintenance     311
duration_listed           654
price_usd                2034
dtype: int64

### Categorical variables to Numeric variables

In [11]:
# first we encode the categorical independent variables
# that is, has_warranty, is_exchangeable, transmission, colour,
# manufacturer_name etc...
# we do this by creating dummy variables for each value and concatenating 
# them to the current data 


# defining categorical variables
categorical_columns = ['manufacturer_name', 'transmission', 'engine_fuel', 'engine_type', 
                       'has_warranty', 'is_exchangeable', 'type_of_drive', 'color', 'body_type',
                       'ownership']

df_encode_train = pd.get_dummies(data = df_train, prefix = 'OHE', prefix_sep='_',
               columns = categorical_columns,
               drop_first = True,
              dtype='int8')

In [10]:
display(df_encode_train)

Unnamed: 0,odometer_value,year_produced,engine_capacity,number_of_photos,number_of_maintenance,duration_listed,price_usd,OHE_Alfa Romeo,OHE_Audi,OHE_BMW,OHE_Buick,OHE_Cadillac,OHE_Chery,OHE_Chevrolet,OHE_Chrysler,OHE_Citroen,OHE_Dacia,OHE_Daewoo,OHE_Dodge,OHE_Fiat,OHE_Ford,OHE_Geely,OHE_Great Wall,OHE_Honda,OHE_Hyundai,OHE_Infiniti,OHE_Iveco,OHE_Jaguar,OHE_Jeep,OHE_Kia,OHE_LADA,OHE_Lancia,OHE_Land Rover,OHE_Lexus,OHE_Lifan,OHE_Lincoln,OHE_Mazda,OHE_Mercedes-Benz,OHE_Mini,OHE_Mitsubishi,OHE_Nissan,OHE_Opel,OHE_Peugeot,OHE_Pontiac,OHE_Porsche,OHE_Renault,OHE_Rover,OHE_Saab,OHE_Seat,OHE_Skoda,OHE_SsangYong,OHE_Subaru,OHE_Suzuki,OHE_Toyota,OHE_Volkswagen,OHE_Volvo,OHE_ВАЗ,OHE_ГАЗ,OHE_ЗАЗ,OHE_Москвич,OHE_УАЗ,OHE_manual,OHE_electric,OHE_gas,OHE_gasoline,OHE_hybrid-diesel,OHE_hybrid-petrol,OHE_electric.1,OHE_gasoline.1,OHE_True,OHE_True.1,OHE_front,OHE_rear,OHE_blue,OHE_brown,OHE_green,OHE_grey,OHE_orange,OHE_other,OHE_red,OHE_silver,OHE_violet,OHE_white,OHE_yellow,OHE_coupe,OHE_hatchback,OHE_liftback,OHE_limousine,OHE_minibus,OHE_minivan,OHE_pickup,OHE_sedan,OHE_suv,OHE_universal,OHE_van,OHE_new,OHE_owned
0,130000,2016,1.6,17,38,67,13150.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
1,149000,2012,1.6,9,3,100,7500.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2,110000,2014,1.6,5,10,91,12200.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
3,255100,2007,1.8,10,4,91,4950.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
4,650000,1999,2.0,5,7,62,3000.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,380000,1996,1.6,2,2,59,3500.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
49996,311213,1994,1.8,15,7,29,2850.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
49997,250000,1999,1.8,7,13,108,2000.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
49998,615000,1998,2.5,10,26,64,5080.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
