## Add Imports

In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

## Load files and pre-process them

### – Training data

In [2]:
# Load training data using pandas
df_train = pd.read_csv('data_train.csv')

In [3]:
# Check for null values
df_train.isnull().sum()

manufacturer_name         0
transmission              0
color                     0
odometer_value            0
year_produced             0
engine_fuel               0
engine_type               0
engine_capacity          15
body_type                 0
has_warranty              0
ownership                 0
type_of_drive             0
is_exchangeable           0
number_of_photos          0
number_of_maintenance     0
duration_listed           0
price_usd                 0
dtype: int64

* The column **engine_capacity** has 15 fields with no values, so we fill them with the median **engine_capacity** value

In [4]:
# Get the median
e_cap_median = df_train.engine_capacity.median()

# Fill all NaN fields in engine_capacity
df_train.engine_capacity = df_train.engine_capacity.fillna(e_cap_median)

In [5]:
df_train.isnull().sum()

manufacturer_name        0
transmission             0
color                    0
odometer_value           0
year_produced            0
engine_fuel              0
engine_type              0
engine_capacity          0
body_type                0
has_warranty             0
ownership                0
type_of_drive            0
is_exchangeable          0
number_of_photos         0
number_of_maintenance    0
duration_listed          0
price_usd                0
dtype: int64

* Encode all non-numeric columns to numeric values

In [6]:
# Get data information
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   manufacturer_name      50000 non-null  object 
 1   transmission           50000 non-null  object 
 2   color                  50000 non-null  object 
 3   odometer_value         50000 non-null  int64  
 4   year_produced          50000 non-null  int64  
 5   engine_fuel            50000 non-null  object 
 6   engine_type            50000 non-null  object 
 7   engine_capacity        50000 non-null  float64
 8   body_type              50000 non-null  object 
 9   has_warranty           50000 non-null  bool   
 10  ownership              50000 non-null  object 
 11  type_of_drive          50000 non-null  object 
 12  is_exchangeable        50000 non-null  bool   
 13  number_of_photos       50000 non-null  int64  
 14  number_of_maintenance  50000 non-null  int64  
 15  du

In [8]:
# Encode using LabelEncoder
# Encode 'manufacturer_name'
df_train.manufacturer_name = LabelEncoder().fit_transform(df_train.manufacturer_name)


# Encode 'transmission'
df_train.transmission = LabelEncoder().fit_transform(df_train.transmission)


# Encode 'color'
df_train.color = LabelEncoder().fit_transform(df_train.color)


# Encode 'engine_fuel'
df_train.engine_fuel = LabelEncoder().fit_transform(df_train.engine_fuel)


# Encode 'engine_type'
df_train.engine_type = LabelEncoder().fit_transform(df_train.engine_type)


# Encode 'body_type'
df_train.body_type = LabelEncoder().fit_transform(df_train.body_type)


# Encode 'has_warranty'
df_train.has_warranty = LabelEncoder().fit_transform(df_train.has_warranty)


# Encode 'ownership'
df_train.ownership = LabelEncoder().fit_transform(df_train.ownership)


# Encode 'type_of_drive'
df_train.type_of_drive = LabelEncoder().fit_transform(df_train.type_of_drive)


# Encode 'is_exchangeable'
df_train.is_exchangeable = LabelEncoder().fit_transform(df_train.is_exchangeable)

df_train

Unnamed: 0,manufacturer_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_type,engine_capacity,body_type,has_warranty,ownership,type_of_drive,is_exchangeable,number_of_photos,number_of_maintenance,duration_listed,price_usd
0,48,0,0,130000,2016,0,0,1.6,10,0,2,1,1,17,38,67,13150.0
1,39,1,2,149000,2012,3,2,1.6,8,0,2,1,0,9,3,100,7500.0
2,23,0,2,110000,2014,3,2,1.6,2,0,2,1,0,5,10,91,12200.0
3,35,0,6,255100,2007,3,2,1.8,2,0,2,1,0,10,4,91,4950.0
4,30,1,1,650000,1999,3,2,2.0,8,0,2,1,1,5,7,62,3000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,35,1,10,380000,1996,3,2,1.6,8,0,2,1,0,2,2,59,3500.0
49996,48,1,3,311213,1994,2,2,1.8,10,0,2,1,0,15,7,29,2850.0
49997,13,1,7,250000,1999,3,2,1.8,1,0,2,1,0,7,13,108,2000.0
49998,3,0,4,615000,1998,0,0,2.5,10,0,2,2,1,10,26,64,5080.0


### – Testing data

In [9]:
# Load testing data
df_test = pd.read_csv('data_test.csv')

In [10]:
# Check for null values
df_test.isnull().sum()

manufacturer_name        0
transmission             0
color                    0
odometer_value           0
year_produced            0
engine_fuel              0
engine_type              0
engine_capacity          5
body_type                0
has_warranty             0
ownership                0
type_of_drive            0
is_exchangeable          0
number_of_photos         0
number_of_maintenance    0
duration_listed          0
price_usd                0
dtype: int64

* The column **engine_capacity** has 5 fields with no values, so we fill it with the median **engine_capacity** value of the "**TRAINING DATASET**"

In [11]:
# Fill all NaN fields in engine_capacity
df_test.engine_capacity = df_test.engine_capacity.fillna(e_cap_median)

In [12]:
df_test.isnull().sum()

manufacturer_name        0
transmission             0
color                    0
odometer_value           0
year_produced            0
engine_fuel              0
engine_type              0
engine_capacity          0
body_type                0
has_warranty             0
ownership                0
type_of_drive            0
is_exchangeable          0
number_of_photos         0
number_of_maintenance    0
duration_listed          0
price_usd                0
dtype: int64

* Encode all non-numeric columns to numeric values

In [13]:
# Encode using LabelEncoder
# Encode 'manufacturer_name'
df_test.manufacturer_name = LabelEncoder().fit_transform(df_test.manufacturer_name)


# Encode 'transmission'
df_test.transmission = LabelEncoder().fit_transform(df_test.transmission)


# Encode 'color'
df_test.color = LabelEncoder().fit_transform(df_test.color)


# Encode 'engine_fuel'
df_test.engine_fuel = LabelEncoder().fit_transform(df_test.engine_fuel)


# Encode 'engine_type'
df_test.engine_type = LabelEncoder().fit_transform(df_test.engine_type)


# Encode 'body_type'
df_test.body_type = LabelEncoder().fit_transform(df_test.body_type)


# Encode 'has_warranty'
df_test.has_warranty = LabelEncoder().fit_transform(df_test.has_warranty)


# Encode 'ownership'
df_test.ownership = LabelEncoder().fit_transform(df_test.ownership)


# Encode 'type_of_drive'
df_test.type_of_drive = LabelEncoder().fit_transform(df_test.type_of_drive)


# Encode 'is_exchangeable'
df_test.is_exchangeable = LabelEncoder().fit_transform(df_test.is_exchangeable)

df_test

Unnamed: 0,manufacturer_name,transmission,color,odometer_value,year_produced,engine_fuel,engine_type,engine_capacity,body_type,has_warranty,ownership,type_of_drive,is_exchangeable,number_of_photos,number_of_maintenance,duration_listed,price_usd
0,3,0,10,115000,2012,3,2,4.4,8,0,2,0,1,32,104,146,20450.0
1,31,1,6,500000,1999,0,0,2.2,8,0,2,2,0,7,9,147,2600.0
2,13,1,8,210000,2002,3,2,1.2,2,0,2,1,1,16,7,27,2900.0
3,33,0,9,294000,2000,0,0,3.2,9,0,2,0,1,10,2,48,7500.0
4,35,0,1,244000,1998,3,2,1.6,8,0,2,1,0,9,10,116,2200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26995,48,1,9,92000,1993,3,2,2.0,2,0,2,1,0,13,11,65,3333.0
26996,19,0,8,135185,2009,2,2,3.5,8,0,2,0,0,8,2,85,8500.0
26997,48,1,10,450000,1993,3,2,1.8,8,0,2,1,0,6,1,35,1100.0
26998,2,1,4,275000,2006,3,2,2.0,8,0,2,1,0,7,28,115,6300.0


## Split the features from the label/"price_usd"

In [14]:
# Split training dataset
features_train = df_train.drop(['price_usd'],axis=1)
label_train = df_train.price_usd

# Split testing dataset
features_test = df_test.drop(['price_usd'],axis=1)
label_test = df_test.price_usd

## 1. Linear Regression Model

### Model Training

In [16]:
# Create a linear regression model
linReg = LinearRegression()
linReg.fit(features_train, label_train)

LinearRegression()

### Model Evaluation

In [17]:
# Get the predictions
test_prediction = linReg.predict(features_test)

    1. Mean Absolute Error:

In [19]:
mae = mean_absolute_error(test_prediction, label_test)
mae

2449.474660881621

    2. R square:

In [21]:
errScore = r2_score(test_prediction, label_test)
errScore

0.48778975854439466

## Notes:
1. Since the price of a car is continous and dependeant on some independant features, a linear regression model would be a suitable choice.
2. Eventhough when looking at it, the number of photos shouldn't add any value to the car being sold, but after a bit of testing it turned out including the number of photos does minimize the error margin in both metrics used above.