In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

# Linear Regression model of car prices.

In [60]:
""" Reading csv file."""
car_price_df = pd.read_csv('CarPrice.csv')
targets = car_price_df['price']
car_price_df.drop(['price'], axis=1, inplace=True)
car_price_df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22


## Preapring dataset
1. encode nominal and ordinal data
2. normalize data

### Encoding:

In [61]:
"""Clean CarName column and apply binary encoding."""

car_price_df['CarName'] = car_price_df['CarName'].apply(lambda x: str(x).split()[0])
cleanup_nums = {"CarName": { "maxda": "mazda" , "porcshce": "porsche" , "Nissan": "nissan" , "vokswagen": "volkswagen", "toyouta" : "toyota", "vw" : "volkswagen"} }
car_price_df.replace(cleanup_nums, inplace=True)
bi_encoder = ce.BinaryEncoder(cols=['CarName'])
car_price_df = bi_encoder.fit_transform(car_price_df)

In [62]:
"""Encode fueltype column."""

car_price_df['fueltype'] = (car_price_df['fueltype'] == 'gas').astype(int)

In [63]:
"""Encode aspiration column."""

car_price_df['aspiration'] = (car_price_df['aspiration'] == 'std').astype(int)

In [64]:
"""Encode doornumber column"""
car_price_df.replace({'doornumber': {'two': 2, 'four' : 4}}, inplace= True)

In [65]:
"""Apply binary encoding to carbody, drivewheel, enginetype and fuelsystem columns"""
bi_encoder = ce.BinaryEncoder(cols=['carbody', 'drivewheel', 'enginetype', 'fuelsystem'])
car_price_df = bi_encoder.fit_transform(car_price_df)

In [66]:
"""Encode enginelocation column"""
car_price_df['enginelocation'] = (car_price_df['enginelocation'] == 'front').astype(int)

In [67]:
"""Encode cylindernumber column"""
cylinders = {'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'eight': 8, 'twelve': 12 }
car_price_df.replace({'cylindernumber': cylinders}, inplace=True)

### Normalizing:

In [68]:
std_scaler = StandardScaler()
df_normalized = pd.DataFrame(std_scaler.fit_transform(car_price_df), columns=car_price_df.columns)
df_normalized.drop(['car_ID'], axis=1, inplace=True)


In [69]:
"""Split dataset for train and test parts."""
X_train, X_test, y_train, y_test = train_test_split(df_normalized, targets, test_size=0.2, random_state=0)

## Regression:

In [70]:
def print_metrics(y_actual, y_pred):
      print(f'Mean Absolute Error: {metrics.mean_absolute_error(y_true=y_actual, y_pred=y_pred)}')
      print(f'Mean Squared Error: {metrics.mean_squared_error(y_true=y_actual, y_pred=y_pred)}')
      print(f'Root Mean Squared Error: {np.sqrt(metrics.mean_absolute_error(y_true=y_actual, y_pred=y_pred))}')
      print(f'Coefficient of determination {metrics.r2_score(y_true=y_actual, y_pred=y_pred)}')

In [71]:
regression = LinearRegression()
regression.fit(X_train, y_train)


In [72]:
y_pred = regression.predict(X_test)
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print_metrics(y_test, y_pred)

Mean Absolute Error: 2244.059009926839
Mean Squared Error: 12800749.995202485
Root Mean Squared Error: 47.37149997547934
Coefficient of determination 0.8346505791648887
