In [22]:
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

In [23]:
# Load the raw car-price dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('car_price.csv')

In [24]:
# Preview the first five rows to check how the columns were parsed.
df.head(5)

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [25]:
# Normalize column names: lowercase everything and turn spaces into underscores.
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

In [26]:
# Apply the same normalization (lowercase, spaces -> underscores) to the
# values of every object-dtype column.
col_change = df.dtypes[df.dtypes == object].index.tolist()

for column in col_change:
    lowered = df[column].str.lower()
    df[column] = lowered.str.replace(' ', '_')

# Helper function: binary-encode the most frequent values of a categorical column

In [40]:
# Numeric columns that every feature list starts from.
base = ['engine_hp', 'engine_cylinders', 'highway_mpg',
        'city_mpg', 'popularity']


def convert_binary(df, col, idx):
    """Binary-encode the ``idx`` most frequent values of ``df[col]``.

    For each of the most frequent values a new 0/1 column named
    ``"<col>=<value>"`` is added, holding 1 where ``df[col]`` equals that
    value and 0 elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, so the caller's frame is never modified.
    col : str
        Name of the column to encode.
    idx : int
        How many of the most frequent values to encode. If the column has
        fewer distinct values, all of them are encoded.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The copied frame with the indicator columns added, and the feature
        list: the module-level ``base`` columns followed by the new
        indicator column names.
    """
    df = df.copy()
    features = base.copy()

    # value_counts() sorts by frequency (descending); head() already copes
    # with idx being larger than the number of distinct values.
    top_values = df[col].value_counts().head(idx).index.to_list()

    for value in top_values:
        # Prefix with the real column name so indicators built from different
        # columns cannot collide.
        feature = f"{col}={value}"
        df[feature] = (df[col] == value).astype(int)
        features.append(feature)

    return df, features

# for col in list(df.dtypes[df.dtypes == object].index):
#     convert_binary(df, col, 5)
# NOTE: the result of this call is not assigned, and convert_binary works on a
# copy of its input, so `df` itself is left unchanged by this cell.
convert_binary(df, 'make', 5)

# Let's replace our manual regression
# with scikit-learn's implementation

In [27]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [28]:
# NOTE: this expression is neither assigned nor the last line of the cell,
# so the per-column null counts are computed but never shown.
df.isnull().sum()
# Replace every missing value with 0. This applies to all columns, so missing
# entries in text columns become the number 0 as well.
df = df.fillna(0)
#df = df.loc[:, df.dtypes == 'float64'].astype(int)

In [29]:
# Hold out 20% of the rows as the test set, then split the remaining 80%
# into training (67%) and validation (33%) parts. Both seeds are fixed so the
# splits are reproducible.
full_train, test = train_test_split(df, test_size=0.2, random_state=1)
train, valid = train_test_split(full_train, test_size=0.33, random_state=11)

In [30]:
# Targets are modelled on the log1p scale. pop() takes `msrp` out of each
# frame, so the target cannot end up among the features by accident.
y_tr = np.log1p(train.pop('msrp').values)
y_vl = np.log1p(valid.pop('msrp').values)

In [31]:
# Show the columns whose dtype is object.
df.dtypes[df.dtypes == 'object']

make                 object
model                object
engine_fuel_type     object
transmission_type    object
driven_wheels        object
market_category      object
vehicle_size         object
vehicle_style        object
dtype: object

In [32]:
# Names of all object-dtype columns; these become the candidate categorical
# features further down.
full_list_2 = list(df.dtypes[df.dtypes == 'object'].index)
full_list_2

['make',
 'model',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'market_category',
 'vehicle_size',
 'vehicle_style']

In [33]:
# Numeric candidates taken from full_train: float64 columns first, then int64.
a = list(full_train.dtypes[full_train.dtypes == 'float64'].index)
b = list(full_train.dtypes[full_train.dtypes == 'int64'].index)
full_list = [*a, *b]
# The target must not be part of the feature list.
full_list.remove('msrp')
# full_list.remove('year')
full_list

['engine_hp',
 'engine_cylinders',
 'number_of_doors',
 'year',
 'highway_mpg',
 'city_mpg',
 'popularity']

In [34]:
# Correlation of each numeric feature with the target. Features and target
# are both taken from full_train, so the result does not depend on implicit
# index alignment with the complete frame `df`.
full_train[full_list].corrwith(full_train.msrp)

engine_hp           0.648103
engine_cylinders    0.527243
number_of_doors    -0.125982
year                0.225401
highway_mpg        -0.157651
city_mpg           -0.156620
popularity         -0.048147
dtype: float64

In [35]:
# we put numeric & categorical features in separate lists
# (copies, so that editing these lists later does not silently change
# full_list / full_list_2 as well)
numeric = list(full_list)
categoric = list(full_list_2)

In [36]:
# Drop 'model' from the categorical features. Building a new list instead of
# calling .remove() keeps this cell safe to re-run (no ValueError the second
# time) and does not mutate any other list that shares the same object.
categoric = [name for name in categoric if name != 'model']
categoric

['make',
 'engine_fuel_type',
 'transmission_type',
 'driven_wheels',
 'market_category',
 'vehicle_size',
 'vehicle_style']

# Transformation & Regression

# Note: when computing `rmse`, compare y_valid with the predictions made from X_valid,
# and y_train with the predictions made from X_train

In [37]:
# One {column: value} dict per training row, restricted to the selected
# features; this list is what the DictVectorizer below is fitted on.
train_dict = train[numeric + categoric].to_dict(orient='records')

In [38]:
# Learn the feature mapping from the training records and encode them in a
# single step; sparse=False requests a dense array instead of a sparse matrix.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)

In [39]:
# Ridge regression with default settings, fitted on the log1p-transformed prices.
model = Ridge().fit(X_train, y_tr)
model

Ridge()

In [40]:
# Encode the validation rows with the vectorizer that was fitted on the
# training records (transform only, no re-fitting), so both matrices share
# the same columns.
valid_dict = valid[numeric + categoric].to_dict(orient='records')
X_valid = dv.transform(valid_dict)

In [41]:
y_pred = model.predict(X_valid)
y_pred
# each value is the predicted log1p(msrp) for one validation car

array([10.27674838, 10.8430193 , 10.47009842, ..., 10.65039757,
       10.48229819,  9.69865271])

In [42]:
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Actual target values.
    y_pred : array-like
        Predicted values, one per element of ``y``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_pred - y) ** 2))``.
    """
    # Coerce to arrays so plain Python lists work too and the subtraction is
    # always element-wise by position.
    y = np.asarray(y)
    y_pred = np.asarray(y_pred)

    # Element-wise difference: each prediction minus its actual value.
    error = y_pred - y
    # Mean of the squared errors (the "sum / m" step).
    mse = (error ** 2).mean()
    # Square root brings the error back to the units of y.
    return np.sqrt(mse)

# Validation RMSE, measured on the log1p(msrp) scale the model was trained on.
rmse(y_vl, y_pred)

0.45054941776707147

In [53]:
# A single hand-written car record used for a spot-check prediction.
# Text values follow the normalized form used above (lowercase, underscores).
# don't forget to remove `msrp`: the target must not be part of the input.
# NOTE(review): 'model' is not among the features the vectorizer was fitted
# on, so it is presumably ignored by dv.transform — confirm.
d = [{'make': 'bmw',
 'model': '1_series',
 'year': 2012,
 'engine_fuel_type': 'premium_unleaded_(required)',
 'engine_hp': 300.0,
 'engine_cylinders': 6.0,
 'transmission_type': 'manual',
 'driven_wheels': 'rear_wheel_drive',
 'number_of_doors': 2.0,
 'market_category': 'luxury,high-performance',
 'vehicle_size': 'compact',
 'vehicle_style': 'coupe',
 'highway_mpg': 28,
 'city_mpg': 20,
 'popularity': 3916}]

In [54]:
# Encode the single record with the vectorizer fitted on the training data.
# Despite the name, this is one hand-written example, not the held-out `test` split.
X_test = dv.transform(d)

In [55]:
res = model.predict(X_test)[0]
# The model was trained on log1p(msrp); expm1 converts the prediction back
# to the original price scale.
np.expm1(res)
# 'msrp': 39300  (presumably the actual price of this car — confirm)

40959.22445350503