In [26]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings;
# consider narrowing the filter to specific categories.
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
# Load the car data set (cars.csv is read relative to the working directory)
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer="cars.csv")
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [28]:
# The raw CSV marks missing values with a literal "?". Swap those placeholders
# for NaN so the columns can be imputed in the next step.
#
# The result is assigned back to `df` on purpose: calling
# `df[col].replace(..., inplace=True)` operates on an intermediate Series,
# which pandas deprecates and which leaves `df` unchanged when
# Copy-on-Write is enabled.
missing_cols = ["normalized-losses", "horsepower"]
df[missing_cols] = df[missing_cols].replace("?", np.nan)

In [29]:
from sklearn.impute import SimpleImputer

# Columns whose missing entries (NaN) should be filled in.
impute_cols = ["normalized-losses", "horsepower"]

# Mean strategy: each NaN is replaced by the mean of its own column.
si = SimpleImputer(missing_values=np.nan, strategy="mean")

# fit_transform returns a new array instead of modifying `df`, so the result
# is written back into the same columns; they come back as floats.
# NOTE(review): the means are computed on the full data set, before the
# train/test split further down.
df[impute_cols] = si.fit_transform(df[impute_cols])

In [30]:
# Check the imputed columns: entries that used to be "?" now hold the column mean.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,13495
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,16500
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22,17450


In [34]:
# Collect the names of the categorical (object-dtype) columns in `catcol`
# so they can be encoded together in a later step, then preview them.
catcol = df.select_dtypes(include=object).columns
df[catcol].head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,alfa-romero,gas,convertible,rwd,front,dohc
1,alfa-romero,gas,convertible,rwd,front,dohc
2,alfa-romero,gas,hatchback,rwd,front,ohcv
3,audi,gas,sedan,fwd,front,ohc
4,audi,gas,sedan,4wd,front,ohc


In [37]:
from sklearn.preprocessing import OrdinalEncoder

# Why OrdinalEncoder?
#   * One-hot encoding is not used because it would add one column per category.
#   * LabelEncoder is not used because it is meant for the target variable.
# OrdinalEncoder maps the categories of each feature to the codes
# 0 .. n_categories - 1, giving a single numeric column per feature.
# NOTE(review): the codes imply an ordering between categories that a linear
# model will treat as meaningful.
oe = OrdinalEncoder()

# fit_transform has no in-place option, so assign the encoded values back.
encoded = oe.fit_transform(df[catcol])
df[catcol] = encoded
df[catcol].head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,0.0,1.0,0.0,2.0,0.0,0.0
1,0.0,1.0,0.0,2.0,0.0,0.0
2,0.0,1.0,2.0,2.0,0.0,5.0
3,1.0,1.0,3.0,1.0,0.0,3.0
4,1.0,1.0,3.0,0.0,0.0,3.0


In [8]:
# Full frame after encoding: the categorical columns now hold numeric codes.
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,0.0,1.0,0.0,2.0,0.0,64.1,48.8,0.0,130,111.0,21,27,13495
1,3,122.0,0.0,1.0,0.0,2.0,0.0,64.1,48.8,0.0,130,111.0,21,27,16500
2,1,122.0,0.0,1.0,2.0,2.0,0.0,65.5,52.4,5.0,152,154.0,19,26,16500
3,2,164.0,1.0,1.0,3.0,1.0,0.0,66.2,54.3,3.0,109,102.0,24,30,13950
4,2,164.0,1.0,1.0,3.0,0.0,0.0,66.4,54.3,3.0,136,115.0,18,22,17450


In [9]:

# Features are every column except the last one; the target is the last column.
n_features = df.shape[1] - 1
x = df.iloc[:, :n_features]
y = df.iloc[:, n_features]

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [11]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the training split,
# then used to predict prices for the held-out rows.
linreg = LinearRegression().fit(xtrain, ytrain)
ypred = linreg.predict(xtest)

In [12]:
from sklearn.metrics import r2_score

# R^2 on the held-out rows (about 0.80). This is the share of variance in the
# target explained by the model, not a classification accuracy.
print(r2_score(y_true=ytest, y_pred=ypred))

0.796556678039738


In [13]:
# Score (R^2) the baseline on both splits; comparing the two shows whether the
# model is underfitting or overfitting.
train, test = linreg.score(xtrain, ytrain), linreg.score(xtest, ytest)

print(f"Training Results -: {train}")
print(f"Testing Results -: {test}")

Training Results -: 0.8504573774895472
Testing Results -: 0.796556678039738


In [14]:
# High training score -> low training error -> low bias.
# Comparing the testing error with the training error:
# a noticeably lower testing score -> higher testing error -> high variance.
#
# Low bias + high variance => overfitting.

In [15]:
# Fitted coefficients of the baseline model, one per feature column of `x`.
linreg.coef_

array([ 4.51384957e+01,  1.53127607e+00, -2.00099087e+02, -6.22650015e+02,
       -1.70235175e+02,  1.86860719e+03,  1.64133620e+04,  7.89452171e+02,
        3.62663990e+02,  2.83174279e+02,  9.83682875e+01, -1.08169245e+01,
        3.08017854e+02, -4.17024371e+02])

In [16]:
# Regularization: add a penalty on the size of the coefficients to the cost
# function (MSE). Ridge (L2) penalizes the squared coefficients, Lasso (L1)
# their absolute values. Ridge shrinks coefficients towards smaller values,
# while Lasso can shrink them all the way to 0.

from sklearn.linear_model import Ridge, Lasso

# alpha=10 is an arbitrary starting value; choosing a better one is covered
# in the hyperparameter-tuning part of the notebook.
l2 = Ridge(alpha=10).fit(xtrain, ytrain)
ypred = l2.predict(xtest)

In [17]:
# R^2 of the Ridge model on the training and the testing split.
train = l2.score(X=xtrain, y=ytrain)
test = l2.score(X=xtest, y=ytest)

print(f"Training Results -: {train}")
print(f"Testing Results -: {test}")

Training Results -: 0.8109538582620313
Testing Results -: 0.8150222867376528


The training and testing scores are now nearly the same, which indicates a low-bias, low-variance scenario.

In [18]:
# Compare with linreg.coef_: the dominant coefficient shrank sharply
# (1.64e+04 -> 2.57e+03), although some smaller ones grew
# (e.g. 4.51e+01 -> 2.09e+02).
l2.coef_

array([ 2.08658930e+02, -5.60173023e-01, -1.86340249e+02, -9.06610516e+02,
       -6.30655861e+02,  1.56860422e+03,  2.57047785e+03,  3.64420144e+02,
        5.72916414e+02,  5.15948757e+02,  1.04441215e+02,  2.21332730e+01,
        2.11271281e+02, -2.72864381e+02])

In [19]:
# Same experiment with the L1 penalty (Lasso), again starting from alpha=10.
l1 = Lasso(alpha=10).fit(xtrain, ytrain)
ypred = l1.predict(xtest)

In [20]:
# R^2 of the Lasso model (alpha=10) on the training and the testing split.
train, test = l1.score(xtrain, ytrain), l1.score(xtest, ytest)

print(f"Training Results -: {train}")
print(f"Testing Results -: {test}")

Training Results -: 0.850322100477304
Testing Results -: 0.7982450097819802


In [21]:
# With alpha=10 the Lasso coefficients stay close to the unregularized
# linreg.coef_ values; none of them has been driven to zero.
l1.coef_

array([ 4.30275855e+01,  1.54304858e+00, -1.98340491e+02, -5.37967579e+02,
       -1.81289126e+02,  1.83640473e+03,  1.57130274e+04,  7.69644647e+02,
        3.68604731e+02,  2.83851156e+02,  9.92737590e+01, -9.63869876e+00,
        2.96678686e+02, -4.03230505e+02])

Now, alpha is a hyperparameter that the developer can change or adjust. But what is the ideal alpha value? Finding it requires hyperparameter tuning.

# Hyperparameter Tuning

In [22]:
# Sweep alpha and print "alpha  train_score  test_score" for every candidate,
# looking for the point where the training and testing scores come closest.
# The window can be narrowed step by step: try 1-50 first, then 50-100, ...
# and finally 150-200, where the two scores meet.
# NOTE(review): picking alpha from test-set scores leaks test data into model
# selection; cross-validation on the training split would be a safer criterion.

for i in range(150, 200):
    l1 = Lasso(alpha=i).fit(xtrain, ytrain)  # refit for each candidate alpha
    train, test = l1.score(xtrain, ytrain), l1.score(xtest, ytest)
    print(f"{i}  {train}  {test}")

# The two scores are closest around i = 168-169.

150  0.8212288879856349  0.8124969899539802
151  0.8208421468238061  0.8125456274822552
152  0.8204526518423789  0.8125934728624998
153  0.8200607762157636  0.8126404383406126
154  0.8196661304388356  0.8126866131583909
155  0.8192691148773523  0.812731907864298
156  0.8188695301369727  0.8127763675886549
157  0.818467171522371  0.8128200339758355
158  0.818062449862035  0.8128628223230776
159  0.8176549464501084  0.8129048166168775
160  0.8172450881050913  0.8129459336049878
161  0.8168324400154192  0.8129862557319264
162  0.8164174452457786  0.8130257013815376
163  0.8159998820785825  0.813064312010476
164  0.8155795285150105  0.813102124478007
165  0.8151568303042498  0.8131390636923392
166  0.8147313271293926  0.8131752048184052
167  0.8143034914835354  0.8132104730535141
168  0.8138728445987573  0.8132449418027806
169  0.8134398734696936  0.8132785388258221
170  0.8130040832573875  0.8133113351310719
171  0.8125659771958963  0.8133432609587987
172  0.8121250442517014  0.8133743847

In [23]:
# Refit Lasso with alpha=168, the value picked from the alpha sweep.
l1 = Lasso(alpha=168).fit(xtrain, ytrain)
ypred = l1.predict(xtest)

In [24]:
# R^2 of the tuned Lasso model on the training and the testing split.
train = l1.score(X=xtrain, y=ytrain)
test = l1.score(X=xtest, y=ytest)

print(f"Training Results -: {train}")
print(f"Testing Results -: {test}")

Training Results -: 0.8138728445987573
Testing Results -: 0.8132449418027806


You can try the same for Ridge as well. Try to figure out whether the score can be raised from 81% to 85% using Ridge or Lasso.