In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

In [15]:
df = pd.read_csv("cars.csv")

In [16]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
symboling            205 non-null int64
normalized-losses    205 non-null object
make                 205 non-null object
fuel-type            205 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
width                205 non-null float64
height               205 non-null float64
engine-type          205 non-null object
engine-size          205 non-null int64
horsepower           205 non-null object
city-mpg             205 non-null int64
highway-mpg          205 non-null int64
price                205 non-null int64
dtypes: float64(2), int64(5), object(8)
memory usage: 24.1+ KB


In [18]:
df["normalized-losses"].replace("?",np.nan,inplace=True)
df["normalized-losses"] = df["normalized-losses"].astype(float)
losses_mean = df["normalized-losses"].mean()
df["normalized-losses"].fillna(losses_mean,inplace=True)

In [19]:
df["horsepower"].replace("?",np.nan,inplace=True)
df["horsepower"] = df["horsepower"].astype(float)
horsepower_mean = df["horsepower"].mean()
df["horsepower"].fillna(horsepower_mean,inplace=True)

In [20]:
df.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
body-style            object
drive-wheels          object
engine-location       object
width                float64
height               float64
engine-type           object
engine-size            int64
horsepower           float64
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

In [21]:
df_num = df.select_dtypes(["int64","float64"])
df_cat = df.select_dtypes("object")

In [22]:
df_num.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,64.1,48.8,130,111.0,21,27,13495
1,3,122.0,64.1,48.8,130,111.0,21,27,16500
2,1,122.0,65.5,52.4,152,154.0,19,26,16500
3,2,164.0,66.2,54.3,109,102.0,24,30,13950
4,2,164.0,66.4,54.3,136,115.0,18,22,17450


In [23]:
df_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,alfa-romero,gas,convertible,rwd,front,dohc
1,alfa-romero,gas,convertible,rwd,front,dohc
2,alfa-romero,gas,hatchback,rwd,front,ohcv
3,audi,gas,sedan,fwd,front,ohc
4,audi,gas,sedan,4wd,front,ohc


In [24]:
from sklearn.preprocessing import LabelEncoder

In [25]:
for col in df_cat:
    le = LabelEncoder()
    df_cat[col] = le.fit_transform(df_cat[col])

In [26]:
df_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,0,1,0,2,0,0
1,0,1,0,2,0,0
2,0,1,2,2,0,5
3,1,1,3,1,0,3
4,1,1,3,0,0,3


In [27]:
df_new = pd.concat([df_num,df_cat],axis=1)

In [28]:
df_new.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,3,122.0,64.1,48.8,130,111.0,21,27,13495,0,1,0,2,0,0
1,3,122.0,64.1,48.8,130,111.0,21,27,16500,0,1,0,2,0,0
2,1,122.0,65.5,52.4,152,154.0,19,26,16500,0,1,2,2,0,5
3,2,164.0,66.2,54.3,109,102.0,24,30,13950,1,1,3,1,0,3
4,2,164.0,66.4,54.3,136,115.0,18,22,17450,1,1,3,0,0,3


In [29]:
X = df_new.drop("price",axis=1)
y = df["price"]

In [31]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [32]:
lin = LinearRegression()

lin.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [33]:
lin.score(X_train,y_train)

0.8504573774895475

In [34]:
lin.score(X_test,y_test)

0.7965566780397382

In [35]:
lin.coef_

array([ 4.51384957e+01,  1.53127607e+00,  7.89452171e+02,  3.62663990e+02,
        9.83682875e+01, -1.08169245e+01,  3.08017854e+02, -4.17024371e+02,
       -2.00099087e+02, -6.22650015e+02, -1.70235175e+02,  1.86860719e+03,
        1.64133620e+04,  2.83174279e+02])

In [36]:
#overfitting - to add error
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

In [40]:
r = Ridge(10) #hyperparameter value

In [41]:
r.fit(X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

In [42]:
print(r.coef_) # Ridge reduce coef

[ 2.08658930e+02 -5.60173023e-01  3.64420144e+02  5.72916414e+02
  1.04441215e+02  2.21332730e+01  2.11271281e+02 -2.72864381e+02
 -1.86340249e+02 -9.06610516e+02 -6.30655861e+02  1.56860422e+03
  2.57047785e+03  5.15948757e+02]


In [43]:
l = Lasso(1000)

In [44]:
l.fit(X_train, y_train)

Lasso(alpha=1000, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [45]:
print(l.coef_) # Lasso not important value will be 0

[   0.            3.96310671    0.          276.93279232  135.34169218
   16.00129073   -0.         -113.03311963 -126.40695244   -0.
   -0.            0.            0.            0.        ]


In [52]:
for i in range(50):
    r = Ridge(i)
    r.fit(X_train,y_train)
    print(i,": ",r.score(X_test,y_test))

0 :  0.7965566780397381
1 :  0.8074518758147273
2 :  0.8110292248150516
3 :  0.8126933383890033
4 :  0.81361486450293
5 :  0.8141745853539419
6 :  0.8145301242133356
7 :  0.8147582608502814
8 :  0.8149010602831954
9 :  0.8149836949253051
10 :  0.8150222867376526
11 :  0.815027724543179
12 :  0.8150076788279417
13 :  0.8149677381788263
14 :  0.8149120868051174
15 :  0.8148439278252518
16 :  0.8147657584947391
17 :  0.8146795554128126
18 :  0.8145869029046835
19 :  0.8144890843369584
20 :  0.8143871485445552
21 :  0.8142819591129825
22 :  0.814174231578849
23 :  0.8140645619421538
24 :  0.813953448816606
25 :  0.8138413108452037
26 :  0.8137285005403769
27 :  0.8136153153884844
28 :  0.8135020068362638
29 :  0.8133887876197213
30 :  0.8132758377831448
31 :  0.813163309653795
32 :  0.8130513319772296
33 :  0.8129400133729977
34 :  0.8128294452363017
35 :  0.8127197041851856
36 :  0.8126108541327796
37 :  0.812502948048576
38 :  0.8123960294605083
39 :  0.8122901337400301
40 :  0.812185289

In [58]:
for i in range(100, 500, 50):
    l = Lasso(i)
    l.fit(X_train,y_train)
    print(i,"-->",l.score(X_test,y_test))

100 --> 0.8089989416007809
150 --> 0.8124970042062464
200 --> 0.8139201358023782
250 --> 0.812421909078023
300 --> 0.8085057299003375
350 --> 0.8036053753129061
400 --> 0.7977229768452246
450 --> 0.7950465607641612


In [55]:
# Final Models

l1 = Lasso(150)
l2 = Ridge(2)

# Cross Validation

In [59]:
from sklearn.model_selection import cross_val_score

In [61]:
cross_l1 = cross_val_score(l1,X,y,cv=4)

In [62]:
cross_l1 

array([0.74881662, 0.82838485, 0.41889822, 0.46178504])

In [63]:
np.mean(cross_l1)

0.6144711830674416

In [64]:
cross_l2 = cross_val_score(l2,X,y,cv=4)

In [65]:
cross_l2

array([0.71176474, 0.86474228, 0.37640664, 0.47020196])

In [66]:
np.mean(cross_l2)

0.6057789059244746