In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('/content/cars.csv')
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   body-style         205 non-null    object 
 5   drive-wheels       205 non-null    object 
 6   engine-location    205 non-null    object 
 7   width              205 non-null    float64
 8   height             205 non-null    float64
 9   engine-type        205 non-null    object 
 10  engine-size        205 non-null    int64  
 11  horsepower         205 non-null    object 
 12  city-mpg           205 non-null    int64  
 13  highway-mpg        205 non-null    int64  
 14  price              205 non-null    int64  
dtypes: float64(2), int64(5), object(8)
memory usage: 24.1+ KB


In [6]:
df.replace('?', np.nan, inplace =True)

In [7]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [8]:
df.isna().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
body-style            0
drive-wheels          0
engine-location       0
width                 0
height                0
engine-type           0
engine-size           0
horsepower            2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [9]:
#imputeing nan with mean

from sklearn.impute import SimpleImputer
si = SimpleImputer()

df.loc[:,['normalized-losses', 'horsepower']] = si.fit_transform(df.loc[:,['normalized-losses', 'horsepower']])
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,13495
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,16500
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22,17450


In [10]:
#Applying Ordinal encoding on all the categorical columns

from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()


In [11]:
#Selecting Categorical Columns

cat_cols = df.select_dtypes(object).columns
cat_cols

Index(['make', 'fuel-type', 'body-style', 'drive-wheels', 'engine-location',
       'engine-type'],
      dtype='object')

In [12]:
df[cat_cols] = oe.fit_transform (df[cat_cols])
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,0.0,1.0,0.0,2.0,0.0,64.1,48.8,0.0,130,111.0,21,27,13495
1,3,122.0,0.0,1.0,0.0,2.0,0.0,64.1,48.8,0.0,130,111.0,21,27,16500
2,1,122.0,0.0,1.0,2.0,2.0,0.0,65.5,52.4,5.0,152,154.0,19,26,16500
3,2,164.0,1.0,1.0,3.0,1.0,0.0,66.2,54.3,3.0,109,102.0,24,30,13950
4,2,164.0,1.0,1.0,3.0,0.0,0.0,66.4,54.3,3.0,136,115.0,18,22,17450


**Feature Scaling**

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge

In [14]:
X = df.iloc[:,:-1]
Y = df.iloc[:,-1]

In [16]:
sc = StandardScaler()
X = sc.fit_transform(X)

In [17]:
xtrain,xtest,ytrain,ytest = train_test_split(X,Y,test_size =0.2,random_state=1)

**Linear Regression**

In [19]:
lr = LinearRegression()
lr.fit(xtrain,ytrain)
ypred = lr.predict(xtest)

print (f"Training accuracy :", lr.score(xtrain,ytrain))
print (f"Testing accuracy :", lr.score(xtest,ytest))

Training accuracy : 0.8539412613914049
Testing accuracy : 0.7440263258545714


In [20]:
lr.coef_

array([  100.70152137,  -157.4653771 , -1212.01831145,  -117.77994561,
        -136.52459077,  1128.78608008,  1941.04219971,  1738.41814208,
         698.58953363,   320.35950293,  4041.59046649,  -492.16608863,
        1553.21174218, -2450.66724917])

**Ridge Regression**

In [24]:
rg = Ridge()
rg.fit(xtrain,ytrain)
rgpred = rg.predict(xtest)

print (f"Training accuracy :", rg.score(xtrain,ytrain))
print (f"Testing accuracy :", rg.score(xtest,ytest))

Training accuracy : 0.853793795232896
Testing accuracy : 0.7468034822961305


In [25]:
rg.coef_

array([   75.92706967,  -158.17628146, -1206.42216987,  -162.04093081,
        -156.48856711,  1121.67903706,  1905.61194134,  1719.20463905,
         713.53938244,   328.91482672,  3990.47650541,  -442.90460517,
        1164.75979456, -2082.05211597])

**Lasso Regression**

In [26]:
ls = Lasso()
ls.fit(xtrain,ytrain)
lspred = ls.predict(xtest)

print (f"Training accuracy :", ls.score(xtrain,ytrain))
print (f"Testing accuracy :", ls.score(xtest,ytest))

Training accuracy : 0.8539397391560521
Testing accuracy : 0.7443240787367613


In [27]:
ls.coef_

array([   96.09444864,  -155.11949836, -1210.42544506,  -121.19910051,
        -136.9606948 ,  1125.95047422,  1938.2903356 ,  1735.22990129,
         698.10371721,   319.73213183,  4041.72530326,  -489.85761144,
        1512.68920683, -2412.56609442])

**Selection of alpha**

In [43]:
for i in range(2500,2550):
  rg = Ridge(alpha =i)
  rg.fit(xtrain,ytrain)
  y = rgpred = rg.predict(xtest)
  print(f"{i} {rg.score(xtrain,ytrain)} {rg.score(xtest,ytest)}")

2500 0.30150050236557635 0.2687639890612853
2501 0.3014181734623109 0.26868236580036575
2502 0.3013358888549096 0.26860078612983773
2503 0.3012536485082925 0.2685192500155653
2504 0.30117145238741627 0.2684377574234468
2505 0.30108930045727345 0.2683563083194157
2506 0.30100719268289244 0.26827490266943954
2507 0.30092512902933777 0.2681935404395205
2508 0.3008431094617101 0.2681122215956946
2509 0.3007611339451456 0.2680309461040328
2510 0.30067920244481683 0.26794971393064027
2511 0.3005973149259318 0.26786852504165615
2512 0.30051547135373435 0.26778737940325426
2513 0.3004336716935041 0.2677062769816423
2514 0.30035191591055654 0.2676252177430619
2515 0.3002702039702423 0.2675442016537892
2516 0.30018853583794813 0.26746322868013417
2517 0.3001069114790962 0.267382298788441
2518 0.30002533085914385 0.2673014119450875
2519 0.2999437939435844 0.26722056811648576
2520 0.29986230069794617 0.2671397672690814
2521 0.2997808510877932 0.26705900936935434
2522 0.2996994450787246 0.266978294

In [44]:
for i in range(1500,1600):
  ls = Lasso(alpha =i)
  ls.fit(xtrain,ytrain)
  y = rgpred = ls.predict(xtest)
  print(f"{i} {ls.score(xtrain,ytrain)} {ls.score(xtest,ytest)}")

1500 0.7472950532074247 0.6828774456383304
1501 0.7471946197757003 0.6828079737359083
1502 0.7470941194145466 0.6827384503667657
1503 0.7469935521239637 0.682668875530903
1504 0.7468929179039512 0.6825992492283199
1505 0.7467922167545095 0.6825295714590164
1506 0.7466914486756384 0.6824598422229925
1507 0.7465906136673381 0.6823900615202483
1508 0.7464897117296085 0.6823202293507837
1509 0.7463887428624496 0.6822503457145987
1510 0.7462877070658612 0.6821804106116934
1511 0.7461866043398437 0.6821104240420678
1512 0.7460854346843968 0.6820403860057218
1513 0.7459841980995205 0.6819702965026553
1514 0.745882894585215 0.6819001555328685
1515 0.7457815241414802 0.6818299630963616
1516 0.745680086768316 0.681759719193134
1517 0.7455785824657225 0.6816894238231862
1518 0.7454770112336996 0.6816190769865181
1519 0.7453753730722475 0.6815486786831295
1520 0.7452736679813661 0.6814782289130206
1521 0.7451718959610553 0.6814077276761913
1522 0.7450700570113152 0.6813371749726416
1523 0.74496815