In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
#To load dataset
df=pd.read_csv("automobile_data.csv")
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


In [3]:
#How many rows and columns
df.shape

(205, 15)

In [4]:
df.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
body-style            object
drive-wheels          object
engine-location       object
width                float64
height               float64
engine-type           object
engine-size            int64
horsepower            object
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

In [5]:
df['normalized-losses'].unique()

array(['?', '164', '158', '192', '188', '121', '98', '81', '118', '148',
       '110', '145', '137', '101', '78', '106', '85', '107', '104', '113',
       '150', '129', '115', '93', '142', '161', '153', '125', '128',
       '122', '103', '168', '108', '194', '231', '119', '154', '74',
       '186', '83', '102', '89', '87', '77', '91', '134', '65', '197',
       '90', '94', '256', '95'], dtype=object)

In [6]:
df['normalized-losses'].value_counts()

?      41
161    11
91      8
150     7
134     6
128     6
104     6
85      5
94      5
65      5
102     5
74      5
168     5
103     5
95      5
106     4
93      4
118     4
148     4
122     4
83      3
125     3
154     3
115     3
137     3
101     3
119     2
87      2
89      2
192     2
197     2
158     2
81      2
188     2
194     2
153     2
129     2
108     2
110     2
164     2
145     2
113     2
256     1
107     1
90      1
231     1
142     1
121     1
78      1
98      1
186     1
77      1
Name: normalized-losses, dtype: int64

In [7]:
#Replace "?" with null values permanently
df['normalized-losses'].replace("?",np.nan,inplace=True)

In [8]:
df.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
body-style            0
drive-wheels          0
engine-location       0
width                 0
height                0
engine-type           0
engine-size           0
horsepower            0
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [9]:
#Replace "?" with null values permanently
df['horsepower'].replace("?",np.nan,inplace=True)

In [10]:
df.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
body-style            0
drive-wheels          0
engine-location       0
width                 0
height                0
engine-type           0
engine-size           0
horsepower            2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [11]:
#to check datatypes
df.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
body-style            object
drive-wheels          object
engine-location       object
width                float64
height               float64
engine-type           object
engine-size            int64
horsepower            object
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

In [12]:
#To change datatypes of normalized-losses column and horsepower  column
#use inbuilt method astype()
df["normalized-losses"]=df["normalized-losses"].astype("float")
df["horsepower"]=df["horsepower"].astype("float")


In [13]:
df.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
body-style            object
drive-wheels          object
engine-location       object
width                float64
height               float64
engine-type           object
engine-size            int64
horsepower           float64
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

In [14]:
df.isnull().sum()

symboling             0
normalized-losses    41
make                  0
fuel-type             0
body-style            0
drive-wheels          0
engine-location       0
width                 0
height                0
engine-type           0
engine-size           0
horsepower            2
city-mpg              0
highway-mpg           0
price                 0
dtype: int64

In [15]:
df.shape  #205/2=102.5

(205, 15)

In [16]:
#fill null values of normalized-losses column with mean/average of normalized-losses column
#permanently
#use inbuilt method fillna()
#find the mean of normalized-losses column
m=df["normalized-losses"].mean()
df["normalized-losses"].fillna(m,inplace=True)

In [17]:
#fill null values of horsepower column with mean/average of horsepower  column
#permanently
#use inbuilt method fillna()
#find the mean of horsepower column
m=df["horsepower"].mean()
df["horsepower"].fillna(m,inplace=True)

In [18]:
#check null values
df.isnull().sum()

symboling            0
normalized-losses    0
make                 0
fuel-type            0
body-style           0
drive-wheels         0
engine-location      0
width                0
height               0
engine-type          0
engine-size          0
horsepower           0
city-mpg             0
highway-mpg          0
price                0
dtype: int64

In [19]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,13495
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,16500
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22,17450


In [20]:
df["fuel-type"].unique()

array(['gas', 'diesel'], dtype=object)

In [21]:
df["fuel-type"].value_counts()

gas       185
diesel     20
Name: fuel-type, dtype: int64

In [22]:
df["make"].value_counts()

toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
jaguar            3
chevrolet         3
alfa-romero       3
renault           2
mercury           1
Name: make, dtype: int64

In [23]:
df.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
body-style            object
drive-wheels          object
engine-location       object
width                float64
height               float64
engine-type           object
engine-size            int64
horsepower           float64
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

In [24]:
#Seaparate Numerical type data and object type data  and store into new dataset
df_num=df.select_dtypes(["int64","float64"])
df_cat=df.select_dtypes(object)

In [25]:
df_num.dtypes

symboling              int64
normalized-losses    float64
width                float64
height               float64
engine-size            int64
horsepower           float64
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

In [26]:
df_cat.dtypes

make               object
fuel-type          object
body-style         object
drive-wheels       object
engine-location    object
engine-type        object
dtype: object

In [27]:
#apply LabelEncoder on df_cat why? converts object type data into number
from sklearn.preprocessing import LabelEncoder
for col in df_cat:
    #print(col)
    #create the object of LabelEncoder class
    le=LabelEncoder()
    df_cat[col]=le.fit_transform(df[col])

In [28]:
df_cat.dtypes

make               int32
fuel-type          int32
body-style         int32
drive-wheels       int32
engine-location    int32
engine-type        int32
dtype: object

In [29]:
df_num.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,64.1,48.8,130,111.0,21,27,13495
1,3,122.0,64.1,48.8,130,111.0,21,27,16500
2,1,122.0,65.5,52.4,152,154.0,19,26,16500
3,2,164.0,66.2,54.3,109,102.0,24,30,13950
4,2,164.0,66.4,54.3,136,115.0,18,22,17450


In [30]:
df_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,0,1,0,2,0,0
1,0,1,0,2,0,0
2,0,1,2,2,0,5
3,1,1,3,1,0,3
4,1,1,3,0,0,3


In [31]:
#To join both dataframe df_num and df_cat  and hold in new dataframe 
df_new=pd.concat([df_num,df_cat],axis=1)
df_new.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,3,122.0,64.1,48.8,130,111.0,21,27,13495,0,1,0,2,0,0
1,3,122.0,64.1,48.8,130,111.0,21,27,16500,0,1,0,2,0,0
2,1,122.0,65.5,52.4,152,154.0,19,26,16500,0,1,2,2,0,5
3,2,164.0,66.2,54.3,109,102.0,24,30,13950,1,1,3,1,0,3
4,2,164.0,66.4,54.3,136,115.0,18,22,17450,1,1,3,0,0,3


In [32]:
df_new.dtypes

symboling              int64
normalized-losses    float64
width                float64
height               float64
engine-size            int64
horsepower           float64
city-mpg               int64
highway-mpg            int64
price                  int64
make                   int32
fuel-type              int32
body-style             int32
drive-wheels           int32
engine-location        int32
engine-type            int32
dtype: object

In [33]:
#price : output variable (contineous numeric value)
#this is regression problem
#select input
X=df_new.drop("price",axis=1) 
#select output
Y=df_new["price"]

In [34]:
#Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3,random_state=1)

In [35]:
#Apply StandardScaler on input X-train,X_test
from sklearn.preprocessing import StandardScaler
#create the object of StandardScaler class
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

In [36]:
#Train the model with 70% data using LinearRegression
from sklearn.linear_model import LinearRegression
#create object of LinearRegression class
lr=LinearRegression()

In [37]:
#train the model
lr.fit(X_train,Y_train)

In [38]:
#check coefficient/slope of each input 
slope=lr.coef_.round(2)
slope

array([   57.82,    44.48,  1700.08,   918.28,  4046.26,  -450.44,
        1968.72, -2781.47, -1253.88,  -179.  ,  -149.86,  1058.52,
        2352.26,   309.65])

In [39]:
X.columns

Index(['symboling', 'normalized-losses', 'width', 'height', 'engine-size',
       'horsepower', 'city-mpg', 'highway-mpg', 'make', 'fuel-type',
       'body-style', 'drive-wheels', 'engine-location', 'engine-type'],
      dtype='object')

In [40]:
dict={'Input':X.columns,'Slope':slope}
df1=pd.DataFrame(dict)
df1

Unnamed: 0,Input,Slope
0,symboling,57.82
1,normalized-losses,44.48
2,width,1700.08
3,height,918.28
4,engine-size,4046.26
5,horsepower,-450.44
6,city-mpg,1968.72
7,highway-mpg,-2781.47
8,make,-1253.88
9,fuel-type,-179.0


In [41]:
#Check intercept 
lr.intercept_

13483.55944055944

In [42]:
#Test the model with 30% data
Y_pred=lr.predict(X_test)

In [43]:
#find mean squared error 
#Evaluation  of model
#find mean squared error , call inbuilt class mean_squared_error
from sklearn.metrics import mean_squared_error
mse=mean_squared_error(Y_test,Y_pred)
print("mean_squared_error : ",mse)

mean_squared_error :  12259459.688784512


In [44]:
#or find r2_score or tesing error
from sklearn.metrics import r2_score
r2_score(Y_test,Y_pred)

0.7965566780397383

In [45]:
#or find the testing score , use inbuilt method score of LinearRegression class
lr.score(X_test,Y_test)

0.7965566780397383

In [46]:
#check model is overfit or underfit ?
#find the training score
lr.score(X_train,Y_train)

0.8504573774895473

In [47]:
#Here training score>testing score
#0.85>0.79  difference =6 means model is overfit
#how to take care of overfitting 
#use Regularisation technique :
#Regularisation means to add external error on training time
'''
There are 2 types of Regularisation :- 
1. L1 means LASSO Regularisation:
RSS error+lambda*sum(abs(all slope))
#here value of lambda 200-1000   200 250 300
2. L2 means RIDGE Regularisation
RSS error+lambda*sum(square(all slope))
#here value of lambda 0-50
'''

'\nThere are 2 types of Regularisation :- \n1. L1 means LASSO Regularisation:\nRSS error+lambda*sum(abs(all slope))\n#here value of lambda 200-1000   200 250 300\n2. L2 means RIDGE Regularisation\nRSS error+lambda*sum(square(all slope))\n#here value of lambda 0-50\n'

In [48]:
#Lasso Regularisation : inbuilt class Lasso which define in outer class linear_model
from sklearn.linear_model import Lasso

In [49]:
#Create object of Lasso class
#l1=Lasso(pass the value of lambda) 200-1000
#how to select right value of lambda or alpha or hyper-parameter  
'''for i in range(200,1001,50): #i=200  i<1001 and i=i+50
    #create the object of Lasso class
    l1=Lasso(i) #RSS+lambda*sum(abs(all slope))
    #train the model with Lasso 
    l1.fit(X_train,Y_train)
    #check testing score 
    score_test=l1.score(X_test,Y_test)
    #check training score
    score_train=l1.score(X_train,Y_train)
    print("Value of lambda : ",i)
    print("Training Score : {} and Testing Score : {}".format(score_train,score_test))'''

'for i in range(200,1001,50): #i=200  i<1001 and i=i+50\n    #create the object of Lasso class\n    l1=Lasso(i) #RSS+lambda*sum(abs(all slope))\n    #train the model with Lasso \n    l1.fit(X_train,Y_train)\n    #check testing score \n    score_test=l1.score(X_test,Y_test)\n    #check training score\n    score_train=l1.score(X_train,Y_train)\n    print("Value of lambda : ",i)\n    print("Training Score : {} and Testing Score : {}".format(score_train,score_test))'

In [50]:
#clear value of lambda =200 in Lasso
l1=Lasso(200) #RSS+lambda*sum(abs(all slope))
#train the model with Lasso 
l1.fit(X_train,Y_train)
#check testing score 
score_test=l1.score(X_test,Y_test)
#check training score
score_train=l1.score(X_train,Y_train)
print("Training Score : {} and Testing Score : {}".format(score_train,score_test))

Training Score : 0.8410262816516455 and Testing Score : 0.8111395722615142


In [51]:
#check slope 
dict={'Input':X.columns,'Slope After Lasso':l1.coef_}
df1=pd.DataFrame(dict)
df1

Unnamed: 0,Input,Slope After Lasso
0,symboling,-0.0
1,normalized-losses,0.0
2,width,1526.47716
3,height,676.382157
4,engine-size,3934.38934
5,horsepower,-0.0
6,city-mpg,-0.0
7,highway-mpg,-740.167611
8,make,-983.399245
9,fuel-type,-206.481439


In [52]:
#Ridge Regularisation
#inbuilt class Ridge 
from sklearn.linear_model import Ridge
'''for i in range(1,51): #start i=1 i<51 i=i+1
    #create the object of Ridge class
    l2=Ridge(i)#RSS+lambda*sum(square(all slope))
    #train the model with Ridge
    l2.fit(X_train,Y_train)
    #check testing score 
    score_test=l2.score(X_test,Y_test)
    #check training score
    score_train=l2.score(X_train,Y_train)
    print("Value of lambda : ",i)
    print("Training Score : {} and Testing Score : {}".format(score_train,score_test))
    '''

'for i in range(1,51): #start i=1 i<51 i=i+1\n    #create the object of Ridge class\n    l2=Ridge(i)#RSS+lambda*sum(square(all slope))\n    #train the model with Ridge\n    l2.fit(X_train,Y_train)\n    #check testing score \n    score_test=l2.score(X_test,Y_test)\n    #check training score\n    score_train=l2.score(X_train,Y_train)\n    print("Value of lambda : ",i)\n    print("Training Score : {} and Testing Score : {}".format(score_train,score_test))\n    '

In [53]:
#create the object of Ridge class
l2=Ridge(5)#RSS+lambda*sum(square(all slope))
#train the model with Ridge
l2.fit(X_train,Y_train)
#check testing score 
score_test=l2.score(X_test,Y_test).round(2)
#check training score
score_train=l2.score(X_train,Y_train).round(2)

print("Training Score : {} and Testing Score : {}".format(score_train,score_test))

Training Score : 0.85 and Testing Score : 0.81


In [54]:
#check slope 
dict={'Input':X.columns,'Slope After Ridge':l2.coef_,'slope After Lasso':l1.coef_,
      'Slope Linear':lr.coef_}
df1=pd.DataFrame(dict)
df1

Unnamed: 0,Input,Slope After Ridge,slope After Lasso,Slope Linear
0,symboling,-44.171206,-0.0,57.818957
1,normalized-losses,31.699889,0.0,44.478852
2,width,1618.173944,1526.47716,1700.081393
3,height,936.967988,676.382157,918.279407
4,engine-size,3696.673717,3934.38934,4046.255188
5,horsepower,-133.517568,-0.0,-450.43882
6,city-mpg,631.124652,-0.0,1968.724789
7,highway-mpg,-1549.113672,-740.167611,-2781.473799
8,make,-1213.138453,-983.399245,-1253.87858
9,fuel-type,-343.592727,-206.481439,-178.999294


In [55]:
#in lasso : score : 0.81
#in ridge : score : 0.81

In [56]:
#cross validation (if testing score of both regularisation are same then use cross validation )
#use inbuilt class cross_val_score used for cross validation 
#cross validation is used to decide Lasso is best or Ridge is best in given dataset 
from sklearn.model_selection import cross_val_score

In [57]:
#create object of Lasso class
l1=Lasso(200) #200 is the value of lambda

In [58]:
#create the object of cross_val_score class  for Lasso
l1_cross=cross_val_score(l1,X,Y,cv=4) #here passing the parameter of object of Lasso class l1,
#input X and output Y and no. of partition cv=4


In [59]:
#check score for each partition 
l1_cross

array([0.76560829, 0.81872367, 0.43344753, 0.448364  ])

In [61]:
#Find the average or mean of each part 
np.mean(l1_cross).round(2)

0.62

In [62]:
#same cross validation for ridge regularisation
#create object of Ridge class
l2=Ridge(5) #5 is the value of lambda

In [63]:
#create the object of cross_val_score class  for Ridge
l2_cross=cross_val_score(l2,X,Y,cv=4) #here passing the parameter of object of Lasso class l1,
#input X and output Y and no. of partition cv=4

In [64]:
#check score for each partition 
l2_cross

array([0.72743414, 0.86422682, 0.3890122 , 0.45959976])

In [65]:
#Find the average or mean of each part 
np.mean(l2_cross).round(2)

0.61

In [None]:
#mean of Lasso=0.62
#and mean of Ridge =0.61 
#here mean of Lasso>Mean of Ridge 
#means Lasso regularisation is better tha Ridge for this dataset
#score =0.81 means 81%