In [2]:
#importing all the important libraries
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [5]:
#checking details of dataset
df = pd.read_csv("Car-mpg- Dataset.csv")

In [6]:
df.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type,car_name
0,18.0,8,307.0,130,3504,12.0,70,1,0,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,0,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,0,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,0,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,0,ford torino


In [7]:
df.columns

Index(['mpg', 'cyl', 'disp', 'hp', 'wt', 'acc', 'yr', 'origin', 'car_type',
       'car_name'],
      dtype='object')

In [8]:
df.shape

(398, 10)

In [9]:
#removing irrelevant column
df = df.drop("car_name",axis =1)

In [10]:
df.head()

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,1,0
1,15.0,8,350.0,165,3693,11.5,70,1,0
2,18.0,8,318.0,150,3436,11.0,70,1,0
3,16.0,8,304.0,150,3433,12.0,70,1,0
4,17.0,8,302.0,140,3449,10.5,70,1,0


In [11]:
#providing classifications for origin
df['origin'] = df["origin"].replace({1: 'America' ,2: 'Europe',3:'Asia'})

In [10]:
df.head(20)

Unnamed: 0,mpg,cyl,disp,hp,wt,acc,yr,origin,car_type
0,18.0,8,307.0,130,3504,12.0,70,America,0
1,15.0,8,350.0,165,3693,11.5,70,America,0
2,18.0,8,318.0,150,3436,11.0,70,America,0
3,16.0,8,304.0,150,3433,12.0,70,America,0
4,17.0,8,302.0,140,3449,10.5,70,America,0
5,15.0,8,429.0,198,4341,10.0,70,America,0
6,14.0,8,454.0,220,4354,9.0,70,America,0
7,14.0,8,440.0,215,4312,8.5,70,America,0
8,14.0,8,455.0,225,4425,10.0,70,America,0
9,15.0,8,390.0,190,3850,8.5,70,America,0


In [11]:
#creating dummy variables for the origin column expanding categorical data into binary form.
df = pd.get_dummies(df,columns = ['origin'])

In [13]:
#understanding the dataset
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
mpg,398.0,23.514573,7.815984,9.0,17.5,23.0,29.0,46.6
cyl,398.0,5.454774,1.701004,3.0,4.0,4.0,8.0,8.0
disp,398.0,193.425879,104.269838,68.0,104.25,148.5,262.0,455.0
wt,398.0,2970.424623,846.841774,1613.0,2223.75,2803.5,3608.0,5140.0
acc,398.0,15.56809,2.757689,8.0,13.825,15.5,17.175,24.8
yr,398.0,76.01005,3.697627,70.0,73.0,76.0,79.0,82.0
car_type,398.0,0.530151,0.499718,0.0,0.0,1.0,1.0,1.0
origin_America,398.0,0.625628,0.484569,0.0,0.0,1.0,1.0,1.0
origin_Asia,398.0,0.198492,0.399367,0.0,0.0,0.0,0.0,1.0
origin_Europe,398.0,0.175879,0.381197,0.0,0.0,0.0,0.0,1.0


In [15]:
# creating a DataFrame 't' with boolean values indicating whether each entry in the 'hp' column is a digit
t = pd.DataFrame(df.hp.str.isdigit())

In [16]:
type(t)

pandas.core.frame.DataFrame

In [17]:
t

Unnamed: 0,hp
0,True
1,True
2,True
3,True
4,True
...,...
393,True
394,True
395,True
396,True


In [18]:
t[t['hp']==False]

Unnamed: 0,hp
32,False
126,False
330,False
336,False
354,False
374,False


In [19]:
df['hp'][126]

'?'

In [20]:
df = df.replace('?',np.nan)

In [21]:
df['hp'][126]

nan

In [22]:
df.describe()

Unnamed: 0,mpg,cyl,disp,wt,acc,yr,car_type,origin_America,origin_Asia,origin_Europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.530151,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.499718,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0,1.0,0.0,0.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0,1.0,1.0,1.0


In [23]:
#  filling missing values in each column of DataFrame 'df' with the median value of that column
df = df.apply(lambda x: x.fillna(x.median()), axis = 0)

In [24]:
df.describe()

Unnamed: 0,mpg,cyl,disp,wt,acc,yr,car_type,origin_America,origin_Asia,origin_Europe
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,23.514573,5.454774,193.425879,2970.424623,15.56809,76.01005,0.530151,0.625628,0.198492,0.175879
std,7.815984,1.701004,104.269838,846.841774,2.757689,3.697627,0.499718,0.484569,0.399367,0.381197
min,9.0,3.0,68.0,1613.0,8.0,70.0,0.0,0.0,0.0,0.0
25%,17.5,4.0,104.25,2223.75,13.825,73.0,0.0,0.0,0.0,0.0
50%,23.0,4.0,148.5,2803.5,15.5,76.0,1.0,1.0,0.0,0.0
75%,29.0,8.0,262.0,3608.0,17.175,79.0,1.0,1.0,0.0,0.0
max,46.6,8.0,455.0,5140.0,24.8,82.0,1.0,1.0,1.0,1.0


In [25]:
df['hp'] = df['hp'].astype('float64')

In [26]:
df.dtypes

mpg               float64
cyl                 int64
disp              float64
hp                float64
wt                  int64
acc               float64
yr                  int64
car_type            int64
origin_America      uint8
origin_Asia         uint8
origin_Europe       uint8
dtype: object

In [27]:
#retrieving a subset of the DataFrame
df.iloc[0:4,0:5]

Unnamed: 0,mpg,cyl,disp,hp,wt
0,18.0,8,307.0,130.0,3504
1,15.0,8,350.0,165.0,3693
2,18.0,8,318.0,150.0,3436
3,16.0,8,304.0,150.0,3433


In [28]:
x = df.drop('mpg',axis = 1)

In [30]:
y = df['mpg']

In [31]:
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
393    27.0
394    44.0
395    32.0
396    28.0
397    31.0
Name: mpg, Length: 398, dtype: float64

In [32]:
x.shape

(398, 10)

In [33]:
y.shape

(398,)

In [34]:
# splitting data into training and testing sets
from sklearn.model_selection import train_test_split

In [35]:
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size = 0.30,random_state=1)

In [36]:
x_train.shape

(278, 10)

In [37]:
y_train.shape


(278,)

In [38]:
#creating the model
model = LinearRegression()

In [39]:
# fitting the Linear Regression model to the training data 'x_train' and corresponding target values 'y_train
model.fit(x_train,y_train)

LinearRegression()

In [40]:
model.coef_

array([ 1.47507329,  0.02434932, -0.04686191, -0.00656416,  0.04170228,
        0.79389755,  5.96659744, -1.42434811,  0.65211192,  0.77223618])

In [77]:
#printing the coefficients
for i,j in enumerate(x_train.columns):
    print(" the coeff for {} is {}".format(j,model.coef_[i]))

 the coeff for cyl is 1.4750732919168075
 the coeff for disp is 0.02434932200243154
 the coeff for hp is -0.04686191039325287
 the coeff for wt is -0.006564163296278441
 the coeff for acc is 0.041702279188100454
 the coeff for yr is 0.7938975478842005
 the coeff for car_type is 5.966597439861038
 the coeff for origin_America is -1.424348109547227
 the coeff for origin_Asia is 0.6521119249598692
 the coeff for origin_Europe is 0.7722361845873545


In [78]:
intercept = model.intercept_

In [79]:
intercept

-28.23539556849744

In [82]:
print(f"the intercept of the model is {intercept}")

the intercept of the model is -28.23539556849744


In [83]:
#accuracy measures
model.score(x_test,y_test)

0.8513421387780062

In [85]:
#polynomial regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

In [86]:
pol = PolynomialFeatures(interaction_only=True)

In [87]:
x_train_ = pol.fit_transform(x_train)
x_test_ = pol.fit_transform(x_test)

In [88]:
x_train_

array([[  1.,   4., 105., ...,   0.,   0.,   0.],
       [  1.,   4.,  97., ...,   0.,   0.,   0.],
       [  1.,   4., 121., ...,   0.,   0.,   0.],
       ...,
       [  1.,   8., 304., ...,   0.,   0.,   0.],
       [  1.,   4.,  97., ...,   0.,   0.,   0.],
       [  1.,   6., 232., ...,   0.,   0.,   0.]])

In [89]:
poly_model = LinearRegression()

In [91]:
poly_model.fit(x_train_,y_train)

LinearRegression()

In [92]:
poly_model.coef_

array([-1.14820098e+08,  1.27399384e+01, -5.07940971e-01,  8.46720395e-01,
       -1.26873791e-02, -5.25653126e+00,  6.86987498e-01, -3.01150272e+01,
        2.73967338e+01, -4.54454577e+01,  1.80487239e+01, -5.65962344e-03,
       -2.09120490e-02,  6.50459389e-04,  5.87347585e-01, -3.46519493e-01,
        9.39381876e+00,  4.94958311e+00,  1.13552536e+01, -3.56489828e+00,
        1.63982036e-04, -8.11007085e-07, -8.59463627e-04,  8.38496301e-03,
       -1.57510220e-02, -8.14847018e-02, -2.71376984e-01, -1.55079285e-01,
       -2.67181156e-06, -5.33340643e-03, -1.28855210e-02, -2.56144215e-02,
        2.40951447e-01,  3.16482050e-01,  2.89286898e-01, -1.87092810e-04,
        2.56390099e-04, -9.15736002e-03, -8.59212729e-03, -1.29333224e-03,
       -2.80191858e-03,  5.25392758e-02,  1.27217843e+00, -1.96885285e+00,
       -2.24083631e+00, -1.04684210e+00,  1.58054542e-01, -8.61101903e-02,
        4.41725690e-01,  3.31371998e-01, -8.95661174e+00, -2.19802976e-01,
       -2.09386125e+01,  

In [93]:
poly_model.score(x_test_,y_test)

0.8626121619724343