In [1]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [4]:
# Location of the Auto.csv dataset.
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy of the file.
auto_csv_path = 'E:\\MYLEARN\\2-ANALYTICS-DataScience\\datasets\\Auto.csv'
df = pd.read_csv(auto_csv_path)

In [5]:
# Preview the first five rows to see which columns were loaded.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [6]:
# The car's name is free text and is not used as a predictor, so discard that column.
df = df.drop(columns=['name'])

In [7]:
# The "origin" column records where the car came from. It is stored as integer codes, but it is a
# nominal (unordered) categorical variable: the codes are labels, not quantities. It therefore
# has to be turned into binary dummy variables before it can be used in the regression.
# List the distinct codes present:
df.loc[:, 'origin'].unique()

array([1, 3, 2], dtype=int64)

In [8]:
# Swap the integer origin codes for readable region labels so that the dummy columns created
# from them get meaningful names.
origin_labels = {1: 'america', 2: 'europe', 3: 'asia'}
df['origin'] = df['origin'].replace(origin_labels)

In [9]:
# Check that "origin" now holds region labels and that "name" is gone.
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130,3504,12.0,70,america
1,15.0,8,350.0,165,3693,11.5,70,america
2,18.0,8,318.0,150,3436,11.0,70,america
3,16.0,8,304.0,150,3433,12.0,70,america
4,17.0,8,302.0,140,3449,10.5,70,america


In [10]:
# One-hot encode "origin": the column is replaced by one indicator column per region label.
# NOTE(review): all three indicators are kept, so they always sum to 1 and are collinear with
# the model's intercept. Dropping one of them would avoid that, but every later cell
# (including the hand-built prediction row) would then have to be updated to match.
df = pd.get_dummies(data=df, columns=['origin'])

In [11]:
# Confirm the three origin_* indicator columns were added in place of "origin".
df.head(n=5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin_america,origin_asia,origin_europe
0,18.0,8,307.0,130,3504,12.0,70,1,0,0
1,15.0,8,350.0,165,3693,11.5,70,1,0,0
2,18.0,8,318.0,150,3436,11.0,70,1,0,0
3,16.0,8,304.0,150,3433,12.0,70,1,0,0
4,17.0,8,302.0,140,3449,10.5,70,1,0,0


In [12]:
# Handle missing values.
# The file marks a missing entry with a literal '?'. Turn those markers into NaN and drop the
# rows that contain them.
df = df.replace('?', np.nan).dropna()

# Dropping the '?' rows is not enough on its own: any column that contained a '?' was read
# as text, so its remaining values are still strings such as '130'. Convert every column to a
# proper numeric dtype so the features are real numbers rather than strings. This raises a
# ValueError if a non-numeric value is still present, instead of letting it pass silently.
# (Presumably the affected column is "horsepower" -- confirm against the raw file.)
df = df.apply(pd.to_numeric)

In [13]:
# (rows, columns) remaining after the rows with missing values were dropped.
df.shape

(392, 10)

In [14]:
# Separate the predictors from the target. Both stay pandas DataFrames (not numpy arrays):
# X holds every column except "mpg"; y is a single-column DataFrame containing "mpg".
X = df.drop(columns=['mpg'])

y = df.loc[:, ['mpg']]

In [15]:
# Split X and y into training and test sets: 25% of the rows are held out for testing and the
# remaining 75% are used for fitting. random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [16]:
# Create an ordinary least-squares linear regression model and fit it on the training data only.
linreg = LinearRegression()
linreg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [17]:
# Fitted intercept (the β0 term). It is returned as a one-element array here; y was passed as a
# single-column DataFrame, so there is one intercept for that one target column.
linreg.intercept_ 

array([-19.80918385])

In [17]:
# Fitted slope coefficients: one row for the single target, one value per feature column, in
# the same order as the columns of X_train.
linreg.coef_

array([[-0.24633756,  0.02387034, -0.00601724, -0.00733643,  0.21897778,
         0.78518011, -1.76249341,  0.80962692,  0.95286649]])

In [18]:
# Print each coefficient next to the feature it belongs to. linreg.coef_[0] is the coefficient
# row for the single target, ordered like the columns of X_train.
for feature_name, coefficient in zip(X_train.columns, linreg.coef_[0]):
    print("The coefficient for {} is {}".format(feature_name, coefficient))

The coefficient for cylinders is -0.2463375586996167
The coefficient for displacement is 0.02387033830714957
The coefficient for horsepower is -0.006017238617773325
The coefficient for weight is -0.007336432943899315
The coefficient for acceleration is 0.21897778104124824
The coefficient for year is 0.7851801072779493
The coefficient for origin_america is -1.7624934092199247
The coefficient for origin_asia is 0.8096269190858489
The coefficient for origin_europe is 0.9528664901340749


In [19]:
# So we can write our linear model as:

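# Y = −19.81 − 0.25×X1 + 0.02×X2 − 0.01×X3 − 0.01×X4 + 0.22×X5 + 0.79×X6 − 1.76×X7 + 0.81×X8 + 0.95×X9
# where X1..X9 are cylinders, displacement, horsepower, weight, acceleration, year,
# origin_america, origin_asia and origin_europe, in that order.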

# Note that, because we’ve not done any feature scaling or dimensionality reduction, 
# we can’t say anything about the relative importance of each of our features given these 
# coefficients because the features are not of the same scale.

In [20]:
# Scoring the model
# A common way of measuring how well a regression model fits is the R^2 statistic
# (the coefficient of determination), defined as:
#
#     R^2 = 1 - RSS / TSS
#
# RSS (residual sum of squares) measures the variability left unexplained after performing
# the regression.
# TSS (total sum of squares) measures the total variability in Y.
# R^2 is therefore the proportion of the variability in Y that is explained by X using our model.
# Here it is evaluated on the held-out test set.
linreg.score(X=X_test, y=y_test)

0.82852313164597735

In [21]:
# So on the held-out test set, 82.85% of the variability in Y (mpg) is explained by our model using X

In [22]:
# Predict mpg for the held-out test rows.
y_predict = linreg.predict(X_test)

# mean_squared_error is documented as (y_true, y_pred), so pass the ground truth first.
# MSE is symmetric in its two arguments, so the value is unchanged; the conventional order
# keeps this call correct if it is later swapped for a metric where the order matters.
linreg_mse = mean_squared_error(y_test, y_predict)

linreg_mse

12.230963834602667

In [23]:
# Root-mean-squared error: the square root of the MSE, expressed in the same units as mpg.
linreg_rmse = math.sqrt(linreg_mse)
linreg_rmse

3.4972794904900963

In [24]:
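# So the root-mean-squared error (RMSE) on our test set is about 3.50 mpg: a typical prediction
# is off by roughly that much. (RMSE weights large errors more heavily, so it is not the same
# as the plain average absolute error.)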

In [24]:
# Making predictions
# We can use our model to predict the miles per gallon of another, unseen car:
#   Cylinders – 4, Displacement – 121, Horsepower – 110, Weight – 2800,
#   Acceleration – 15.4, Year – 81, Origin – Asia
#
# Origin is one-hot encoded, so "Asia" becomes
#   origin_america = 0, origin_asia = 1, origin_europe = 0
#
# The row is built with explicit column names rather than a bare positional list, so each value
# is tied to the feature it belongs to.
new_car = pd.DataFrame([{
    'cylinders': 4,
    'displacement': 121,
    'horsepower': 110,
    'weight': 2800,
    'acceleration': 15.4,
    'year': 81,
    'origin_america': 0,
    'origin_asia': 1,
    'origin_europe': 0,
}])

# Put the columns in exactly the order the model was fitted with. A missing or misspelled
# feature raises a KeyError here instead of silently producing a wrong prediction.
new_car = new_car[list(X_train.columns)]

linreg.predict(new_car)

array([[ 28.6713418]])