In [1]:
import pandas as pd
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
import numpy as np
import numpy.random as nr
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import math

%matplotlib inline

In [2]:
auto_prices = pd.read_csv('Auto_Data_Preped.csv')
auto_prices.columns

Index(['symboling', 'make', 'fuel_type', 'aspiration', 'num_of_doors',
       'body_style', 'drive_wheels', 'engine_location', 'wheel_base', 'length',
       'width', 'height', 'curb_weight', 'engine_type', 'num_of_cylinders',
       'engine_size', 'fuel_system', 'bore', 'stroke', 'compression_ratio',
       'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg', 'price',
       'log_price'],
      dtype='object')

In [3]:
auto_prices.head()

Unnamed: 0,symboling,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price,log_price
0,3,alfa-romero,gas,std,two,hardtop_convert,rwd,front,88.6,168.8,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495,9.510075
1,3,alfa-romero,gas,std,two,hardtop_convert,rwd,front,88.6,168.8,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500,9.711116
2,1,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500,9.711116
3,2,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,mpfi,3.19,3.4,10.0,102,5500,24,30,13950,9.543235
4,2,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,mpfi,3.19,3.4,8.0,115,5500,18,22,17450,9.767095


# Prepare the model matrix.
1. All scikit-learn models require a numpy array of numeric only values for the features. 
   The resulting array is often referred to as the model matrix. 

2. To create a model matrix from cases with both numeric and categorical variables requires two steps. First, the numeric   
   features must be rescaled. Second, the categorical variables must be converted to a set of dummy variables to encode the 
   presence or not of each category. 

3. Create dummy variables from categorical features
   Now, you must create dummy variables for the categorical features. Dummy variables encode categorical features as a set of  
   binary variables. There is one dummy variable for each possible category. For each case all of the values in the dummy 
   variables are set to zero, except the one corresponding to the category value, which is set to one. In this way, a 
   categorical variable with any number of categories can be encoded as series of numeric features which scikit-learn can 
   operate on. This process is referred to as one hot encoding since only one dummy variable is coded as 1 (hot) per case. 

4. The sklearn.preprocessing package contains functions to encode categorical features as dummy variables in two steps;
   The categories are encoded as numbers starting with 0. For example, if there are 5 categories, they are encoded as the set 
   {0,1,2,3,4}
   {0,1,2,3,4}
.  The numeric categories are then encoded as dummy variables. 
   The following example will give you a feel for how this process works. The code in the cell below computes the numeric  
   representation of the categories for the body_style feature by the following steps:
   An encoder object is created using the LabelEncoder method.
   The encoder is fit to the unique string values of the feature. 
   The transformation method then applies the numeric encoding to the original feature. 
   Execute the code in the cell below and examine the result. 





In [4]:
print(auto_prices['body_style'].unique())
Features = auto_prices['body_style']
enc      = preprocessing.LabelEncoder()
enc.fit(Features)
Features = enc.transform(Features)
print(Features)


['hardtop_convert' 'hatchback' 'sedan' 'wagon']
[0 0 1 2 2 2 2 3 2 2 2 2 2 2 2 2 2 1 1 2 1 1 1 1 2 2 2 3 1 1 1 1 1 1 2 3 1
 1 2 2 2 2 2 1 2 2 2 1 1 1 2 2 1 2 1 2 2 1 2 2 2 3 0 2 2 0 2 0 1 1 1 1 1 1
 1 1 1 1 2 2 2 2 2 2 2 2 3 2 1 2 3 0 1 2 2 3 2 1 1 1 2 2 3 3 2 2 3 3 2 2 2
 1 1 1 2 2 3 1 1 0 0 0 1 2 1 2 1 2 1 1 1 2 2 2 2 2 3 3 3 3 1 1 1 3 3 3 2 1
 2 1 2 1 2 2 1 2 1 0 0 1 0 1 0 2 2 1 2 1 1 1 2 3 2 2 2 2 2 2 2 0 1 2 2 3 2
 3 2 3 2 3 2 2 2 2 2]


Notice that this five original body style categories of this feature is now coded as integers in the set 
{0,1,2,3,4}
{0,1,2,3,4}
.
For the next step in the process, the numerically coded categorical variable is converted to a set of dummy variables following these steps:

A one hot encoder object is created using the OneHotEncoder method from the sklearn.preprocessing module.
The numerically coded categorical feature is fit with the one hot encoder. 
The dummy variables are encoded using the transform method on the encodings.
Execute the code in the cell below and examine the result. 


In [5]:
ohe = preprocessing.OneHotEncoder()
encoded = ohe.fit(Features.reshape(-1,1))#feature.reshape is for changing rows to cols
Features = encoded.transform(Features.reshape(-1,1)).toarray()
Features[:10,:]# To check Later

array([[ 1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  1.],
       [ 0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.]])

Notice that the body_style feature has been encoded as five columns.
Each of these columns is a dummy variable representing one category. 
Each row has one and only one dummy variable with a 1, and the rest 0s. 
This is the one hot encoding.

Now, you need to one hot encode all five categorical variables and append them as columns to the model matrix with the scaled numeric variables. The code in the cell below executes a for loop that calls the encode_string function and uses the numpy concatenate function to add the dummy variables to the model matrix. The encode_string function uses the same process discussed above. 
Execute this code, verify the result, and answer Question 1 on the course page.

In [6]:
def encode_string(cat_feature):
    ## First encode the strings to numeric categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_feature)
    enc_cat_feature = enc.transform(cat_feature)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_feature.reshape(-1,1))
    return encoded.transform(enc_cat_feature.reshape(-1,1)).toarray()
    

categorical_columns = ['fuel_type', 'aspiration', 'drive_wheels', 'num_of_cylinders']

for col in categorical_columns:
    temp = encode_string(auto_prices[col])
    Features = np.concatenate([Features, temp], axis = 1)

print(Features.shape)
print(Features[:2, :])       

(195, 14)
[[ 1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  0.  0.  1.]
 [ 1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  0.  0.  1.]]


In [14]:
Features = np.concatenate([Features,np.array(auto_prices[['curb_weight','horsepower','city_mpg']])],axis = 1)
Features[:2,:]
print(Features.shape[0])

195


In [15]:
nr.seed(9988)
labels = np.array(auto_prices['log_price'])
indx = range(Features.shape[0])
indx = ms.train_test_split(indx,test_size =40)

x_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

Rescale numeric features
Numeric features must be rescaled so they have a similar range of values. Rescaling prevents features from having an undue influence on model training simply because then have a larger range of numeric variables. 
The code in the cell below uses the StandardScaler function from the Scikit Learn preprocessing package to Zscore scale the numeric features. Notice that the scaler is fit only on the training data. The trained scaler is these applied to the test data. Test data should always be scaled using the parameters from the training data. 
Execute this code.

In [16]:
scaler = preprocessing.StandardScaler().fit(x_train[:,14:])
x_train[:,14:] = scaler.transform(x_train[:,14:])
x_test[:,14:] = scaler.transform(x_test[:,14:])
print(x_train.shape)
x_train[:5,:]


(155, 35)


array([[ 0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  1.        , -0.5384069 ,
        -1.26225437,  1.33602998, -0.5384069 , -1.26225437,  1.33602998,
        -0.5384069 , -1.26225437,  1.33602998, -0.5384069 , -1.26225437,
         1.33602998, -0.5384069 , -1.26225437,  1.33602998, -0.5384069 ,
        -1.26225437,  1.33602998, -0.5384069 , -1.26225437,  1.33602998],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  1.        ,  0.96837381,
         1.51064566, -1.00126852,  0.96837381,  1.51064566, -1.00126852,
         0.96837381,  1.51064566, -1.00126852,  0.96837381,  1.51064566,
        -1.00126852,  0.96837381,  1.51064566, -1.00126852,  0.96837381,
         1.51064566, -1.00126852,  0.96837381,  1.

In [17]:
lin_mod = linear_model.LinearRegression(fit_intercept = False)
lin_mod.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [18]:
print(lin_mod.intercept_)
print(lin_mod.coef_)

0.0
[ 1.36372345  1.15436725  1.27397827  1.16791069  2.57778903  2.38219063
  2.48184692  2.47813274  1.62299441  1.62646566  1.71051959  1.70397511
  1.71596353  1.54004103  0.02572832  0.02021161 -0.01042417  0.02572832
  0.02021161 -0.01042417  0.02572832  0.02021161 -0.01042417  0.02572832
  0.02021161 -0.01042417  0.02572832  0.02021161 -0.01042417  0.02572832
  0.02021161 -0.01042417  0.02572832  0.02021161 -0.01042417]


In [19]:
def print_metrics(y_true, y_predicted, n_parameters):
    ## First compute R^2 and the adjusted R^2
    r2 = sklm.r2_score(y_true, y_predicted)
    r2_adj = r2 - (n_parameters - 1)/(y_true.shape[0] - n_parameters) * (1 - r2)
    
    ## Print the usual metrics and the R^2 values
    print('Mean Square Error      = ' + str(sklm.mean_squared_error(y_true, y_predicted)))
    print('Root Mean Square Error = ' + str(math.sqrt(sklm.mean_squared_error(y_true, y_predicted))))
    print('Mean Absolute Error    = ' + str(sklm.mean_absolute_error(y_true, y_predicted)))
    print('Median Absolute Error  = ' + str(sklm.median_absolute_error(y_true, y_predicted)))
    print('R^2                    = ' + str(r2))
    print('Adjusted R^2           = ' + str(r2_adj))
   
y_score = lin_mod.predict(x_test) 
print_metrics(y_test, y_score, 28)    

Mean Square Error      = 0.0226162077444
Root Mean Square Error = 0.1503868602784332
Mean Absolute Error    = 0.119126784368
Median Absolute Error  = 0.106964494329
R^2                    = 0.921638632103
Adjusted R^2           = 0.745325554333
