In [1]:
import numpy as np
import pandas as pd
from math import *
import numpy.random as nr
import seaborn as sns
import matplotlib.pyplot as plt

import scipy.stats as ss
from sklearn import preprocessing
from sklearn import linear_model as lm
import sklearn .model_selection as ms
import sklearn.metrics as sklm

%matplotlib inline

In [2]:
aw_data = pd.read_csv('AW_data_PrePad.csv')
aw_data.columns

Index(['CountryRegionName', 'Education', 'Occupation', 'Gender',
       'MaritalStatus', 'NumberCarsOwned', 'NumberChildrenAtHome',
       'TotalChildren', 'YearlyIncome', 'AveMonthSpend', 'BikeBuyer', 'Age'],
      dtype='object')

In [3]:
cat_cols = ['Education', 'Occupation', 'Gender', 'MaritalStatus', 'NumberCarsOwned', 
               'NumberChildrenAtHome', 'TotalChildren']
num_cols = ['YearlyIncome', 'Age']


In [4]:
def encode_string(col):
    enc = preprocessing.LabelEncoder()
    enc.fit(col)
    enc_col = enc.transform(col)
    
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_col.reshape(-1,1))
    return encoded.transform(enc_col.reshape(-1,1)).toarray()

Features = encode_string(aw_data['CountryRegionName'])

for col in cat_cols:
    temp = encode_string(aw_data[col])
    Features = np.concatenate([Features, temp], axis = 1)
    
print(Features.shape)
print(Features[:2, :])

(16404, 37)
[[1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 1. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0.]]


In [5]:
Features = np.concatenate([Features, np.array(aw_data[['YearlyIncome', 'Age']])], axis = 1)
    
print(Features.shape)
print(Features[:2, :])

(16404, 39)
[[1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.37947e+05 5.50000e+01]
 [1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  0.00000e+00 1.01141e+05 5.50000e+01]]


In [6]:
nr.seed(9988)
labels = np.array(aw_data['AveMonthSpend'])
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 4000)
x_train = Features[indx[0], :]
y_train = np.ravel(labels[indx[0]])
x_test = Features[indx[1], :]
y_test = np.ravel(labels[indx[1]])

In [7]:
def scale(columns):
    for i in range(columns.shape[1]):
        col = columns[:, i]
        m = col.mean()
        s = col.std()
        for j in range(col.shape[0]):
            columns[j, i] = (columns[j, i]-m)/s
            
scale(x_train[:, 37:])
scale(x_test[:, 37:])
x_train[:, 37:]

array([[-0.63575261,  0.91036442],
       [ 0.34830527, -0.24705708],
       [ 1.70677622,  1.80068865],
       ...,
       [-0.09488062,  0.02004019],
       [ 1.13329187,  1.17746169],
       [-0.74224606, -0.3360895 ]])

In [8]:
#so now our features are ready time to find out the coefficients

#we will have to declare a theta first which will have 39 rows

#we will have something called the cost function which we have to reduce
def cost(theta, x_train, y_train):
    theta_transpose = np.transpose(theta)
    m = len(y_train)
    J = np.ones(x_train.shape[1])
    J_theta = 0
    for i in range(x_train.shape[1]):
        J_theta += (((np.dot(x_train[i, :], theta) - y_train[i])) ** 2)
        J[i] = (np.dot(x_train[i, :], theta))
        
    return J_theta/m
    
def gradientdescent(x_train, y_train, alpha, num_iterations):
    m = len(y_train)
    theta = np.zeros((x_train.shape[1]))
    J_history = np.zeros(num_iterations)
    x_train_transpose = np.transpose(x_train)
    for i in range(num_iterations):
        theta = theta - ((alpha)*(1/m)*(np.dot(x_train_transpose, (np.dot(x_train, theta) - y_train))))
        J_history[i] = cost(theta, x_train, y_train)
    
    return theta, J_history

theta_final, J_history = gradientdescent(x_train, y_train, 0.1, 10000)
theta_final

array([  6.43108228,   7.42424375,   7.4308154 ,   6.8198501 ,
         6.83674605,   6.92608988,   9.03491793,   6.65273136,
         8.80386262,   8.82879085,   8.5485247 ,   9.09665257,
         6.35600974,   8.30897366,   9.06713754,   9.04005394,
         6.9328805 ,  34.93594696,  23.9913137 ,  17.87751375,
         9.18621649,   8.83907512,   8.35104633,   8.12416484,
         7.36832467, -19.56443861,  -9.51535   ,   0.52306242,
        10.89321315,  23.4919415 ,  36.04039899,   5.58516278,
         6.6467859 ,   7.25189926,   7.54196585,   7.35054924,
         7.49246444,   8.57878048,  -1.62310972])

In [9]:
# # define and fit the linear regression model
# lin_mod = lm.LinearRegression(fit_intercept = False)
# lin_mod.fit(x_train, y_train)

# print(lin_mod.intercept_)
# print(lin_mod.coef_)

In [10]:
y_score = np.dot(x_test, theta_final)

In [11]:
print(y_test)
y_score


[88 47 47 ... 55 79 83]


array([87.86867946, 53.67507563, 47.21181585, ..., 55.072408  ,
       81.10860837, 85.33014077])

In [12]:
def print_metrics(y_true, y_predicted, n_parameters):
    ## First compute R^2 and the adjusted R^2
    r2 = sklm.r2_score(y_true, y_predicted)
    r2_adj = r2 - (n_parameters - 1)/(y_true.shape[0] - n_parameters) * (1 - r2)
    
    ## Print the usual metrics and the R^2 values
    print('Mean Square Error      = ' + str(sklm.mean_squared_error(y_true, y_predicted)))
    print('Root Mean Square Error = ' + str(sqrt(sklm.mean_squared_error(y_true, y_predicted))))
    print('Mean Absolute Error    = ' + str(sklm.mean_absolute_error(y_true, y_predicted)))
    print('Median Absolute Error  = ' + str(sklm.median_absolute_error(y_true, y_predicted)))
    print('R^2                    = ' + str(r2))
    print('Adjusted R^2           = ' + str(r2_adj))
    
print_metrics(y_test, y_score,28)

Mean Square Error      = 38.3527116291299
Root Mean Square Error = 6.1929566145040855
Mean Absolute Error    = 4.734290168804088
Median Absolute Error  = 3.796899228809611
R^2                    = 0.9494825012937107
Adjusted R^2           = 0.9491391043991816


In [13]:
#now we compare this with the predesigned linear regression model

# define and fit the linear regression model
lin_mod = lm.LinearRegression(fit_intercept = False)
lin_mod.fit(x_train, y_train)

print(lin_mod.intercept_)
print(lin_mod.coef_)

0.0
[ 3.59559583e+12  3.59559583e+12  3.59559583e+12  3.59559583e+12
  3.59559583e+12  3.59559583e+12  1.21815744e+13  1.21815744e+13
  1.21815744e+13  1.21815744e+13  1.21815744e+13 -1.75358331e+13
 -1.75358331e+13 -1.75358331e+13 -1.75358331e+13 -1.75358331e+13
  2.20775735e+12  2.20775735e+12  5.35121122e+13  5.35121122e+13
 -6.42505918e+13 -6.42505918e+13 -6.42505918e+13 -6.42505918e+13
 -6.42505918e+13  2.31638331e+13  2.31638331e+13  2.31638331e+13
  2.31638331e+13  2.31638331e+13  2.31638331e+13 -1.28744479e+13
 -1.28744479e+13 -1.28744479e+13 -1.28744479e+13 -1.28744479e+13
 -1.28744479e+13  8.60241699e+00 -1.61718750e+00]


In [14]:
y_score_lr = lin_mod.predict(x_test) 
print_metrics(y_test, y_score_lr, 28)

Mean Square Error      = 38.524636460360036
Root Mean Square Error = 6.206821768051668
Mean Absolute Error    = 4.749142117259407
Median Absolute Error  = 3.828125
R^2                    = 0.9492560450127772
Adjusted R^2           = 0.948911108762864
