# Multiple Linear Regression Model

### Import the Libraries

In [1]:
import numpy as np
import pandas as pd

### Import the Dataset

In [2]:
# Load the customer transaction CSV from the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer="CustTransactData-wAvgSales.csv")

# Preview the first five rows to check that the columns were parsed as expected.
dataset.head(n=5)

Unnamed: 0,customer_id,gender,past_3_years_bike_related_purchases,job_industry_category,wealth_segment,owns_car,tenure,Age,state,property_valuation,sales_profit
0,1,1,93,1.0,1,1,11.0,64.0,2.0,10.0,274.371818
1,2,0,81,2.0,1,1,16.0,37.0,2.0,10.0,742.086667
2,3,0,61,3.0,1,1,15.0,63.0,2.0,7.0,420.35125
3,4,0,33,4.0,1,0,7.0,56.0,1.0,9.0,110.285
4,5,1,56,7.0,2,1,8.0,40.0,2.0,4.0,399.156667


### Split the Dataset into Input Features and Target Variable

In [3]:
# Remove the customer_id column; the feature matrix built later does not use it.
# errors='ignore' keeps this cell idempotent: `dataset` is rebound to the result,
# so a second run would otherwise raise KeyError for the already-dropped column.
dataset = dataset.drop(columns=['customer_id'], errors='ignore')

In [19]:
# List the distinct codes present in the wealth_segment column.
dataset.loc[:, 'wealth_segment'].unique()

array([1, 2, 3], dtype=int64)

In [4]:
# Every column except the last forms the feature matrix `x`;
# the last column alone is the regression target `y`.
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [12]:
# Show the first five rows of the raw feature matrix.
print(x[:5])

[[ 1. 93.  1.  1.  1. 11. 64.  2. 10.]
 [ 0. 81.  2.  1.  1. 16. 37.  2. 10.]
 [ 0. 61.  3.  1.  1. 15. 63.  2.  7.]
 [ 0. 33.  4.  1.  0.  7. 56.  1.  9.]
 [ 1. 56.  7.  2.  1.  8. 40.  2.  4.]]


### Encoding Categorical Data

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode, addressed by name rather than by hard-coded position.
categorical_columns = ['job_industry_category', 'wealth_segment', 'state']

# Rebuild the raw features from `dataset` instead of transforming `x` in place.
# Overwriting `x` with its own encoding is not idempotent: re-running the cell
# would one-hot encode columns of the already-encoded matrix.
feature_frame = dataset.iloc[:, :-1]
categorical_idx = [feature_frame.columns.get_loc(name) for name in categorical_columns]

# The one-hot columns come first in the result; every other feature is passed
# through unchanged after them.
# NOTE(review): the encoder is fitted on all rows before the train/test split —
# confirm that is acceptable for this analysis.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), categorical_idx)], remainder='passthrough')
x = ct.fit_transform(feature_frame.values)

In [15]:
# Inspect the first five rows of the encoded feature matrix.
print(x[:5])

[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  1. 93.  1.
  11. 64. 10.]
 [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0. 81.  1.
  16. 37. 10.]
 [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0. 61.  1.
  15. 63.  7.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.  0.  1.  0.  0.  0. 33.  0.
   7. 56.  9.]
 [ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  0.  1. 56.  1.
   8. 40.  4.]]


### Split the Dataset into Training and Test Sets

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 60% of the rows for testing; random_state pins the shuffle so the
# split is repeatable across runs.
# NOTE(review): test_size=0.6 leaves only 40% of the data for training —
# confirm this ratio is intended.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=0,
    test_size=0.6,
)

### Training the Multiple Linear Regression Model on the Training Set

In [21]:
from sklearn.linear_model import LinearRegression

# Create a linear regressor with default settings and fit it on the training split.
# The fit call stays as the last expression so the cell displays the fitted estimator.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Predicting the Results

In [22]:
# Predict the target for the held-out test rows.
y_pred = regressor.predict(X=x_test)

In [25]:
# Print predictions (left column) beside the actual test targets (right column).
# set_printoptions changes NumPy's print precision for the rest of the session.
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[566.15 464.74]
 [537.36 588.83]
 [528.67 586.93]
 ...
 [519.77 782.61]
 [521.58 275.68]
 [589.   366.94]]
