# Linear Regression / Polynomial Regression

## Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Importing the dataset

In [2]:
dataset = pd.read_csv("data/insurance.csv")

In [3]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

In [5]:
X.shape, y.shape

((1338, 6), (1338,))

## Encoding categorical data

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [7]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1, 4, 5])], remainder='passthrough')

In [8]:
X = np.array(ct.fit_transform(X))

In [9]:
X[0]

array([1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 19, 27.9, 0], dtype=object)

## Splitting the dataset into the Training set and Test set

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [12]:
X_test

array([[0.0, 1.0, 1.0, ..., 52, 30.2, 1],
       [1.0, 0.0, 1.0, ..., 47, 29.37, 1],
       [0.0, 1.0, 0.0, ..., 48, 40.565, 2],
       ...,
       [0.0, 1.0, 1.0, ..., 57, 40.28, 0],
       [1.0, 0.0, 0.0, ..., 30, 39.05, 3],
       [0.0, 1.0, 1.0, ..., 46, 24.795, 3]], dtype=object)

## Training the Linear Regression model on the Training set

In [13]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

In [14]:
regressor.fit(X_train, y_train)

LinearRegression()

In [15]:
regressor.score(X_train, y_train)

0.7370262574551634

In [16]:
regressor.coef_

array([ 7.73186394e+00, -7.73186394e+00, -1.18025086e+04,  1.18025086e+04,
        4.83840068e+02,  2.23707336e+02, -4.29438766e+02, -2.78108638e+02,
        2.53700500e+02,  3.35962814e+02,  4.36910121e+02])

In [17]:
regressor.intercept_

-517.1368358426444

## Training the Polynomial Regression model on the Training set

In [18]:
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 4)
X_poly = poly_reg.fit_transform(X_train)

In [19]:
poly_regressor = LinearRegression()
poly_regressor.fit(X_poly, y_train)

LinearRegression()

In [20]:
poly_regressor.score(X_poly, y_train)

0.8672828655744662

In [21]:
poly_regressor.coef_

array([ 7.29828322e+08,  3.82025236e+07, -1.28024439e+08, ...,
       -9.98693049e-01,  1.15110625e+01,  2.75643297e+01])

In [22]:
poly_regressor.intercept_

-657604157.2352935

## Predicting the Linear Regression Test set results

In [23]:
y_pred =  regressor.predict(X_test)

In [24]:
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[1.12e+04 9.72e+03]
 [9.49e+03 8.55e+03]
 [3.82e+04 4.57e+04]
 [1.63e+04 1.30e+04]
 [6.91e+03 9.64e+03]
 [3.96e+03 4.50e+03]
 [1.58e+03 2.20e+03]
 [1.44e+04 1.14e+04]
 [9.01e+03 7.54e+03]
 [7.51e+03 5.43e+03]
 [4.49e+03 6.75e+03]
 [1.03e+04 1.05e+04]
 [8.80e+03 7.34e+03]
 [3.80e+03 4.19e+03]
 [2.79e+04 1.83e+04]
 [1.07e+04 1.07e+04]
 [1.13e+04 1.25e+04]
 [6.11e+03 3.49e+03]
 [8.24e+03 6.46e+03]
 [2.71e+04 3.35e+04]
 [3.36e+04 2.40e+04]
 [1.44e+04 1.26e+04]
 [1.17e+04 2.30e+04]
 [3.21e+04 2.31e+04]
 [4.17e+03 1.67e+03]
 [9.25e+03 4.67e+03]
 [1.08e+03 3.73e+03]
 [9.80e+03 7.68e+03]
 [3.77e+03 3.76e+03]
 [1.04e+04 8.41e+03]
 [9.01e+03 8.06e+03]
 [4.01e+04 4.90e+04]
 [1.57e+04 1.30e+04]
 [1.39e+04 2.06e+04]
 [2.48e+04 1.46e+04]
 [5.17e+03 4.14e+03]
 [1.26e+04 8.35e+03]
 [3.08e+04 5.12e+04]
 [3.35e+04 4.00e+04]
 [3.67e+03 1.88e+03]
 [3.98e+03 5.46e+03]
 [3.99e+03 2.87e+03]
 [3.05e+04 2.01e+04]
 [3.95e+04 4.75e+04]
 [2.78e+04 3.61e+04]
 [5.09e+03 2.60e+04]
 [1.06e+04 1.97e+04]
 [7.83e+03 6.

In [25]:
df = pd.DataFrame({'Orginal value': y_test.flatten(), 'Predicted value': y_pred.flatten()})
df

Unnamed: 0,Orginal value,Predicted value
0,9724.53000,11169.927119
1,8547.69130,9486.709085
2,45702.02235,38181.123053
3,12950.07120,16266.313289
4,9644.25250,6914.648007
...,...,...
263,15019.76005,14760.230968
264,6664.68595,8277.984346
265,20709.02034,16149.973370
266,40932.42950,32904.758143


## Predicting the Polynomial Regression Test set results

In [26]:
y_poly_pred =  poly_regressor.predict(poly_reg.fit_transform(X_test))

In [27]:
y_poly_pred.size

268

In [28]:
np.set_printoptions(precision=2)
print(np.concatenate((y_poly_pred.reshape(len(y_poly_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[12113.07  9724.53]
 [ 9134.57  8547.69]
 [59243.85 45702.02]
 [10514.69 12950.07]
 [16006.92  9644.25]
 [ 2773.91  4500.34]
 [ 3157.01  2198.19]
 [12724.59 11436.74]
 [ 8035.99  7537.16]
 [ 8771.91  5425.02]
 [ 7814.86  6753.04]
 [10781.16 10493.95]
 [ 8495.02  7337.75]
 [ 8582.91  4185.1 ]
 [21702.46 18310.74]
 [14700.29 10702.64]
 [13141.9  12523.6 ]
 [ 8423.63  3490.55]
 [ 7930.93  6457.84]
 [28722.99 33475.82]
 [23073.98 23967.38]
 [15884.43 12643.38]
 [11169.11 23045.57]
 [29313.09 23065.42]
 [ 3383.62  1674.63]
 [ 9552.3   4667.61]
 [ 6816.68  3732.63]
 [ 9028.02  7682.67]
 [ 7355.92  3756.62]
 [10673.3   8413.46]
 [ 7620.02  8059.68]
 [47661.35 48970.25]
 [13241.07 12979.36]
 [11200.29 20630.28]
 [14421.14 14571.89]
 [ 5338.73  4137.52]
 [ 9212.26  8347.16]
 [39864.3  51194.56]
 [39993.13 40003.33]
 [ 3348.11  1880.49]
 [ 6994.13  5458.05]
 [ 3761.69  2867.12]
 [27226.03 20149.32]
 [56223.07 47496.49]
 [34768.98 36149.48]
 [ 8450.44 26018.95]
 [14638.79 19749.38]
 [ 8106.63  6

In [29]:
df = pd.DataFrame({'Orginal value': y_test.flatten(), 'Predicted value': y_poly_pred.flatten()})
df

Unnamed: 0,Orginal value,Predicted value
0,9724.53000,12113.068172
1,8547.69130,9134.565914
2,45702.02235,59243.847876
3,12950.07120,10514.689023
4,9644.25250,16006.919309
...,...,...
263,15019.76005,18628.096411
264,6664.68595,7998.008907
265,20709.02034,10143.043149
266,40932.42950,40760.587962
