In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [23]:
# Read the insurance CSV from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer="insurance.csv")
data.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [24]:
# Positional split: every column except the last is an input feature,
# and the last column is the regression target.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

In [27]:
# Preview the first five feature rows.
x.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [28]:
# Preview the first five target values.
y.head(n=5)

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [29]:
# Encode the categorical columns as integer codes so they can be used as
# numeric regressors.
# NOTE(review): 'region' is a nominal category; the 0..3 codes impose an
# arbitrary ordering on it once it enters a linear model. Consider one-hot
# encoding (e.g. pd.get_dummies) if that ordering is not intended.
_CATEGORY_CODES = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'no': 0, 'yes': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}
for _column, _codes in _CATEGORY_CODES.items():
    # Series.map turns every value that is not a key of the mapping into NaN,
    # so mapping a column that is already encoded (for example when this cell
    # is re-run) would silently wipe it out. Only map columns that still hold
    # the raw, non-numeric labels; this makes the cell safe to run repeatedly.
    if not pd.api.types.is_numeric_dtype(data[_column]):
        data[_column] = data[_column].map(_codes)

In [30]:
# Select by column name: everything except 'charges' is an input feature,
# and 'charges' itself is the value to predict.
x = data.drop(columns='charges')
y = data.loc[:, 'charges']

In [31]:
# Hold out 30% of the rows for testing; random_state is fixed so the split
# is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [32]:
# Report the shape of the full dataset and of each train/test partition.
for _caption, _part in (
    ("shape of original dataset :", data),
    ("shape of input - training set", x_train),
    ("shape of output - training set", y_train),
    ("shape of input - testing set", x_test),
    ("shape of output - testing set", y_test),
):
    print(_caption, _part.shape)

shape of original dataset : (1338, 7)
shape of input - training set (936, 6)
shape of output - training set (936,)
shape of input - testing set (402, 6)
shape of output - testing set (402,)


In [33]:
class MultipleLinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit``, ``coefficients[0]`` is the intercept and
    ``coefficients[1:]`` holds one weight per feature column, in column order.
    """

    def __init__(self):
        # Replaced by a NumPy array in fit().
        self.coefficients = []

    @staticmethod
    def _design_matrix(x):
        """Return ``x`` as a float array with a leading column of ones.

        ``x`` must be 2-D (rows are samples, columns are features); a
        DataFrame or an ndarray is accepted. The column of ones lets the
        first coefficient act as the intercept.
        """
        features = np.asarray(x, dtype=float)
        ones = np.ones((features.shape[0], 1))
        return np.concatenate((ones, features), axis=1)

    def fit(self, x, y):
        """Estimate the coefficients that minimise the squared error on (x, y).

        Parameters
        ----------
        x : 2-D array-like, one row per sample and one column per feature.
        y : array-like of targets, one per row of ``x``.
        """
        design = self._design_matrix(x)
        target = np.asarray(y, dtype=float)
        # Solve the least-squares problem directly instead of forming and
        # inverting X^T X: the explicit inverse is numerically fragile and
        # fails when feature columns are collinear, whereas lstsq returns the
        # minimum-norm solution in that case.
        self.coefficients = np.linalg.lstsq(design, target, rcond=None)[0]

    def predict(self, x):
        """Return the predicted target for every row of ``x``."""
        return self._design_matrix(x).dot(self.coefficients)

    def score(self, x, y):
        """Return the coefficient of determination (R-squared) on (x, y)."""
        y = np.asarray(y, dtype=float)
        y_pred = self.predict(x)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        return 1 - (ss_res / ss_tot)

    def summary(self, x_train=None, y_train=None, x_test=None, y_test=None):
        """Print the fitted parameters and train/test quality metrics.

        Prints the intercept, the feature coefficients, R-squared on the test
        and training data, and the mean squared error on the test data.

        Parameters
        ----------
        x_train, y_train, x_test, y_test : array-like, optional
            Data to evaluate on. Any argument left as ``None`` is looked up
            under the same name in the namespace where this class is defined,
            so the existing no-argument call ``model.summary()`` keeps
            working in the notebook.

        Raises
        ------
        NameError
            If a dataset is neither passed in nor defined at module level.
        """
        print(f'Intercept: {self.coefficients[0]}')
        print(f'Coefficients: {self.coefficients[1:]}')

        supplied = {
            'x_train': x_train,
            'y_train': y_train,
            'x_test': x_test,
            'y_test': y_test,
        }
        module_namespace = globals()
        resolved = {}
        for name, value in supplied.items():
            if value is None:
                if name not in module_namespace:
                    raise NameError(
                        f"summary() needs '{name}': pass it as an argument "
                        f"or define it before calling summary()"
                    )
                value = module_namespace[name]
            resolved[name] = np.asarray(value, dtype=float)

        # Evaluate *this* instance (not whatever object happens to be bound
        # to a global called `model`).
        score = self.score(resolved['x_test'], resolved['y_test'])
        print(f'Testing Score: {score}')

        Tscore = self.score(resolved['x_train'], resolved['y_train'])
        print(f'Training Score: {Tscore}')

        # Compute fresh predictions rather than relying on a separately
        # maintained global that may be stale or belong to another model.
        test_predictions = self.predict(resolved['x_test'])
        mse = np.mean((test_predictions - resolved['y_test']) ** 2)
        print(f'Mean Squared Error: {mse}')
        

In [34]:
# train the model
# Create the regressor and estimate its coefficients on the training split.
model = MultipleLinearRegression()
model.fit(x=x_train, y=y_train)

In [35]:
# Predicted charges for every row of the held-out test features.
predictions = model.predict(x=x_test)

In [36]:
# Print the fitted intercept and coefficients together with the test/train
# R-squared scores and the test mean squared error. Called without arguments,
# summary() relies on notebook-level variables from earlier cells (at least
# the train/test split), so those cells must have been run first.
model.summary()

Intercept: -12659.093548637778
Coefficients: [  256.80772353   -36.84947221   332.0146003    468.33515753
 23449.13776336   207.25521637]
Testing Score: 0.7907514220754802
Training Score: 0.730400823808963
Mean Squared Error: 33368758.796567157


In [15]:
# Recompute the test-set predictions from the raw NumPy feature matrix and
# print the resulting array.
x_test_matrix = x_test.to_numpy()
predictions = model.predict(x_test_matrix)
print(predictions)

[ 1.13604899e+04  9.63047343e+03  3.81065737e+04  1.63338141e+04
  7.05272954e+03  3.54684209e+03  1.11492011e+03  1.45452227e+04
  9.15653214e+03  7.60590109e+03  4.20016236e+03  1.04451514e+04
  8.98646549e+03  3.88683626e+03  2.77711850e+04  1.09358292e+04
  1.14088811e+04  5.65105775e+03  8.33593364e+03  2.64917277e+04
  3.36411266e+04  1.45093087e+04  1.13603693e+04  3.20999735e+04
  4.16595995e+03  8.80051482e+03  6.86565582e+02  9.97470268e+03
  3.84800169e+03  1.05436257e+04  9.16548540e+03  4.00322662e+04
  1.58507371e+04  1.40198816e+04  2.41808796e+04  4.73281638e+03
  1.27268799e+04  3.07047328e+04  3.35025676e+04  3.74040233e+03
  3.59264482e+03  4.02796889e+03  2.99517120e+04  3.94760782e+04
  2.76473213e+04  5.14254175e+03  1.08262644e+04  7.91704638e+03
  3.21404415e+03  1.04089920e+04  5.23768356e+03  3.52019211e+03
  3.30519647e+04  3.80078950e+04  1.61504863e+04  6.67412103e+03
  5.74101493e+03  9.53471440e+03  9.16147748e+03  1.18772163e+04
  1.91947797e+03  3.89529