# <center> Predicting the Cost of Insurance using Linear Regression </center>
#### *<center>By Peter Kayode</center>*

### Importing the necessary libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import r2_score

In [2]:
# Read the insurance records from the CSV file (path is relative to the
# current working directory) and show the resulting frame.
dataset = pd.read_csv(filepath_or_buffer="insurance.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


### Data Preprocessing

In [3]:
# Convert the categorical (text) columns into numerical codes.
# A single encoder is refitted for each column in turn, so after this cell
# `label` holds the classes of the last column processed ("region").
# NOTE(review): "region" has more than two categories; the integer codes are
# just category ids, but a linear model will read them as an ordered numeric
# scale. Consider one-hot encoding instead -- confirm intent.
label = LabelEncoder()

for column in ("sex", "smoker", "region"):
    dataset[column] = label.fit_transform(dataset[column])

In [4]:
# Display the frame again to inspect its current contents.
dataset

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,0,27.9,0,1,3,16884.92
1,18,1,33.8,1,0,2,1725.55
2,28,1,33.0,3,0,2,4449.46
3,33,1,22.7,0,0,1,21984.47
4,32,1,28.9,0,0,1,3866.86
...,...,...,...,...,...,...,...
1333,50,1,31.0,3,0,1,10600.55
1334,18,0,31.9,0,0,0,2205.98
1335,18,0,36.9,0,0,2,1629.83
1336,21,0,25.8,0,0,3,2007.95


### Splitting the Data into Training and Testing Sets

In [5]:
# Separate the target from the predictors. The target is kept as a
# one-column DataFrame (2-D), not a Series.
y = dataset[["expenses"]]
x = dataset.drop(columns=["expenses"])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

### Fitting the Linear Regression Model

In [6]:
# Create the estimator with default settings and fit it on the training split.
linear_reg = LinearRegression()
linear_reg.fit(X=x_train, y=y_train)

### Checking out the Coefficients and Intercept

In [7]:
# Walk the feature names and the first row of the fitted coefficient array
# side by side, printing one line per feature.
for col_name, weight in zip(x_train.columns, linear_reg.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

The coefficient for age is 261.61211626675913
The coefficient for sex is 109.84388797772343
The coefficient for bmi is 344.5952715352502
The coefficient for children is 424.66030781483255
The coefficient for smoker is 23620.314383712615
The coefficient for region is -326.0971383644361


In [8]:
# Take the first element of the fitted intercept array and display it.
intercept = linear_reg.intercept_[0]
intercept

-12367.009839572052

### Predictions

In [9]:
# Predict the expenses for the held-out rows.
y_pred = linear_reg.predict(x_test)

# Build the comparison table as a NEW frame instead of wrapping x_test with
# pd.DataFrame(x_test): that constructor does not copy by default, and on
# some pandas versions the wrapper shares its internals with x_test, so
# adding columns to it would also add them to x_test. A second run of this
# cell would then pass the extra columns to predict() and fail.
# .assign() returns a fresh frame and leaves x_test untouched.
pred = x_test.assign(
    # Actual cost, taken as a Series so it aligns with x_test on the row index.
    cost=y_test["expenses"],
    # predict() returns a 2-D (n, 1) array here because the model was fitted
    # on a one-column DataFrame target; flatten it to a 1-D column.
    predicted_Cost=y_pred.ravel(),
)
pred

Unnamed: 0,age,sex,bmi,children,smoker,region,cost,predicted_Cost
764,45,0,25.2,2,0,0,9095.07,8938.656851
887,36,0,30.0,0,0,1,5272.18,7062.787354
890,64,0,26.9,0,1,1,29330.98,36939.995651
1293,46,1,25.7,3,0,1,9301.89,9580.973660
259,19,1,31.9,0,1,1,33750.29,27000.270665
...,...,...,...,...,...,...,...,...
701,50,0,44.7,0,0,0,9541.70,16117.004611
672,36,1,29.7,0,0,2,4399.73,6743.155522
1163,18,0,28.2,0,0,0,2200.83,2059.594911
1103,58,1,36.1,0,0,2,11363.28,14704.031818


### Evaluating the model

In [10]:
# Coefficient of determination on the held-out rows. Arguments are named
# because the metric is not symmetric in (true, predicted).
r2_score(y_true=y_test, y_pred=y_pred)

0.7694626233326285