In [53]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import statistics as stat
import seaborn as sns
import joblib

In [54]:
dataset=pd.read_csv('insurance.csv')

In [55]:
dataset.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


### Exploratory Data Analysis

In [56]:
from pandas_profiling import ProfileReport
report=ProfileReport(dataset)
report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
report.to_file('EDA.html')

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
def regression_line(x,y):
    
    m = (sum(x*y) - (sum(x)*sum(y)/len(x)))/(sum(x*x) - (sum(x)*sum(x)/len(x)))
    constant = stat.mean(y) - m*stat.mean(x)
    print(f'linear regression line: Slope = {m:.2f}, Constant = {constant:.2f}')
    
    plt.plot(x, y, 'o')
    reg_line = [(m * i) + constant for i in x]
    plt.plot(x, reg_line)
    plt.show()

In [11]:
for i in dataset.columns:
    plt.figure(figsize=(12, 7))
    sns.scatterplot(data=dataset, x=i, y='charges',hue='charges',palette='cool', legend=False)
    regression_line(dataset[i], dataset['charges'])

linear regression line: Slope = 257.72, Constant = 3165.89


  plt.show()


TypeError: can't multiply sequence by non-int of type 'float'

### Columns

In [12]:
print(dataset.columns)
print("\nTotal number of values in the dataset is :",dataset.__len__())

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

Total number of values in the dataset is : 1338


### Checking For null Values

In [13]:
import missingno as msno
msno.matrix(dataset)

<AxesSubplot:>

In [14]:
dataset.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [59]:
### There are no null values in the dataset

### Removing Missing Data (Optional)

In [15]:
import numpy as np
dataset.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


### Data Encoding

In [16]:
cat_cols = [col for col in dataset.columns if col not in dataset.describe().columns]

In [50]:
dataset.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [17]:
print('we have the following categorical columns:',cat_cols)

we have the following categorical columns: ['sex', 'smoker', 'region']


In [58]:
def categorical_function(dataset):
  new_variable = {
    "sex": {"male": 0, "female": 1},
    "smoker": {"yes": 0, "no": 1},
    "region": {"northeast": 0, "northwest": 1, "southeast": 2, "southwest": 3},
    }
  dataset.replace(new_variable, inplace=True)
  return dataset

In [59]:
dataset_dummy=categorical_function(dataset)

In [60]:
dataset_dummy.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,0,3,16884.924
1,18,0,33.77,1,1,2,1725.5523
2,28,0,33.0,3,1,2,4449.462
3,33,0,22.705,0,1,1,21984.47061
4,32,0,28.88,0,1,1,3866.8552


In [61]:
Y=dataset_dummy["charges"]

In [62]:
dataset_dummy.drop('charges', inplace=True, axis=1)

In [63]:
dataset_dummy.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,1,27.9,0,0,3
1,18,0,33.77,1,1,2
2,28,0,33.0,3,1,2
3,33,0,22.705,0,1,1
4,32,0,28.88,0,1,1
5,31,1,25.74,0,1,2
6,46,1,33.44,1,1,2
7,37,1,27.74,3,1,1
8,37,0,29.83,2,1,0
9,60,1,25.84,0,1,1


In [64]:
X=dataset_dummy.iloc[:].values
Y=Y.iloc[:].values

### Feature Scaling

First we will divide our dataset using train_test_split for training and testing Purposes

In [65]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 1)

In [66]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train=sc.fit_transform(x_train)
x_test=sc.transform(x_test)

In [68]:
## Saving Standard Scaler
joblib.dump(sc,'models/standard_scaler.joblib')

['models/standard_scaler.joblib']

### Accuracy Helper Function

In [69]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

def performanceOutPut(y_test,y_pred,model_name=None):
  errors = list()
  print(f"Performance Analysis for {model_name}")
  for i in range(len(y_test)):
    err = (y_test[i] - y_pred[i])**2
    errors.append(err)
    #print('>%.1f, %.1f = %.3f' % (y_test[i], y_pred[i], err))
    plt.title(f"{model_name}")
    plt.plot(errors)
    #plt.xticks(ticks=[i for i in range(len(errors))], labels=y_pred)
    plt.xlabel('Predicted Value')
    plt.ylabel('Mean Squared Error')
    plt.show()
  mse=mean_squared_error(y_test,y_pred)
  rmse=mean_squared_error(y_test,y_pred,squared=False)
  mae=mean_absolute_error(y_test,y_pred)
  print(f"Mean Squared Error MSE :-{mse}")
  print(f"Root Mean Squared Error MSE :-{rmse}")
  print(f"Mean absolute Error MAE :-{mae}")

In [70]:
### Performance Metrics Pandas Dataframe
model_performance=pd.DataFrame()

### Linear Regression

In [71]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(x_train,y_train)
print('Model Parameters are:-',lin_reg.intercept_, lin_reg.coef_,"\n")
linear_pred=lin_reg.predict(x_test)

Model Parameters are:- 13230.161574933638 [ 3602.77541857   122.13671165  1929.20311613   497.80337376
 -9605.29975404  -367.95675551] 



In [72]:
performanceOutPut(y_test,linear_pred,model_name='Linear Regression')

Performance Analysis for Linear Regression
Mean Squared Error MSE :-35480663.81217908
Root Mean Squared Error MSE :-5956.564766052585
Mean absolute Error MAE :-4056.118305662652


  plt.show()


In [73]:
## Saving The Linear Model
filename = 'models/linear_regression.joblib'
joblib.dump(lin_reg, filename)

['models/linear_regression.joblib']

### Polynomial Regression

In [74]:
from sklearn.linear_model import LinearRegression
lin_reg_for_poly = LinearRegression()
lin_reg_for_poly.fit(x_train, y_train)

from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
x_poly_train = poly_reg.fit_transform(x_train)
x_poly_test=poly_reg.transform(x_test)

lin_reg_2_for_poly = LinearRegression()
lin_reg_2_for_poly.fit(x_poly_train,y_train)

LinearRegression()

In [75]:
poly_reg_pred=lin_reg_2_for_poly.predict(x_poly_test)

In [76]:
performanceOutPut(y_test,poly_reg_pred,model_name='Polynomial Regression Degree-2')

Performance Analysis for Polynomial Regression Degree-2
Mean Squared Error MSE :-20682287.591864556
Root Mean Squared Error MSE :-4547.778313843426
Mean absolute Error MAE :-2746.505944701493


  plt.show()


In [77]:
##Saving the Model
filename = 'models/polynomial_features.joblib'
filename2 = 'models/polynomial_regression.joblib'

joblib.dump(poly_reg, filename)
joblib.dump(lin_reg_2_for_poly, filename2)

['models/polynomial_regression.joblib']

### Support Vector Machines

In [78]:
from sklearn.svm import SVR
svr_1 = SVR(kernel='sigmoid')
svr_1.fit(x_train,y_train)
print('Model Parameters are:-',svr_1.intercept_,"\n")
svr_1_pred=svr_1.predict(x_test)

Model Parameters are:- [9241.33314757] 



In [79]:
performanceOutPut(y_test,svr_1_pred,model_name="Support Vector Machines Kernel:-Sigmoid")

Performance Analysis for Support Vector Machines Kernel:-Sigmoid
Mean Squared Error MSE :-164706815.73660326
Root Mean Squared Error MSE :-12833.815322677947
Mean absolute Error MAE :-8242.693978701302


  plt.show()


In [80]:
## Saving The Models
filename="models/svr.joblib"
joblib.dump(svr_1, filename)

['models/svr.joblib']

### Random Forests

In [81]:
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(n_estimators=500, max_leaf_nodes=32, n_jobs=-1,min_samples_leaf=2)
random_forest.fit(x_train, y_train) 
random_forest_pred = random_forest.predict(x_test)

In [82]:
performanceOutPut(y_test,random_forest_pred,model_name="Random Forest Model")

Performance Analysis for Random Forest Model
Mean Squared Error MSE :-17967309.250098944
Root Mean Squared Error MSE :-4238.786294459647
Mean absolute Error MAE :-2347.106560004309


  plt.show()


In [83]:
## Saving the Models
filename="models/random_forest.joblib"
joblib.dump(random_forest, filename)

['models/random_forest.joblib']

### AdaBoost

In [84]:
from sklearn.ensemble import AdaBoostRegressor
adaboost=AdaBoostRegressor()
adaboost.fit(x_train,y_train)
adaboost_pred=adaboost.predict(x_test)

In [85]:
performanceOutPut(y_test,random_forest_pred,model_name="Random Forest Model")

Performance Analysis for Random Forest Model


  plt.show()


Mean Squared Error MSE :-17967309.250098944
Root Mean Squared Error MSE :-4238.786294459647
Mean absolute Error MAE :-2347.106560004309


In [86]:
filename="models/adaboost.joblib"
joblib.dump(adaboost, filename)

['models/adaboost.joblib']