In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the insurance dataset; the relative path is resolved against the
# notebook's working directory.
insurance_data = pd.read_csv("insurance.csv")

In [None]:
# Preview the first rows to check the columns and value formats.
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Summarise column dtypes and non-null counts.
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Count missing values per column.
insurance_data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [None]:
# Count how many rows fall into each 'region' category.
insurance_data['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [None]:
# Using Label Encoding:
# Each binary column gets its own encoder. Refitting a single shared encoder
# on 'smoker' would discard the classes learned for 'sex', making it
# impossible to map the 'sex' codes back to their labels later.
# LabelEncoder assigns codes in sorted label order, so the resulting values
# are the same as with one shared encoder (female=0, male=1; no=0, yes=1).
sex_encoder = LabelEncoder()
smoker_encoder = LabelEncoder()

insurance_data['sex'] = sex_encoder.fit_transform(insurance_data['sex'])
insurance_data['smoker'] = smoker_encoder.fit_transform(insurance_data['smoker'])

In [None]:
# Display the frame to confirm 'sex' and 'smoker' now hold numeric codes.
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,southwest,16884.92400
1,18,1,33.770,1,0,southeast,1725.55230
2,28,1,33.000,3,0,southeast,4449.46200
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest,10600.54830
1334,18,0,31.920,0,0,northeast,2205.98080
1335,18,0,36.850,0,0,southeast,1629.83350
1336,21,0,25.800,0,0,southwest,2007.94500


- Regression with one hot encoding

In [None]:
# List the current column order before rearranging it.
insurance_data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [None]:
# Reorder the columns so that 'region' is the last feature column and
# 'charges' (the target) is the last column overall; the encoding and
# X/y split cells select columns by position.
# .copy() makes the result an independent DataFrame, so later modifications
# of it do not trigger pandas' SettingWithCopyWarning.
insurance_data = insurance_data[['age', 'bmi', 'children', 'sex', 'smoker', 'region', 'charges']].copy()

In [None]:
# Confirm the new column order.
insurance_data.head()

Unnamed: 0,age,bmi,children,sex,smoker,region,charges
0,19,27.9,0,0,1,southwest,16884.924
1,18,33.77,1,1,0,southeast,1725.5523
2,28,33.0,3,1,0,southeast,4449.462
3,33,22.705,0,1,0,northwest,21984.47061
4,32,28.88,0,1,0,northwest,3866.8552


In [None]:
# Feature matrix: every column except the last; target vector: the last
# column ('charges'). Both are converted to NumPy arrays.
X, y = (
    insurance_data.iloc[:, :-1].to_numpy(),
    insurance_data.iloc[:, -1].to_numpy(),
)

In [None]:
# Inspect the raw feature array (numeric values plus the 'region' strings).
X

array([[19, 27.9, 0, 0, 1, 'southwest'],
       [18, 33.77, 1, 1, 0, 'southeast'],
       [28, 33.0, 3, 1, 0, 'southeast'],
       ...,
       [18, 36.85, 0, 0, 0, 'southeast'],
       [21, 25.8, 0, 0, 0, 'southwest'],
       [61, 29.07, 0, 0, 1, 'northwest']], shape=(1338, 6), dtype=object)

In [None]:
# Look at a single sample; 'region' is still a string in the last position.
X[0]

array([19, 27.9, 0, 0, 1, 'southwest'], dtype=object)

In [None]:
# One-hot encode the last column of X ('region'), dropping the first category;
# all other columns are passed through unchanged.
# NOTE: X is overwritten here, so re-running only this cell would apply the
# encoder to the last column of the already-encoded array.
region_encoder = OneHotEncoder(drop='first')
ct = ColumnTransformer(
    transformers=[('encoder', region_encoder, [-1])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect the feature array after one-hot encoding.
X

array([[0.0, 0.0, 1.0, ..., 0, 0, 1],
       [0.0, 1.0, 0.0, ..., 1, 1, 0],
       [0.0, 1.0, 0.0, ..., 3, 1, 0],
       ...,
       [0.0, 1.0, 0.0, ..., 0, 0, 0],
       [0.0, 0.0, 1.0, ..., 0, 0, 0],
       [1.0, 0.0, 0.0, ..., 0, 0, 1]], shape=(1338, 8), dtype=object)

In [None]:
# First sample after encoding: the one-hot 'region' columns come first,
# followed by the passthrough columns in their original order.
X[0, :]

array([0.0, 0.0, 1.0, 19, 27.9, 0, 0, 1], dtype=object)

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Columns 3, 4 and 5 of the encoded matrix hold age, bmi and children
# (columns 0-2 are the one-hot 'region' columns; 6 and 7 are the binary
# sex/smoker codes, which are left unscaled).
numeric_cols = slice(3, 6)

# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split.
sc = StandardScaler()
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols])

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline linear model fitted on the training split.
lin_reg = LinearRegression().fit(X_train, y_train)
lin_reg

In [None]:
# Predict charges for the held-out test rows with the linear model.
y_pred = lin_reg.predict(X_test)

In [None]:
# Score of the linear model on the training split (R^2 for scikit-learn regressors).
lin_reg.score(X_train, y_train)

0.7417255854683333

In [None]:
# Score of the linear model on the held-out test split (R^2).
lin_reg.score(X_test, y_test)

0.7835929767120722

# ElasticNet Regression
- Lasso and Ridge regression (in cells that have since been deleted) gave almost the same scores as ElasticNet.
- So I deduced that I was underfitting, which seems to have been the case.

In [None]:
from sklearn.linear_model import ElasticNet

# Elastic-net regularised linear model with a small penalty strength.
elastic_reg = ElasticNet(alpha=0.001).fit(X_train, y_train)
elastic_reg

In [None]:
# R^2 of the ElasticNet model on the training and test splits.
elastic_train_score = elastic_reg.score(X_train, y_train)
elastic_test_score = elastic_reg.score(X_test, y_test)

print(f"Training set score: {elastic_train_score:.3f}")
print(f"Test set score: {elastic_test_score:.3f}")

Training set score: 0.742
Test set score: 0.783


# RFC without CV
- After trying a random forest regressor (`rfc`), I concluded that I had been underfitting with the linear models above

In [None]:
# Random forest with default hyperparameters. random_state is fixed so that
# the fitted forest, and therefore the scores printed here, are reproducible
# after a kernel restart.
rfc_reg = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# training & test set performance (Rsqr score):
print(f"Training set score: {rfc_reg.score(X_train, y_train):.3f}")
print(f"Test set score: {rfc_reg.score(X_test, y_test):.3f}")

Training set score: 0.975
Test set score: 0.865


In [None]:
# Test-split predictions from the default random forest.
y_pred_rfc = rfc_reg.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error between the true and predicted charges on the test
# split, expressed in the same units as 'charges'.
mean_absolute_error(y_test, y_pred_rfc)


2550.5482185870037

# Using RandomizedSearchCV with K-fold cross-validation

In [None]:
# RandomizedSearchCV is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.

# Base estimator for the search. random_state is fixed so that every candidate
# forest is built deterministically and the search result is reproducible.
rfc = RandomForestRegressor(random_state=42)

# Candidate values sampled by the randomized search.
param_dist = {
    'n_estimators': np.arange(50, 400, 50),        # 50, 100, ..., 350 trees
    'max_depth': [None, 10, 20, 30, 40, 50],       # None = no depth limit
    'min_samples_split': [2, 5, 10, 15, 20],
    'min_samples_leaf': [1, 2, 4, 6, 10],
}

In [None]:
# Randomized hyperparameter search: 50 sampled combinations, each scored by
# R^2 with 5-fold cross-validation, using all available CPU cores.
random_search = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_dist,
    n_iter=50,
    scoring='r2',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the search on the training split only; the test split is not used here.
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
# Report the winning hyperparameters and evaluate the corresponding estimator
# on the held-out test split.
print(f"Best Parameters: {random_search.best_params_}")
best_rfc_model = random_search.best_estimator_
# score() on a regressor returns R^2, not classification accuracy, so the
# printed label names the metric accordingly.
r2 = best_rfc_model.score(X_test, y_test)
print(f"Test Set R^2: {r2}")

Best Parameters: {'n_estimators': np.int64(100), 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_depth': None}
Test Set Accuracy: 0.8770270738224072


In [None]:
# Training-split R^2 of the tuned model, for comparison with its test score.
best_rfc_model.score(X_train, y_train)

0.8875634235769966

In [None]:
# Test-split R^2 of the tuned model (the same quantity printed together with
# the best parameters).
best_rfc_model.score(X_test, y_test)

0.8770270738224072

In [None]:
# Recall the DataFrame's column order to help relate features to the
# feature importances examined below.
insurance_data.columns

Index(['age', 'bmi', 'children', 'sex', 'smoker', 'region', 'charges'], dtype='object')

In [None]:
# Print the values of the first encoded sample on a single line, each
# followed by a space; nothing is printed if X has no rows.
first_sample = next(iter(X), ())
for value in first_sample:
    print(value, end=' ')

0.0 0.0 1.0 19 27.9 0 0 1 

In [None]:
# Importance of each column of the encoded training matrix, in column order:
# the three one-hot 'region' columns first, then age, bmi, children, sex, smoker.
best_rfc_model.feature_importances_

array([6.11836094e-04, 1.10786783e-03, 1.07979191e-03, 1.21728047e-01,
       1.83303432e-01, 9.37681395e-03, 1.39023958e-03, 6.81401972e-01])

In [None]:
# Display the frame again before 'region' is removed.
insurance_data

Unnamed: 0,age,bmi,children,sex,smoker,region,charges
0,19,27.900,0,0,1,southwest,16884.92400
1,18,33.770,1,1,0,southeast,1725.55230
2,28,33.000,3,1,0,southeast,4449.46200
3,33,22.705,0,1,0,northwest,21984.47061
4,32,28.880,0,1,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,30.970,3,1,0,northwest,10600.54830
1334,18,31.920,0,0,0,northeast,2205.98080
1335,18,36.850,0,0,0,southeast,1629.83350
1336,21,25.800,0,0,0,southwest,2007.94500


In [None]:
# Remove 'region' from the features.
# NOTE(review): presumably motivated by the low importances of the one-hot
# 'region' columns in the feature-importance output - confirm.
# drop() returns a new DataFrame that is rebound to the same name; this avoids
# the SettingWithCopyWarning raised by an in-place drop on a sliced frame.
# errors='ignore' keeps the cell safe to re-run once the column is gone.
insurance_data = insurance_data.drop(columns='region', errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  insurance_data.drop('region', axis=1, inplace=True)


In [None]:
# Confirm that 'region' is no longer among the columns.
insurance_data

Unnamed: 0,age,bmi,children,sex,smoker,charges
0,19,27.900,0,0,1,16884.92400
1,18,33.770,1,1,0,1725.55230
2,28,33.000,3,1,0,4449.46200
3,33,22.705,0,1,0,21984.47061
4,32,28.880,0,1,0,3866.85520
...,...,...,...,...,...,...
1333,50,30.970,3,1,0,10600.54830
1334,18,31.920,0,0,0,2205.98080
1335,18,36.850,0,0,0,1629.83350
1336,21,25.800,0,0,0,2007.94500


In [None]:
# Rebuild features (all columns but the last) and target (last column) from
# the reduced frame. They are kept as pandas objects this time, so X and y no
# longer refer to the NumPy arrays used earlier.
X, y = insurance_data.iloc[:, :-1], insurance_data.iloc[:, -1]

In [None]:
# Split the reduced feature set with the same test size and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Refit the tuned forest on the reduced, unscaled feature set (no 'region').
# best_rfc_model is the same object as random_search.best_estimator_, so this
# replaces the fit produced by the search.
best_rfc_model.fit(X_train, y_train)

In [None]:
# Training-split R^2 after refitting without 'region'.
best_rfc_model.score(X_train, y_train)

0.8853763515695019

In [None]:
# Test-split R^2 after refitting without 'region'.
best_rfc_model.score(X_test, y_test)

0.8790133976557712

In [None]:
# Final fit on all rows (training and test together) before the model is
# saved. The scores reported above come from the fit on the training split
# only, so no held-out estimate exists for this final model.
best_rfc_model.fit(X, y)

# Saving the model

In [None]:
import joblib

# Persist the final fitted model. It was trained on the DataFrame columns
# age, bmi, children, sex, smoker (in that order), so callers that load it
# must supply features in the same layout.
MODEL_PATH = "rfc_insurance_charge_pred"
with open(MODEL_PATH, "wb") as model_file:
    joblib.dump(best_rfc_model, model_file)