In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn deprecation warnings raised
# by later cells, so API breakage will only surface as hard errors.
warnings.simplefilter(action="ignore")

In [62]:
# Read the pre-processed insurance table from the working directory and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Insurance_Analyzed.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,0,19,0,27.9,0,1,southwest,16884.92
1,1,18,1,33.8,1,0,southeast,1725.55
2,2,28,1,33.0,3,0,southeast,4449.46
3,3,33,1,22.7,0,0,northwest,21984.47
4,4,32,1,28.9,0,0,northwest,3866.86


In [63]:
# The preview above shows two "Unnamed: ..." columns that simply count rows
# (presumably index copies left behind by earlier CSV round-trips). Dropping only
# "Unnamed: 0" would leave "Unnamed: 0.1" in the feature matrix, where it acts as
# a meaningless predictor that the prediction cell cannot supply. Remove every
# such column together with "region", which is not used as a feature.
index_artifact_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]

# Reassign instead of mutating in place, and ignore already-missing labels, so
# re-running this cell is a harmless no-op rather than a KeyError.
df = df.drop(columns=index_artifact_cols + ["region"], errors="ignore")

***Train_Test_Split***

In [64]:
# Feature matrix: every remaining column except the target.
X = df.drop(columns="expenses")

# Target: fourth root of expenses. Predictions are raised to the 4th power in
# the prediction cell to get back to the original scale.
y = df["expenses"].pow(1 / 4)

In [65]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

In [66]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

# Take explicit copies so the column assignments below cannot write through to
# (or warn about) the parent frame X.
X_train, X_test = X_train.copy(), X_test.copy()

# Standardise only the continuous columns; the 0/1 flags stay as they are.
# Whole-column assignment (frame[cols] = ...) replaces the columns, so the
# integer-typed age/children columns become float without relying on the
# deprecated silent upcast that `.loc[:, cols] = float_array` triggers.
numeric_cols = ["age", "bmi", "children"]
sc = StandardScaler()
X_train[numeric_cols] = sc.fit_transform(X_train[numeric_cols])  # fit on train only
X_test[numeric_cols] = sc.transform(X_test[numeric_cols])        # reuse train statistics

# ***Linear Regression***

In [67]:
# Ordinary least-squares baseline on the training split.
li_model = LinearRegression()
li_model.fit(X_train, y_train)
li_pred_test = li_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=li_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", li_model.score(X_train, y_train))
print("Testing Accuracy :", li_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=li_pred_test))

Training Accuracy : 0.7756707009732438
Testing Accuracy : 0.7774416801461649
Cross Validation Score : 0.7727982141169777
MAE : 0.7281463246360949


# ***Polynomial Regression***

In [43]:
# After hyperparameter tuning, the best degree was 1, which is equivalent to plain Linear Regression.

# ***Ridge***

In [68]:
# L2-regularised linear model with alpha = 0.1.
ridge_model = Ridge(alpha=0.1)
ridge_model.fit(X_train, y_train)
ridge_pred_test = ridge_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=ridge_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", ridge_model.score(X_train, y_train))
print("Testing Accuracy :", ridge_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=ridge_pred_test))

Training Accuracy : 0.7756705141593476
Testing Accuracy : 0.7774377247349412
Cross Validation Score : 0.772800422335522
MAE : 0.7282628345531125


# ***Lasso***

In [69]:
# L1-regularised linear model with alpha = 0.001.
lasso_model = Lasso(alpha=0.001)
lasso_model.fit(X_train, y_train)
lasso_pred_test = lasso_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=lasso_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", lasso_model.score(X_train, y_train))
print("Testing Accuracy :", lasso_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=lasso_pred_test))

Training Accuracy : 0.7756679954091377
Testing Accuracy : 0.777345736538896
Cross Validation Score : 0.7727986794799785
MAE : 0.7284826545308474


# ***ElasticNet***

In [70]:
# Elastic net with alpha = 0.001; l1_ratio = 0 switches the L1 share of the
# penalty off, leaving only the L2 term.
elastic_model = ElasticNet(l1_ratio=0, alpha=0.001)
elastic_model.fit(X_train, y_train)
elastic_pred_test = elastic_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=elastic_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", elastic_model.score(X_train, y_train))
print("Testing Accuracy :", elastic_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=elastic_pred_test))

Training Accuracy : 0.7756495899580139
Testing Accuracy : 0.777381572206812
Cross Validation Score : 0.7727977864985611
MAE : 0.7293857322924027


# ***Support Vector Machine - SVR***

In [71]:
# Support vector regression with an RBF kernel and C = 10.
svm_model = SVR(kernel="rbf", C=10)
svm_model.fit(X_train, y_train)
svm_pred_test = svm_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=svm_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", svm_model.score(X_train, y_train))
print("Testing Accuracy :", svm_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=svm_pred_test))

Training Accuracy : 0.8270235199953618
Testing Accuracy : 0.8328535345751463
Cross Validation Score : 0.8160320542165544
MAE : 0.4384033042401969


# ***KNearestNeighbors - KNeighborsRegressor***

In [73]:
# k-nearest-neighbours regression using the 6 closest training rows.
knn_model = KNeighborsRegressor(n_neighbors=6)
knn_model.fit(X_train, y_train)
knn_pred_test = knn_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=knn_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", knn_model.score(X_train, y_train))
print("Testing Accuracy :", knn_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=knn_pred_test))

Training Accuracy : 0.827245463910096
Testing Accuracy : 0.778508364699954
Cross Validation Score : 0.7491571237567923
MAE : 0.6871901909977165


# ***Decision Tree - DecisionTreeRegressor***

In [74]:
# Single regression tree capped at depth 4; the seed fixes tie-breaking.
decision_model = DecisionTreeRegressor(random_state=0, max_depth=4)
decision_model.fit(X_train, y_train)
decision_pred_test = decision_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=decision_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", decision_model.score(X_train, y_train))
print("Testing Accuracy :", decision_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=decision_pred_test))

Training Accuracy : 0.8365960723009352
Testing Accuracy : 0.8308490658722979
Cross Validation Score : 0.818685541624299
MAE : 0.5946100303222766


# ***RandomForestRegressor***

In [76]:
# Bagged ensemble of 32 regression trees with a fixed seed.
random_forest_model = RandomForestRegressor(random_state=0, n_estimators=32)
random_forest_model.fit(X_train, y_train)
random_forest_pred_test = random_forest_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=random_forest_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", random_forest_model.score(X_train, y_train))
print("Testing Accuracy :", random_forest_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=random_forest_pred_test))

Training Accuracy : 0.9636846773056985
Testing Accuracy : 0.819008235503097
Cross Validation Score : 0.78408320348403
MAE : 0.5756904471946709


# ***AdaBoostRegressor***

In [78]:
# AdaBoost restricted to a single boosting round (n_estimators = 1).
ada_model = AdaBoostRegressor(random_state=0, n_estimators=1)
ada_model.fit(X_train, y_train)
ada_pred_test = ada_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=ada_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", ada_model.score(X_train, y_train))
print("Testing Accuracy :", ada_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=ada_pred_test))

Training Accuracy : 0.8147174752898869
Testing Accuracy : 0.7926018798821244
Cross Validation Score : 0.79864356151723
MAE : 0.6768659218937322


# ***GradientBoostingRegressor***

In [81]:
# Gradient boosting: 18 stages at learning rate 0.2, fixed seed.
gradient_model = GradientBoostingRegressor(random_state=0, n_estimators=18, learning_rate=0.2)
gradient_model.fit(X_train, y_train)
gradient_pred_test = gradient_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=gradient_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", gradient_model.score(X_train, y_train))
print("Testing Accuracy :", gradient_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=gradient_pred_test))

Training Accuracy : 0.8493908270734798
Testing Accuracy : 0.842541145547055
Cross Validation Score : 0.8300996865355259
MAE : 0.5513492130166268


# ***XGBRegressor***

In [82]:
# XGBoost: 13 trees, learning rate 0.2, minimum split gain (gamma) 0.3, fixed seed.
xgb_model = XGBRegressor(random_state=0, gamma=0.3, learning_rate=0.2, n_estimators=13)
xgb_model.fit(X_train, y_train)
xgb_pred_test = xgb_model.predict(X_test)

# Mean 5-fold cross-validation score, computed on the training split only.
score = cross_val_score(estimator=xgb_model, X=X_train, y=y_train, cv=5).mean()

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", xgb_model.score(X_train, y_train))
print("Testing Accuracy :", xgb_model.score(X_test, y_test))
print("Cross Validation Score :", score)

# MAE is measured on the fourth-root scale of the target.
print("MAE :", mean_absolute_error(y_true=y_test, y_pred=xgb_pred_test))

Training Accuracy : 0.888257068058494
Testing Accuracy : 0.8269033337873133
Cross Validation Score : 0.8118259042037727
MAE : 0.5954070934307115


# ***Final Model***

In [120]:
# Final model: gradient boosting refit on a fresh 85/15 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 3)

# Copy so the column assignments below never write through to X.
X_train, X_test = X_train.copy(), X_test.copy()

# Standardise only the continuous columns and keep the splits as DataFrames.
# Fitting `sc` on every column would break the prediction cell, which calls
# sc.transform on just ["age", "bmi", "children"] (feature-count mismatch) and
# hands the sex/smoker flags to the model unscaled.
continuous_cols = ["age", "bmi", "children"]
sc = StandardScaler()
X_train[continuous_cols] = sc.fit_transform(X_train[continuous_cols])  # fit on train only
X_test[continuous_cols] = sc.transform(X_test[continuous_cols])        # reuse train statistics

# Same estimator family as the earlier gradient-boosting cell, with 55 stages.
gradient_model = GradientBoostingRegressor(learning_rate = 0.2, n_estimators = 55, random_state = 0).fit(X_train, y_train)
gradient_pred_test = gradient_model.predict(X_test)

# For a regressor, .score() is R^2; the "Accuracy" labels are kept verbatim.
print("Training Accuracy :", gradient_model.score(X_train, y_train))
print("Testing Accuracy :", gradient_model.score(X_test, y_test))

# Mean 5-fold cross-validation score on the training split.
score = cross_val_score(gradient_model, X_train, y_train, cv = 5).mean()
print("Cross Validation Score :", score)

# y is the fourth root of expenses, so this MAE is on that transformed scale,
# not in currency units.
print("MAE :", mean_absolute_error(y_test, gradient_pred_test))

Training Accuracy : 0.9988502312675592
Testing Accuracy : 0.9853348392640189
Cross Validation Score : 0.9785454046018517
MAE : 465.5312141359818


***Prediction on New Data***

In [101]:
# Collect one applicant's details interactively. int()/float() raise ValueError
# on non-numeric input.
age = int(input("Enter Age : "))
gender = input("Type your Gender: ").strip().lower()
bmi = float(input("Enter your BMI:"))
children = int(input("Enter number of children :"))
smoker = input("Are you a smoker ?").strip().lower()
region = input("In which region you belong:")  # asked for, but not a model feature

# Map the text answers to 0/1 codes up front. A chained
# new_data["sex"].replace(..., inplace=True) acts on a temporary column object
# and leaves the frame untouched under pandas Copy-on-Write; it also lets
# unrecognised answers through as strings that only fail inside predict().
sex_codes = {"male": 1, "female": 0}
smoker_codes = {"yes": 1, "no": 0}
if gender not in sex_codes:
    raise ValueError(f"Gender must be one of {sorted(sex_codes)}, got {gender!r}")
if smoker not in smoker_codes:
    raise ValueError(f"Smoker must be one of {sorted(smoker_codes)}, got {smoker!r}")

# Single-row frame holding the five model features.
new_data = pd.DataFrame(
    {
        "age": age,
        "sex": sex_codes[gender],
        "bmi": bmi,
        "children": children,
        "smoker": smoker_codes[smoker],
    },
    index = [0],
)

# Apply the already-fitted scaler to the continuous columns. Whole-column
# assignment replaces the integer age/children columns with the scaled floats
# instead of trying to store floats inside int columns via .loc.
numeric_cols = ["age", "bmi", "children"]
new_data[numeric_cols] = sc.transform(new_data[numeric_cols])
predict_value = gradient_model.predict(new_data)

# The model predicts the fourth root of expenses; raise to the 4th power to
# report the value on the original scale.
print("________________________________________________")
print("Predicted Value :", round((predict_value[0]) ** (4), 2))
print("________________________________________________")

Enter Age :  32
Type your Gender:  male
Enter your BMI: 28.9
Enter number of children : 0
Are you a smoker ? no
In which region you belong: northeast


________________________________________________
Predicted Value : 4152.99
________________________________________________


In [102]:
import pickle

# Serialise the fitted final model to disk in the working directory.
# NOTE(review): only the estimator is written. The StandardScaler `sc` that the
# prediction cell applies to age/bmi/children before predict() is not saved, so
# whoever loads this file needs to obtain the scaler separately.
with open("insurance_gradient_model.pickle", mode = "wb") as model_file:
    pickle.dump(gradient_model, model_file)

In [84]:
# Display the current contents of df (pandas abbreviates long frames in the
# rendered output, showing only the first and last rows).
df

Unnamed: 0,age,sex,bmi,children,smoker,expenses
0,19,0,27.9,0,1,16884.92
1,18,1,33.8,1,0,1725.55
2,28,1,33.0,3,0,4449.46
3,33,1,22.7,0,0,21984.47
4,32,1,28.9,0,0,3866.86
...,...,...,...,...,...,...
1332,50,1,31.0,3,0,10600.55
1333,18,0,31.9,0,0,2205.98
1334,18,0,36.9,0,0,1629.83
1335,21,0,25.8,0,0,2007.95
