In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import warnings

# Set up for Jupyter Notebook
%matplotlib inline

# Global plotting style and warning policy for the notebook
sns.set_style("dark")  # Seaborn "dark" axes style
warnings.filterwarnings('ignore')  # NOTE: hides every warning category for the rest of the session

# Named seaborn palettes, bound once so later plotting cells can reuse them
dark, bright, deep, pastel = (
    sns.color_palette(palette_name)
    for palette_name in ('dark', 'bright', 'deep', 'pastel')
)


In [None]:
## Load the dataset
# Reads 'insurance.csv' from the current working directory into `df`
# and previews the first rows.
df = pd.read_csv('insurance.csv')
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Per-column dtype and non-null count
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max)
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Histograms (20 bins each) of the frame's columns; the trailing ';' suppresses the returned-object repr
df.hist(bins=20,figsize=(20,10));

In [None]:
# Names of the columns stored with the generic object dtype ('O');
# these are the columns treated as categorical in the rest of the notebook.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'O']
categorical_columns

In [None]:
# One count plot per categorical column, with each bar labelled by its
# share of all rows.
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(col)

    n_rows = len(df[col])  # denominator for the percentage labels
    for bar in ax.patches:
        bar_height = bar.get_height()
        label = f'{100 * bar_height / n_rows:.1f}%'
        bar_center = bar.get_x() + bar.get_width() / 2
        # Anchor the bottom of the label at the top of the bar
        ax.annotate(label, (bar_center, bar_height), ha='center', va='bottom')

    plt.show()


In [None]:
# Column names, for reference while writing the plots that follow
df.columns

In [None]:
# Mean charges for each age value, drawn with the default pandas line plot
df.groupby('age')['charges'].mean().plot()

In [None]:
## Age vs BMI
# BMI against age, one line per sex, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(17, 7))
sns.lineplot(data=df, x="age", y="bmi", hue="sex", palette='dark', ax=ax)
ax.set_title("Body mass index with the Age")
plt.show()

In [None]:
# Average charges per sex, shown as one bar per group
temp = df.groupby('sex')['charges'].mean()
temp.plot.bar(color=['pink', 'brown'])

In [None]:
# Charges by smoker status, with bars split by sex
sns.barplot(data=df,x='smoker',y='charges',hue='sex')

In [None]:
# Scatter of charges against BMI on a wide canvas
fig, ax = plt.subplots(figsize=(15, 8))
sns.scatterplot(data=df, x='bmi', y='charges', ax=ax)

In [None]:
# Hand-picked hex colours for the boxen plot
custom_palette = [
    '#6a3d9a', '#ff7f00', '#b15928',
    '#e31a1c', '#33a02c', '#1f78b4',
]

# Boxen plot of charges for each number of children
sns.catplot(
    data=df,
    x="children",
    y="charges",
    kind="boxen",
    height=10,
    palette=custom_palette,
)
plt.show()


In [None]:
# Hand-picked hex colours for the swarm plot
custom_palette = [
    '#4a7ba9', '#55a868', '#c44e52',
    '#8172b3', '#ccb974', '#64b5cd',
]

# Swarm plot of charges for each number of children
sns.catplot(
    data=df,
    x="children",
    y="charges",
    kind="swarm",
    height=10,
    palette=custom_palette,
)
plt.show()


In [None]:
# Number of rows per region
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=df, x='region', ax=ax)

In [None]:
# Charges per region as a seaborn bar plot
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=df, x='region', y='charges', ax=ax)

In [None]:
# Charges per region, with bars split by smoker status
fig, ax = plt.subplots(figsize=(12, 5))
sns.barplot(data=df, x='region', y='charges', hue='smoker', ax=ax)

In [None]:
# Count, per region, the rows whose age is above 50 and draw horizontal bars
over_50 = df[df['age'] > 50]
region_counts = over_50['region'].value_counts()
ax = region_counts.plot.barh(color=pastel, figsize=(10, 8))
ax.set_title('Regions With Oldest People')

In [None]:
# Pairwise relationships between the columns of df.
# NOTE(review): no `hue` is given, so `palette` probably has no visible effect here - confirm against the seaborn docs.
sns.pairplot(df,palette='pastel')

In [None]:
## Making a Group
# Mean charges for every (sex, smoker, region) combination, rounded to 2 decimals
group_keys = ["sex", "smoker", "region"]
a = df.groupby(group_keys)["charges"].mean().round(2)
a

In [None]:
# Bar chart of the grouped mean charges held in `a`
a.plot(kind="bar", figsize=(20,7),color=pastel)

In [None]:
# Column names again, as a reminder before building the interactive scatter
df.columns

In [None]:
# Interactive Plotly scatter of charges against BMI.
# No matplotlib figure is created here: Plotly renders its own figure, so a
# preceding plt.figure() call would only emit an extra empty canvas.
# NOTE(review): marker size is driven by `children`; rows with 0 children may
# be drawn with zero-size markers - confirm the points are visible.
px.scatter(data_frame=df,
           x='bmi',
           y='charges',
           color="sex",
           size="children",
           symbol='smoker',
           hover_name='region',
           text='age',
           title='Group Information Of Insurance Data On Different Scatter Points')

In [None]:
## Handling Categorical Data
# Columns that will be integer-encoded in the next cell
categorical_columns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column of df in place.
# A separate fitted encoder is kept per column in `label_encoders`, so each
# column's category <-> code mapping (encoder.classes_) stays available for
# decoding or for encoding new inputs. Refitting a single shared encoder would
# keep only the mapping of the last column processed.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [None]:
# Preview the frame after the categorical columns were integer-encoded
df.head()

In [None]:
## Data splitting
# Features: every column except the target.
X = df.drop(columns='charges')
# Target: natural log of charges, so models below are fit and scored on the log scale.
y = np.log(df['charges'])

In [None]:
# Display the feature matrix
X

In [None]:
# First values of the log-transformed target
y.head()

In [None]:
## Train Test Split
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# (rows, columns) of the training features
X_train.shape


In [None]:
# (rows, columns) of the held-out test features
X_test.shape


In [None]:
from sklearn.metrics import mean_absolute_error, r2_score


In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression fit on the log-transformed target
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# .score() is the estimator's default regression score on each split
train_score = linreg.score(X_train, y_train)
test_score = linreg.score(X_test, y_test)
print("Score the X-train with Y-train is : ", train_score)
print("Score the X-test  with Y-test  is : ", test_score)

# y is log(charges), so this MAE is expressed in log units
y_pred = linreg.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, y_pred))

In [None]:
from sklearn.linear_model import Ridge

# Ridge regression with the library's default settings
ridge = Ridge()
ridge.fit(X_train, y_train)

train_score = ridge.score(X_train, y_train)
test_score = ridge.score(X_test, y_test)
print("Score the X-train with Y-train is : ", train_score)
print("Score the X-test  with Y-test  is : ", test_score)

# y is log(charges), so this MAE is expressed in log units
y_pred = ridge.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the seed makes the fit reproducible
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)

train_score = rfr.score(X_train, y_train)
test_score = rfr.score(X_test, y_test)
print("Score the X-train with Y-train is : ", train_score)
print("Score the X-test  with Y-test  is : ", test_score)

# y is log(charges), so this MAE is expressed in log units
y_pred = rfr.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, y_pred))

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees ensemble with 100 trees; the seed makes the fit reproducible
etr = ExtraTreesRegressor(n_estimators=100, random_state=42)
etr.fit(X_train, y_train)

train_score = etr.score(X_train, y_train)
test_score = etr.score(X_test, y_test)
print("Score the X-train with Y-train is : ", train_score)
print("Score the X-test  with Y-test  is : ", test_score)

# y is log(charges), so this MAE is expressed in log units
y_pred = etr.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, y_pred))

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost regressor with default settings and a fixed seed
abr = AdaBoostRegressor(random_state=42)
abr.fit(X_train, y_train)

train_score = abr.score(X_train, y_train)
test_score = abr.score(X_test, y_test)
print("Score the X-train with Y-train is : ", train_score)
print("Score the X-test  with Y-test  is : ", test_score)

# y is log(charges), so this MAE is expressed in log units
y_pred = abr.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, y_pred))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default settings and a fixed seed; `gbr` is reused by
# the pickling and grid-search cells further down.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

train_score = gbr.score(X_train, y_train)
test_score = gbr.score(X_test, y_test)
print("Score the X-train with Y-train is : ", train_score)
print("Score the X-test  with Y-test  is : ", test_score)

# y is log(charges), so this MAE is expressed in log units
y_pred = gbr.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, y_pred))

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with default settings and a fixed seed
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)

train_score = xgb.score(X_train, y_train)
test_score = xgb.score(X_test, y_test)
print("Score the X-train with Y-train is : ", train_score)
print("Score the X-test  with Y-test  is : ", test_score)

# y is log(charges), so this MAE is expressed in log units
y_pred = xgb.predict(X_test)
print("MAE: " , mean_absolute_error(y_test, y_pred))

In [None]:
import pickle

# Persist the fitted gradient-boosting model. The context manager guarantees
# the file is flushed and closed as soon as the dump finishes, instead of
# leaving an anonymous open() handle for the garbage collector.
with open('gbr-model', 'wb') as model_file:
    pickle.dump(gbr, model_file)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search grid for the gradient-boosting regressor. Only values the estimator
# accepts are listed: scikit-learn requires subsample in (0, 1], an integer
# min_samples_split >= 2 and an integer min_samples_leaf >= 1 (fractions are
# also allowed, but 1.5 is out of range). The former entries subsample=2,
# min_samples_split=1 and min_samples_leaf=1.5 could therefore only produce
# failed fits, whose warnings are hidden by the notebook's global warnings
# filter. The repeated learning rate (0.10 == 0.1) and the duplicated
# 'max_depth' key are dropped as well, so no candidate is fitted twice.
param_grid = {
    'learning_rate': [0.5, 0.1, 0.01],
    'n_estimators': [25, 50, 75, 100, 125],
    'max_depth': [5, 7, 9, 10],
    'subsample': [1.0],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
}

# 3-fold cross-validated exhaustive search; GridSearchCV refits the best
# candidate on the whole training split by default.
g_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    cv=3,
    n_jobs=1,
    verbose=True,
    return_train_score=True,
)
g_search.fit(X_train, y_train);

print(g_search.best_params_)
print(g_search.score(X_test, y_test))

In [None]:
# Predict on the held-out split with the fitted grid search and report R^2 (on the log-charges scale)
y_pred = g_search.predict(X_test)
print("R2 Score Gradient Boost Regressor" ,r2_score(y_test,y_pred))

In [None]:
import pickle

# Persist the fitted grid-search object. The file is closed (and therefore
# fully flushed) before the next cell reads it back.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(g_search, model_file)

In [None]:
# Reload the pickled model and confirm it still scores the held-out split.
# pickle can execute arbitrary code while loading, so only unpickle files that
# this notebook (or another trusted source) produced.
with open("model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(X_test,y_test)

In [None]:
def _read_value(prompt, cast, allowed=None):
    """Prompt on stdin, convert the reply with ``cast`` and validate it.

    Raises ValueError if the reply cannot be converted or, when ``allowed`` is
    given, if the converted value is not one of the allowed codes.
    """
    value = cast(input(prompt))
    if allowed is not None and value not in allowed:
        raise ValueError(f"Expected one of {sorted(allowed)}, got {value!r}")
    return value


# Collect one record from the user. Categorical answers are restricted to the
# codes listed in each prompt, so an out-of-range code is rejected instead of
# being passed to the model.
age = _read_value("Enter Your Age \n", int)
sex = _read_value("What's Your Gender(1:Male 0:Female) \n", int, allowed={0, 1})
bmi = _read_value("Enter Your Body Mass Index \n", float)
children = _read_value("How Many Childrens Your Have (If None Enter 0) \n", int)
smoker = _read_value("Do You Smoke? (1: Yes 0:No) \n", int, allowed={0, 1})
region = _read_value("What's You Region(northeast:0,northwest:1,southeast:2,southwest:3) \n", int, allowed={0, 1, 2, 3})

# Build a single-row frame with the feature columns named explicitly
data = [age,sex,bmi,children,smoker,region]
new_data = pd.DataFrame([data],columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'])
prediction = loaded_model.predict(new_data)
# The models were fit on log(charges); exponentiate to return to charge units
print(np.exp(prediction[0]))