# Perform EDA on the insurance dataset below

Link: https://www.kaggle.com/datasets/thedevastator/insurance-claim-analysis-demographic-and-health

Perform all steps of EDA, document the conclusions, and then perform feature engineering.

In [4]:
import numpy as np 
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import klib
import warnings

# Silence all warnings for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation notices.
warnings.filterwarnings("ignore")

In [None]:
# Load the dataset, using the CSV's first column as the DataFrame index.
df = pd.read_csv("insurance_data.csv", index_col=0)

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Missing values per column: absolute count and share of all rows (percent, 2 dp).
nulls_df = df.isna().sum().to_frame("total_nulls")
row_total = len(df)
nulls_df["nulls_ratio(%)"] = nulls_df["total_nulls"].div(row_total).mul(100).round(2)

In [None]:
# Display the missing-value summary.
nulls_df

In [None]:
# Forward-fill missing regions from the previous row.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: the `method=` argument is deprecated, and a chained
# in-place call does not update `df` when pandas Copy-on-Write is active.
# NOTE: a missing value in the very first row has no predecessor and stays NaN.
df['region'] = df['region'].ffill()
df.head(20)

In [None]:
# Mean age per region, rounded to a whole number, as a {region: mean_age}
# dict; it is used below to impute missing ages.
temp_df = df.groupby('region')['age'].mean().round(0)
ra_dict = temp_df.to_dict()
ra_dict

In [None]:
# Fill each missing age with the rounded mean age of that row's region
# (looked up through ra_dict), then store age as integers.
# NOTE(review): the int64 cast fails if any age is still missing, e.g. for a
# row whose region is not a key of ra_dict.
df['age'] = df['age'].fillna(df['region'].map(ra_dict))
df['age'] = df.age.astype('int64')

In [None]:
# Re-check dtypes and non-null counts after the imputation steps.
df.info()

## Column Data Types

### **Numerical**: PatientID, age, bmi, bloodpressure, claim
### **Categorical**: gender, diabetic, smoker, children, region
### **Mixed**: None

## Univariate Analysis - Numerical Columns

In [None]:
# Summary statistics of the numerical columns, rounded to two decimals.
df[["PatientID","age", "bmi", "bloodpressure", "claim"]].describe().round(2)

### 1 - **age column**

In [None]:
# age distribution in three panels: histogram, KDE with a rug overlay, box plot.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    (sns.histplot, 0, {"element": 'bars', "color": "#03045e"}),
    (sns.kdeplot, 1, {"fill": True, "color": "#0077b6"}),
    (sns.rugplot, 1, {"color": 'orange'}),
    (sns.boxplot, 2, {"color": "#00b4d8"}),
]
# Draw in the listed order so the rug is layered on top of the KDE panel.
for draw, panel, options in panels:
    draw(data=df, x="age", ax=axs[panel], **options)
plt.tight_layout()
plt.show()

### 2 - **bmi column**

In [None]:
# bmi distribution in three panels: histogram, KDE with a rug overlay, box plot.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    (sns.histplot, 0, {"element": 'bars', "color": "#03045e"}),
    (sns.kdeplot, 1, {"fill": True, "color": "#0077b6"}),
    (sns.rugplot, 1, {"color": 'orange'}),
    (sns.boxplot, 2, {"color": "#00b4d8"}),
]
# Draw in the listed order so the rug is layered on top of the KDE panel.
for draw, panel, options in panels:
    draw(data=df, x="bmi", ax=axs[panel], **options)
plt.tight_layout()
plt.show()

### 3 - **bloodpressure Column**

In [None]:
# bloodpressure distribution in three panels: histogram, KDE with a rug overlay, box plot.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    (sns.histplot, 0, {"element": 'bars', "color": "#03045e"}),
    (sns.kdeplot, 1, {"fill": True, "color": "#0077b6"}),
    (sns.rugplot, 1, {"color": 'orange'}),
    (sns.boxplot, 2, {"color": "#00b4d8"}),
]
# Draw in the listed order so the rug is layered on top of the KDE panel.
for draw, panel, options in panels:
    draw(data=df, x="bloodpressure", ax=axs[panel], **options)
plt.tight_layout()
plt.show()

### 4 - **claim Column**

In [None]:
# claim distribution in three panels: histogram, KDE with a rug overlay, box plot.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
panels = [
    (sns.histplot, 0, {"element": 'bars', "color": "#03045e"}),
    (sns.kdeplot, 1, {"fill": True, "color": "#0077b6"}),
    (sns.rugplot, 1, {"color": 'orange'}),
    (sns.boxplot, 2, {"color": "#00b4d8"}),
]
# Draw in the listed order so the rug is layered on top of the KDE panel.
for draw, panel, options in panels:
    draw(data=df, x="claim", ax=axs[panel], **options)
plt.tight_layout()
plt.show()

In [None]:
# Skewness of every numerical column, collected into a one-row table (2 dp).
skew_pID, skew_age, skew_bmi, skew_bp, skew_claim = (
    df[name].skew()
    for name in ("PatientID", "age", "bmi", "bloodpressure", "claim")
)

skew_df = pd.DataFrame(
    [[skew_pID, skew_age, skew_bmi, skew_bp, skew_claim]],
    index=["DataSkew"],
    columns=['PatientID', 'age', 'bmi', 'bp', 'claim'],
).round(2)
skew_df


In [None]:
def count_outliers(column, whisker=1.5):
    """Count the values of *column* that lie outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``; a value
    is an outlier when it is strictly below the lower or strictly above the
    upper fence.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to inspect.
    whisker : float, default 1.5
        Multiple of the inter-quartile range used to place the fences.

    Returns
    -------
    pandas.Series
        ``'Outliers'`` (number of outliers) and ``'Outliers(%)'`` (that number
        as a percentage of ``len(column)``, rounded to two decimals).
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - whisker * iqr
    upper_limit = q3 + whisker * iqr

    outlier_count = ((column < lower_limit) | (column > upper_limit)).sum()
    # The denominator is the full length of the column, missing values included.
    outlier_percent = (outlier_count / len(column) * 100).round(2)

    return pd.Series({'Outliers': outlier_count, 'Outliers(%)': outlier_percent})

# Run count_outliers on every non-object column; the result has one column per
# feature and the rows 'Outliers' / 'Outliers(%)'.
numerics = df.select_dtypes(exclude=["object"])
outlier_result = numerics.apply(count_outliers)
outlier_result.reset_index()                                

## Findings
    
| Column | Skewness | Outliers | Analysis |
|:-------|---------:|:---------|:---------|
| PatientID | 0.00 | 0 | Perfectly symmetrical distribution. IDs have been assigned on the basis of claim values: the lowest claim has the lowest ID and vice versa. That is why this column is included in the univariate analysis. Later on we can work on claim amounts on the basis of the IDs. |
| age | 0.11 | 0 | Symmetric but bimodal, i.e. this column has more than one mode. This feature will need further feature engineering to categorize the ages. The skewness value shows that the data is slightly positively skewed, but 0.11 is close enough to 0 that the distribution is nearly symmetric, with no outliers. |
| bmi | 0.29 | 9<br>(0.67%) | The distribution is slightly positively skewed. The bulk of the data is spread around the mean, and the 0.67% of outliers is negligible. |
| bloodpressure | 1.48 | 62<br>(4.63%) | Moderately positively skewed data with more outliers (4.63%). This column has some high values that need to be checked for authenticity. |
| claim | 1.52 | 141<br>(10.52%) | This is our dependent/target variable. It is moderately positively skewed with 10.52% outliers, which is alarming since this column has the most outliers. Authenticity checks and outlier handling will matter a lot. |
| Outliers ||| The data has been thoroughly checked. No discrepancy or error was found in the outliers, so deleting them is not recommended; valuable information would otherwise be lost. |

## Univariate Analysis - Categorical Columns

In [None]:
# One pie chart per categorical column showing the share of each category.
colors = ["#2a9d8f", "#e9c46a", "#f4a261", "#e76f51"]
fig, axs = plt.subplots(1, 5, figsize=(15, 5))

for ax, col in zip(axs, ["gender", "diabetic", "smoker", "region", "children"]):
    shares = df[col].value_counts()
    ax.pie(shares, labels=shares.index, autopct='%1.1f%%', colors=colors)
    ax.set_title(col)
plt.tight_layout()
plt.show()

## Findings
    
| Column | Proportion | Analysis |
|:-------|:-----------|:------------|
| gender | male - 1.2% higher | mixed categories, no impact overall |
| diabetic | no-diabetic - 4.2% higher | falls under mixed categories, no prominent impact overall |
| smoker | non-smoker - 59.2% higher  | the non-smoker category is far larger than the smoker category |
| region | southeast - 33.1% | southeast and northeast have larger proportions than the other regions |
| children | no children - 43.0%| people with no children are the most common group in the data |

## Bivariate Analysis | Numerical - Numerical Columns

In [None]:
# claim against each numerical feature, points coloured by gender.
fig, axs = plt.subplots(1, 3, figsize=(15, 5))
for ax, feature in zip(axs, ("age", "bmi", "bloodpressure")):
    sns.scatterplot(data=df, x=feature, y="claim", hue="gender", ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Scatter plot with a fitted regression line of claim on bloodpressure, one fit per gender.
sns.lmplot(data = df, x = "bloodpressure", y = "claim", height = 5, aspect = 1.5, hue = "gender")
plt.show()

In [None]:
# Correlation of each numerical feature with claim, collected into a one-row table (2 dp).
age_to_claim_corr, bmi_to_claim_corr, bp_to_claim_corr = (
    df[name].corr(df["claim"]) for name in ("age", "bmi", "bloodpressure")
)
CorrData = pd.DataFrame(
    [[age_to_claim_corr, bmi_to_claim_corr, bp_to_claim_corr]],
    index=["Correlation"],
    columns=["age/claim", "bmi/claim", "bp/claim"],
).round(2)
CorrData

In [None]:
# Rows whose claim is above 15,000.
df[df.claim > 15000]

## Findings
    
| Column | correlation coefficient| Analysis |
|:-------|-----------------------:|:---------|
| age and claim | -0.03 | very weak negative correlation |
| bmi and claim | 0.2 | very weak positive correlation |
| bloodpressure and claim | 0.53 | moderate positive correlation |
||| if bloodpressure & bmi increase, the claim value tends to increase as well |
||| this finding suggests that higher bloodpressure & bmi are associated with higher claim values |
||| the lmplot further shows the linear correlation between bloodpressure and claim |
||| Most people have claim values up to 15,000. Above 15,000 there are fewer people, but their claim values are higher: 358 people out of 1340 claim more than 15,000. The claim column needs feature engineering |



## Bivariate Analysis | Numerical - Categorical Columns

In [None]:
# Summary statistics of claim for each gender.
df.groupby("gender")["claim"].describe()

In [None]:
# Summary statistics of claim for each diabetic category.
df.groupby("diabetic")["claim"].describe()

In [None]:
# Summary statistics of claim for each smoker category.
df.groupby("smoker")["claim"].describe()

In [None]:
# Summary statistics of claim for each region.
df.groupby("region")["claim"].describe()

In [None]:
# Summary statistics of claim for each number of children.
df.groupby("children")["claim"].describe()

In [None]:
# Claim density split by each categorical column; every category is normalised
# on its own (common_norm=False). The sixth, unused panel is hidden.
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
categoricals = ["gender", "diabetic", "smoker", "region", "children"]
for ax, category in zip(axs.flat, categoricals):
    sns.kdeplot(
        data=df,
        x="claim",
        hue=category,
        fill=True,
        common_norm=False,
        alpha=.5,
        linewidth=1,
        ax=ax,
    )
axs[1, 2].axis('off')
plt.tight_layout()
plt.show()

## Findings
    
| Column | Important Category | Analysis |
|:-------|:------------------------|:---------|
| claim/gender | count | males tend to have higher claim amounts on average with greater spread/variability. No special impact so far. |
| claim/diabetic || being diabetic or not does not add any significant value to the claim, but we will feature engineer this column further to analyze it category-wise |
| claim/smoker | all | smokers tend to have higher claim amounts on average with greater spread/variability and very high quartiles compared to non-smokers |
| claim/region | all| most of the claims are concentrated at lower amounts across all regions; however, the northeast region has a longer and somewhat fat tail, indicating that a high frequency of large claims is being reported there |
| claim/children || families with no or 5 children show the highest density peak at lower claim amounts, whereas families with 0 children make up a large share of the data, around 43% | 


## Bivariate Analysis | Categorical - Categorical Columns

In [None]:
# Gender x region contingency table; each cell is annotated with its count and
# its percentage within the region (column-normalised).
counts = pd.crosstab(index=df["gender"], columns=df["region"])
percentages = (
    pd.crosstab(index=df["gender"], columns=df["region"], normalize='columns')
    .mul(100)
    .round(2)
)
combined = counts.astype(str) + "\n(" + percentages.astype(str) + "%)"
cmap = sns.color_palette("rocket_r", as_cmap=True)
sns.heatmap(data=counts, annot=combined, fmt='', cmap=cmap)
plt.show()

In [None]:
# Gender x children contingency table; each cell is annotated with its count
# and its percentage within the children group (column-normalised).
counts = pd.crosstab(index=df["gender"], columns=df["children"])
percentages = (
    pd.crosstab(index=df["gender"], columns=df["children"], normalize='columns')
    .mul(100)
    .round(2)
)
combined = counts.astype(str) + "\n(" + percentages.astype(str) + "%)"
cmap = sns.color_palette("rocket_r", as_cmap=True)
sns.heatmap(data=counts, annot=combined, fmt='', cmap=cmap)
plt.show()

## Feature Engineering

### 1 - **bmi column**

In [None]:
def bmi_classification(bmi_value):
    """Return the weight-status label for a BMI value.

    The bands are half-open, so every value belongs to exactly one class:
    below 18.5 "under weight", [18.5, 25) "normal weight", [25, 30)
    "over weight", [30, 35) "moderate obesity", [35, 40) "severe obesity",
    and 40 or above "morbid obesity".

    A missing value (NaN/None) is returned unchanged instead of being labelled.
    """
    if pd.isna(bmi_value):
        return bmi_value
    # Upper bounds are exclusive so values such as 24.95 or 29.95 cannot fall
    # between two bands and be labelled "morbid obesity" by the final fallback.
    if bmi_value < 18.5:
        return "under weight"
    if bmi_value < 25:
        return "normal weight"
    if bmi_value < 30:
        return "over weight"
    if bmi_value < 35:
        return "moderate obesity"
    if bmi_value < 40:
        return "severe obesity"
    return "morbid obesity"

# Label every row with its BMI class.
df["bmi_class"] = df["bmi"].apply(bmi_classification)

### 2 - **age column**

In [None]:
# Smallest age in the data (used to pick the age bin edges).
df.age.min()

In [None]:
# Largest age in the data (used to pick the age bin edges).
df.age.max()

In [None]:
# Bin age into left-closed intervals: [18, 25) "adult", [25, 45) "middle-aged",
# [45, 65) "senior". Ages outside [18, 65) fall in no bin and become NaN.
age_range = [18,25,45,65]
labels = ["adult", "middle-aged", "senior"]

df["age_class"] = pd.cut(df["age"], age_range, labels = labels, right = False)

### 3 - **bloodpressure column**

In [None]:
# Smallest bloodpressure value in the data (used to pick the bin edges).
df.bloodpressure.min()

In [None]:
# Largest bloodpressure value in the data (used to pick the bin edges).
df.bloodpressure.max()

In [None]:
# Bin bloodpressure into left-closed intervals: [80, 120) "normal",
# [120, 130) "elevated", [130, 140) "hypertension stage 1",
# [140, 150) "hypertension stage 2". Values outside [80, 150) become NaN.
bp_range = [80,120,130,140,150]
labels = ["normal", "elevated","hypertension stage 1", "hypertension stage 2"]

df["bp_class"] = pd.cut(df["bloodpressure"], bp_range, labels = labels, right = False)

## Multivariate Analysis 

In [None]:
# Annotated correlation heatmap of the numeric columns.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Remove the PatientID column in place.
# NOTE(review): re-running this cell raises KeyError because the column is already gone.
df.drop(columns=["PatientID"], inplace=True)

In [None]:
# df["bp_pct_change"] = df.bloodpressure.pct_change()
# df["claim_pct_change"] = df.claim.pct_change()

# df['change_ratio'] = df["bp_pct_change"] / df["claim_pct_change"] 


In [None]:
# df.dropna()

In [None]:
# claim against bloodpressure as a line plot, one line per smoker category.
sns.lineplot(data = df, x='bloodpressure', y='claim', hue='smoker')

In [None]:
# claim against bloodpressure: colour encodes smoker, marker style encodes bp_class.
sns.relplot(data = df, x='bloodpressure', y='claim', hue='smoker',  style='bp_class')

In [None]:
# Subset of rows whose claim is above 30,000.
temp = df[df.claim > 30000]

In [None]:
# Same line plot as for the full data, restricted to the high-claim subset `temp`.
sns.lineplot(data = temp, x='bloodpressure', y='claim', hue='smoker')

In [None]:
# Display the high-claim subset.
temp

In [None]:
# One scatter panel of claim vs bloodpressure per bp_class, coloured by smoker.
g = sns.FacetGrid(data = df, col='bp_class', hue='smoker')
g.map(sns.scatterplot, 'bloodpressure', 'claim')
g.add_legend()
plt.tight_layout()
plt.show()

In [None]:
# Pairwise grid over the numeric features, coloured by smoker: histograms on
# the diagonal, scatter plots above it and line plots below it.
g = sns.PairGrid(data = df, hue='smoker', vars=['age','bmi', 'bloodpressure', 'claim'])
g.map_diag(sns.histplot)
g.map_upper(sns.scatterplot)
g.map_lower(sns.lineplot)
g.add_legend()
plt.tight_layout()
plt.show()

In [None]:
# Joint grid of claim vs bloodpressure coloured by smoker: lineplot on the
# joint axes, barplot on the marginal axes.
g = sns.JointGrid(data=df, x = 'bloodpressure', y='claim', hue='smoker')
g.plot(sns.lineplot, sns.barplot)

In [None]:
# Finer blood-pressure bins (left-closed, right-open). The labels are built
# from the bin edges so each category names the range it actually covers; the
# previous hard-coded labels ("14-19" ... "60-64") did not match these bins.
# Readings outside [80, 134) fall in no bin and become NaN.
bp_range = [80,117,120,121,122,123,125,127,129,131,134]
labels = [f"[{low}, {high})" for low, high in zip(bp_range, bp_range[1:])]

df["newbp_class"] = pd.cut(df["bloodpressure"], bp_range, labels = labels, right = False)

In [None]:
# claim values for each of the finer blood-pressure bins.
sns.scatterplot(data=df, x="newbp_class", y="claim")

In [None]:
# Number of rows in each age class.
df.age_class.value_counts()

In [None]:
# Gender counts among rows with bmi below 25.
df[df.bmi < 25]["gender"].value_counts()

In [None]:
# Median claim for every region/gender combination.
df.groupby(["region", "gender"])["claim"].median()