In [1]:
## Import relevant libraries for data processing & visualisation

import numpy as np              # linear algebra
import pandas as pd             # data processing, dataset file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization & graphical plotting
import seaborn as sns           # to visualize random distributions
import plotly.express as px
from datetime import datetime

%matplotlib inline

import warnings                 # to deal with warning messages
warnings.filterwarnings('ignore')

In [4]:
# Load the raw insurance data. The path is relative, so it is resolved against
# the current working directory; a comma is already read_csv's default delimiter.
df_insurance_ini = pd.read_csv("insurance_data_kaggle_palooza.csv")

In [5]:
# Peek at the first five rows of the raw frame.
df_insurance_ini.head(n=5)

Unnamed: 0,index,PatientID,age,gender,bmi,bloodpressure,diabetic,children,smoker,region,claim
0,0,1,39.0,male,23.2,91,Yes,0,No,southeast,1121.87
1,1,2,24.0,male,30.1,87,No,0,No,southeast,1131.51
2,2,3,,male,33.3,82,Yes,0,No,southeast,1135.94
3,3,4,,male,33.7,80,No,0,No,northwest,1136.4
4,4,5,,male,34.1,100,No,0,No,northwest,1137.01


In [21]:
# List the column labels of the raw frame.
df_insurance_ini.columns

Index(['index', 'PatientID', 'age', 'gender', 'bmi', 'bloodpressure',
       'diabetic', 'children', 'smoker', 'region', 'claim'],
      dtype='object')

In [6]:
# Number of missing entries in each column of the raw frame.
df_insurance_ini.isnull().sum()

index            0
PatientID        0
age              5
gender           0
bmi              0
bloodpressure    0
diabetic         0
children         0
smoker           0
region           3
claim            0
dtype: int64

In [8]:
# Summarise dtypes, non-null counts and memory usage of the raw frame.
df_insurance_ini.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          1340 non-null   int64  
 1   PatientID      1340 non-null   int64  
 2   age            1335 non-null   float64
 3   gender         1340 non-null   object 
 4   bmi            1340 non-null   float64
 5   bloodpressure  1340 non-null   int64  
 6   diabetic       1340 non-null   object 
 7   children       1340 non-null   int64  
 8   smoker         1340 non-null   object 
 9   region         1337 non-null   object 
 10  claim          1340 non-null   float64
dtypes: float64(3), int64(4), object(4)
memory usage: 115.3+ KB


In [7]:
# Summary statistics for the numeric columns of the raw frame.
# NOTE(review): index and PatientID look like row identifiers, so their
# statistics are probably not meaningful — confirm before interpreting them.
df_insurance_ini.describe()

Unnamed: 0,index,PatientID,age,bmi,bloodpressure,children,claim
count,1340.0,1340.0,1335.0,1340.0,1340.0,1340.0,1340.0
mean,669.5,670.5,38.078652,30.668955,94.157463,1.093284,13252.745642
std,386.968991,386.968991,11.102924,6.106735,11.434712,1.205334,12109.609288
min,0.0,1.0,18.0,16.0,80.0,0.0,1121.87
25%,334.75,335.75,29.0,26.275,86.0,0.0,4719.685
50%,669.5,670.5,38.0,30.4,92.0,1.0,9369.615
75%,1004.25,1005.25,47.0,34.7,99.0,2.0,16604.305
max,1339.0,1340.0,60.0,53.1,140.0,5.0,63770.43


In [10]:
# Report the (rows, columns) tuple of the raw frame.
raw_shape = df_insurance_ini.shape
print("The number of rows and number of columns are ", raw_shape)

The number of rows and number of columns are  (1340, 11)


In [11]:
## Show the distinct labels of every object-typed (categorical) column
object_columns = [
    name
    for name in df_insurance_ini.columns
    if df_insurance_ini[name].dtype == 'object'
]
for name in object_columns:
    # A blank line followed by the column name, then the array of its unique values.
    print(f"\n{name}")
    print(df_insurance_ini[name].unique())


gender
['male' 'female']

diabetic
['Yes' 'No']

smoker
['No' 'Yes']

region
['southeast' 'northwest' nan 'southwest' 'northeast']


## Data Wrangling

In [14]:
# Work on a copy of the raw frame without the "index" column.
df = df_insurance_ini.drop("index", axis="columns")


In [19]:
# Remove every row that contains at least one missing value, then confirm
# that no column has gaps left.
df.dropna(axis="index", how="any", inplace=True)
shape_after = df.shape
print(f"Shape of Data after removing missing values: {shape_after}\n")
df.isnull().sum()

Shape of Data after removing missing values: (1332, 10)



PatientID        0
age              0
gender           0
bmi              0
bloodpressure    0
diabetic         0
children         0
smoker           0
region           0
claim            0
dtype: int64

In [20]:
# Shorten the identifier column name: PatientID -> id.
df = df.rename(columns={"PatientID": "id"})

In [None]:
# Swap the generic Yes/No flags for self-describing labels.
label_maps = {
    'diabetic': {'Yes': 'diabetic', 'No': 'non-diabetic'},
    'smoker': {'Yes': 'smoker', 'No': 'non-smoker'},
}
for column, mapping in label_maps.items():
    df[column] = df[column].replace(mapping)

In [22]:
# age was loaded as float64; cast it to an integer dtype and re-check the frame.
df = df.astype({"age": int})
df.info()

## Grouping

In [30]:
# Number of rows in each region/gender combination.
# size() counts the rows of every group directly, and the result column is
# labelled "count" so it no longer shares its name with the "gender" index
# level (that clash made the table misleading and broke reset_index()).
df.groupby(["region", "gender"]).size().to_frame("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,gender
region,gender,Unnamed: 2_level_1
northeast,female,112
northeast,male,119
northwest,female,164
northwest,male,181
southeast,female,224
southeast,male,218
southwest,female,162
southwest,male,152


In [32]:
# Number of rows in each gender/diabetic combination.
# size() counts the rows of every group directly, and the result column is
# labelled "count" so it no longer shares its name with the "gender" index
# level (that clash made the table misleading and broke reset_index()).
df.groupby(["gender", "diabetic"]).size().to_frame("count")

Unnamed: 0_level_0,Unnamed: 1_level_0,gender
gender,diabetic,Unnamed: 2_level_1
female,No,339
female,Yes,323
male,No,356
male,Yes,314


## Data Visualisation

In [33]:
# Row count per region.
fig = px.histogram(data_frame=df, x="region")
fig.show()

In [37]:
# Row count per region, with side-by-side bars for each gender.
fig = px.histogram(
    data_frame=df,
    x="region",
    color="gender",
    barmode="group",
)
fig.show()

In [42]:
# Age distribution per gender, split by the smoker column.
fig = px.box(data_frame=df, x="gender", y="age", color="smoker")
fig.show()

In [43]:
# Claim distribution per region, split by gender.
fig = px.box(data_frame=df, x="region", y="claim", color="gender")
fig.show()

In [49]:
numerical_columns = ["age", "bmi", "bloodpressure", "claim"]

# Draw one histogram per listed column, with a small gap between the bars.
# update_layout returns the figure it was called on, so it can be chained.
for col in numerical_columns:
    fig = px.histogram(data_frame=df, x=col).update_layout(bargap=0.1)
    fig.show()

In [50]:
# Claim against age, coloured by the smoker column.
fig = px.scatter(data_frame=df, x="age", y="claim", color="smoker")
fig.show()

In [51]:
# Claim against BMI, coloured by gender.
fig = px.scatter(data_frame=df, x="bmi", y="claim", color="gender")
fig.show()

In [52]:
# Each region's share of the claim values.
fig = px.pie(data_frame=df, names="region", values="claim")
fig.show()