In [60]:
import kagglehub
import pandas as pd
import os

# Download the insurance dataset via kagglehub; the returned value is the
# local location of the dataset files, which later cells read from.
DATASET_HANDLE = "mirichoi0218/insurance"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\spenc\.cache\kagglehub\datasets\mirichoi0218\insurance\versions\1


In [61]:
# Load the raw insurance CSV from the downloaded dataset directory.
fileName = "insurance.csv"
csv_path = os.path.join(path, fileName)
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError as err:
    # Stop here with the actionable hint. Merely printing it would leave `df`
    # undefined and make the following cells fail with an unrelated NameError.
    # Other errors (e.g. a malformed CSV) propagate with their own message
    # instead of being mislabelled as "Data not found".
    raise FileNotFoundError(
        "Data not found, make sure to run the medical_cost_data_preprocessing.ipynb file in its entirety to retrieve the data"
    ) from err

In [62]:
# Quick overview of the raw frame: schema, summary statistics, shape, preview.
print("Dataframe info")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print("-------------------------------------")

print("Dataframe stats")
print(df.describe())

print("-------------------------------------")

print("Dataframe dimensions")
print(df.shape)

print("-------------------------------------")

print("Dataframe")
# Bare last expression so the notebook renders the frame with its rich display.
df

Dataframe info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
None
-------------------------------------
Dataframe stats
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [63]:
# Print the frequency table of each listed column under its heading,
# with a divider line between consecutive sections.
summary_columns = [
    ("Number of unique regions", 'region'),
    ("Number of unique price charges", 'charges'),
]
for position, (heading, column) in enumerate(summary_columns):
    if position:
        print("-------------------------------------")
    print(heading)
    print(df[column].value_counts())

Number of unique regions
region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64
-------------------------------------
Number of unique price charges
charges
1639.56310     2
16884.92400    1
29330.98315    1
2221.56445     1
19798.05455    1
              ..
7345.08400     1
26109.32905    1
28287.89766    1
1149.39590     1
29141.36030    1
Name: count, Length: 1337, dtype: int64


In [64]:
# Encode data!
# Work on a copy so the raw frame `df` stays untouched and this cell can be re-run.
df_encoded = df.copy()

# Encode binary features to integer 1 or 0
binary_encodings = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
}
for column, mapping in binary_encodings.items():
    # Series.map silently turns any value that is not a key of the mapping into
    # NaN, so reject unexpected categories (including missing values) up front
    # instead of letting NaNs leak into the derived features and the saved CSV.
    unexpected = set(df_encoded[column].unique()) - set(mapping)
    if unexpected:
        raise ValueError(
            f"Unexpected values in '{column}': {sorted(map(repr, unexpected))}; "
            f"expected only {sorted(mapping)}"
        )
    df_encoded[column] = df_encoded[column].map(mapping)

# Onehot encode region; drop_first=True omits the first category's column,
# which then acts as the implicit baseline.
region_onehot = pd.get_dummies(df_encoded['region'], drop_first=True).astype(int)
df_encoded = pd.concat([df_encoded, region_onehot], axis=1)
# drop the original region column
df_encoded = df_encoded.drop('region', axis=1)

# Let's add some more features that can be inferred from the data
# NOTE(review): the usual clinical obesity cut-off is BMI >= 30 (inclusive); the
# strict comparison is kept so the exported data does not change -- confirm intent.
df_encoded['is_obese'] = (df_encoded['bmi'] > 30).astype(int)
df_encoded['senior'] = (df_encoded['age'] > 55).astype(int)
df_encoded['multi_children'] = (df_encoded['children'] >= 2).astype(int)

# Non-linear age term and smoker interaction terms. Because `smoker` is 0/1,
# the interaction columns are zero for non-smokers.
df_encoded['age_squared'] = df_encoded['age']**2
df_encoded['bmi_smoker'] = df_encoded['bmi'] * df_encoded['smoker']
df_encoded['age_smoker'] = df_encoded['age'] * df_encoded['smoker']

df_encoded

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest,is_obese,senior,multi_children,age_squared,bmi_smoker,age_smoker
0,19,0,27.900,0,1,16884.92400,0,0,1,0,0,0,361,27.90,19
1,18,1,33.770,1,0,1725.55230,0,1,0,1,0,0,324,0.00,0
2,28,1,33.000,3,0,4449.46200,0,1,0,1,0,1,784,0.00,0
3,33,1,22.705,0,0,21984.47061,1,0,0,0,0,0,1089,0.00,0
4,32,1,28.880,0,0,3866.85520,1,0,0,0,0,0,1024,0.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,1,0,0,1,0,1,2500,0.00,0
1334,18,0,31.920,0,0,2205.98080,0,0,0,1,0,0,324,0.00,0
1335,18,0,36.850,0,0,1629.83350,0,1,0,1,0,0,324,0.00,0
1336,21,0,25.800,0,0,2007.94500,0,0,1,0,0,0,441,0.00,0


In [69]:
# Destination of the encoded dataset, given as a relative path.
DATA_DIRECTORY = '../Data/Medical-Cost-Data'
OUTPUT_FILE = os.path.join(DATA_DIRECTORY, 'medical_cost.csv')

# Create the target folder when it is missing; exist_ok lets the cell be re-run.
os.makedirs(name=DATA_DIRECTORY, exist_ok=True)

# Leave the row index out of the CSV; pandas assigns a fresh default index
# when the file is read back.
df_encoded.to_csv(path_or_buf=OUTPUT_FILE, index=False)