## Import packages

In [1]:
import pandas as pd
from pandas import DataFrame

## Assign constant values

In [2]:
FILE_PATH = "./insurance.csv"
AGE_GROUPS = ["Teenager", "Young adult", "Middle-aged adult", "Senior citizen"]
SMOKER_VALUES = {"yes": True, "no": False}
BMI_GROUPS = ["Underweight", "Normal weight", "Overweight", "Obese"]

## Create a DataFrame

In [3]:
df = pd.read_csv(FILE_PATH)

## Explore the imported dataset

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Rename column names

In [5]:
df.rename(
    columns=
    {
        "age": "Age",
        "sex": "Sex",
        "bmi": "BMI",
        "children": "Children",
        "smoker": "Smoker",
        "region": "Region",
        "charges": "Charges",
    },
    inplace=True,
)

df.head()

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Region,Charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Replace the values of the "Smoker" column

In [6]:
smoker_condition = lambda smoker: SMOKER_VALUES.get(smoker)

df["Smoker"] = df["Smoker"].apply(smoker_condition)

df.head()

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Region,Charges
0,19,female,27.9,0,True,southwest,16884.924
1,18,male,33.77,1,False,southeast,1725.5523
2,28,male,33.0,3,False,southeast,4449.462
3,33,male,22.705,0,False,northwest,21984.47061
4,32,male,28.88,0,False,northwest,3866.8552


## Useful functions

In [7]:
def get_percentage(column_name: str, search_value: str) -> float:
    """Get the percentage of the specified column value.
    
    :param column_name: str: Specify the column name
    :param search_value: str: Specify the search value
    :return: Percentage for the search value, rounded to 3 decimal places
    """
    percentage = round(
        sum(df[column_name] == search_value) / df[column_name].count() * 100,
        3,
    )
    
    return percentage

In [8]:
def get_average_value_by_column(column_name: str, decimals: int = 0) -> float:
    """Get the average value of the specified column name.
    
    :param column_name: str: Specify the column name
    :param decimals: int: Specify the number of decimal places
    :return: Average value rounded to the specified decimal places
    """
    average_value_by_column = round(df[column_name].mean(), decimals)
    
    return average_value_by_column

In [9]:
def get_average_value_by_group(
        group_by_column_name: str, aggregate_by_column_name: str
) -> DataFrame:
    """Get the average value for each group.
    
    :param group_by_column_name: str: Specify the column name by which to
        group the DataFrame
    :param aggregate_by_column_name: Specify the column name by which to
        aggregate the DataFrame
    :return: 
    """
    average_value_by_group = (
        df.groupby(group_by_column_name)[aggregate_by_column_name]
        .mean()
        .round(decimals=3)
        .reset_index()
    )
    
    return average_value_by_group

## Percentage of males and females in the dataset

In [10]:
male_percentage = get_percentage(column_name="Sex", search_value="male")
female_percentage = get_percentage(column_name="Sex", search_value="female")

print(f"The percentage of males in the dataset is {male_percentage}%")
print(f"The percentage of females in the dataset is {female_percentage}%")

The percentage of males in the dataset is 50.523%
The percentage of females in the dataset is 49.477%


## Add age groups to the dataset

In [11]:
age_condition = (
    lambda age: AGE_GROUPS[0]
    if age < 20
    else AGE_GROUPS[1]
    if 20 <= age < 40
    else AGE_GROUPS[2]
    if 40 <= age < 60
    else AGE_GROUPS[3]
)
df["Age Group"] = df["Age"].apply(func=age_condition)

df.head()

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Region,Charges,Age Group
0,19,female,27.9,0,True,southwest,16884.924,Teenager
1,18,male,33.77,1,False,southeast,1725.5523,Teenager
2,28,male,33.0,3,False,southeast,4449.462,Young adult
3,33,male,22.705,0,False,northwest,21984.47061,Young adult
4,32,male,28.88,0,False,northwest,3866.8552,Young adult


## Percentage of people in each age group

In [12]:
percentages_age_groups = [
    get_percentage(column_name="Age Group", search_value=age_group)
    for age_group in AGE_GROUPS
]

for age_group, percentage_age_group in zip(AGE_GROUPS, percentages_age_groups):
    print(
        f'The percentage of "{age_group}" group in the dataset is {percentage_age_group}%'
    )

The percentage of "Teenager" group in the dataset is 10.239%
The percentage of "Young adult" group in the dataset is 40.135%
The percentage of "Middle-aged adult" group in the dataset is 41.106%
The percentage of "Senior citizen" group in the dataset is 8.52%


## Average age in the dataset

In [13]:
average_age = get_average_value_by_column(column_name="Age")

print(f"The average age in the dataset is {average_age} years")

The average age in the dataset is 39.0 years


## Add BMI groups to the dataset

In [14]:
bmi_condition = (
    lambda bmi: BMI_GROUPS[0]
    if bmi < 18.5
    else BMI_GROUPS[1]
    if 18.5 <= bmi < 25
    else BMI_GROUPS[2]
    if 25 <= bmi < 30
    else BMI_GROUPS[3]
)

df["BMI Group"] = df["BMI"].apply(bmi_condition)

df.head()

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Region,Charges,Age Group,BMI Group
0,19,female,27.9,0,True,southwest,16884.924,Teenager,Overweight
1,18,male,33.77,1,False,southeast,1725.5523,Teenager,Obese
2,28,male,33.0,3,False,southeast,4449.462,Young adult,Obese
3,33,male,22.705,0,False,northwest,21984.47061,Young adult,Normal weight
4,32,male,28.88,0,False,northwest,3866.8552,Young adult,Overweight


## Percentage of people in each BMI group

In [15]:
percentages_bmi_groups = [
    get_percentage(column_name="BMI Group", search_value=bmi_group)
    for bmi_group in BMI_GROUPS
]

for bmi_group, percentage_bmi_group in zip(BMI_GROUPS, percentages_age_groups):
    print(
        f'The percentage of "{bmi_group}" group in the dataset is {percentage_bmi_group}%'
    )

The percentage of "Underweight" group in the dataset is 10.239%
The percentage of "Normal weight" group in the dataset is 40.135%
The percentage of "Overweight" group in the dataset is 41.106%
The percentage of "Obese" group in the dataset is 8.52%


## Average BMI in the dataset

In [16]:
average_bmi = get_average_value_by_column(column_name="BMI", decimals=3)

print(f"The average BMI in the dataset is {average_bmi}")

The average BMI in the dataset is 30.663


## Percentage of people in each region

In [17]:
regions = list(df["Region"].unique())

percentages_regions = [
    get_percentage(column_name="Region", search_value=region)
    for region in regions
]

for region, percentage_region in zip(regions, percentages_regions):
    print(
        f'The percentage of "{region}" region in the dataset is {percentage_region}%'
    )

The percentage of "southwest" region in the dataset is 24.29%
The percentage of "southeast" region in the dataset is 27.205%
The percentage of "northwest" region in the dataset is 24.29%
The percentage of "northeast" region in the dataset is 24.215%


## Average charges in the dataset

In [18]:
average_charges = get_average_value_by_column(column_name="Charges", decimals=3)

print(f"The average cost of health insurance in the data set is {average_charges}")

The average cost of health insurance in the data set is 13270.422


## Average charges by sex

In [19]:
average_charges_by_sex = get_average_value_by_group(
    group_by_column_name="Sex", aggregate_by_column_name="Charges"
)

print(average_charges_by_sex)

      Sex    Charges
0  female  12569.579
1    male  13956.751


## Average charges by region

In [20]:
average_charges_by_region = get_average_value_by_group(
    group_by_column_name="Region", aggregate_by_column_name="Charges"
)

print(average_charges_by_region)

      Region    Charges
0  northeast  13406.385
1  northwest  12417.575
2  southeast  14735.411
3  southwest  12346.937


## Average charges by age group

In [21]:
average_charges_by_age_group = get_average_value_by_group(
    group_by_column_name="Age Group", aggregate_by_column_name="Charges"
)

print(average_charges_by_age_group)

           Age Group    Charges
0  Middle-aged adult  15431.974
1     Senior citizen  21248.022
2           Teenager   8407.349
3        Young adult  10603.646


## Average charges by BMI group

In [22]:
average_charges_by_bmi_group = get_average_value_by_group(
    group_by_column_name="BMI Group", aggregate_by_column_name="Charges"
)

print(average_charges_by_bmi_group)

       BMI Group    Charges
0  Normal weight  10409.338
1          Obese  15552.335
2     Overweight  10987.510
3    Underweight   8852.201
