# U.S. Medical Insurance Costs
## Project Scoping (initial)
### Problem
    Study insurance dataset and draw conclusions about patients, thier attributes like sex, bmi, children, smoker status and the effect on insurance costs.
### Goals
    Provide a report(s) of patient and their insurance costs with different possible dimensions that provide insights into factors that affect the insurance costs and the groups and geographical area they belong.
### Data
    Data is already available as a CSV file. 
        . Import the CSV file into python data structure
        . Organize and prepare the data ready for analysis.
### Analysis
#### Some key analysis but not limited to:
    . Average age of patients.
        . all
        . male
        . female
        . smoker
        . non-smoker
        . patient with at least one child
        . region
    . Average insurance cost
        . all
        . male
        . female
        . smoker
        . non-smoker
        . patient with at least one child
        . region
    . Average BMI
    . Youngest age of smoker.
    . Oldest smoker.
    . Smoker count
        . all
        . male
        . female
        . patient with at least one child
        . region
    . Regional
        . Average insurance cost
        . Smokers count in the region
        . Patients with at least one child in the region
        . Average age
        . Total no.of children
    . Combine all the analysis in a single dictionary

In [1]:
# Read the data from CSV into a list
import csv
insurance_list  = []
with open("insurance.csv") as insurance_recs:
    insurance_list = [line for line in csv.DictReader(insurance_recs)]
# print first 10 rows to check the result
print(insurance_list[0:10])

[{'age': '19', 'sex': 'female', 'bmi': '27.9', 'children': '0', 'smoker': 'yes', 'region': 'southwest', 'charges': '16884.924'}, {'age': '18', 'sex': 'male', 'bmi': '33.77', 'children': '1', 'smoker': 'no', 'region': 'southeast', 'charges': '1725.5523'}, {'age': '28', 'sex': 'male', 'bmi': '33', 'children': '3', 'smoker': 'no', 'region': 'southeast', 'charges': '4449.462'}, {'age': '33', 'sex': 'male', 'bmi': '22.705', 'children': '0', 'smoker': 'no', 'region': 'northwest', 'charges': '21984.47061'}, {'age': '32', 'sex': 'male', 'bmi': '28.88', 'children': '0', 'smoker': 'no', 'region': 'northwest', 'charges': '3866.8552'}, {'age': '31', 'sex': 'female', 'bmi': '25.74', 'children': '0', 'smoker': 'no', 'region': 'southeast', 'charges': '3756.6216'}, {'age': '46', 'sex': 'female', 'bmi': '33.44', 'children': '1', 'smoker': 'no', 'region': 'southeast', 'charges': '8240.5896'}, {'age': '37', 'sex': 'female', 'bmi': '27.74', 'children': '3', 'smoker': 'no', 'region': 'northwest', 'charges'

Data is now in a list ready for further break-down and analysis <br>
* Build analytical/summary functions with parameters to summarize at different levels </p>
* Utilise the functions with appropriate parameters for analysis at that level

In [2]:
# Function to get average of patients based on criteria in key_value pair. 
# If key_value is blank then get average for the whole list.
# Example usage: get_average_age(insurance_list, {"sex":"male"})

def get_average_age(insurance_list, key_value=None):
    age = 0
    count = 0
    # if key_value is blank, return average age for all patients
    if not key_value:
        for insured in insurance_list:
            age += int(insured["age"])
            count += 1
    else:
        for key,value in key_value.items():
#             print(key, value)
            for insured in insurance_list:
                if key == "children":
                    if int(insured[key]) >= int(value):
                        age += int(insured["age"])
                        count += 1
                else:
                    if insured[key] == value:
                        age += int(insured["age"])
                        count += 1
    # Return only if record exists to avoid zero divide error
    if age > 0:
        return int(age /count)

# Function to get average insurance cost.     
# If key_value is blank then get average for the whole list.
# Example usage: get_average_insurance_cost(insurance_list)
def get_average_insurance_cost(insurance_list, key_value=None):
    insurance_cost = 0.0
    count = 0
    if not key_value:
        for insured in insurance_list:
            insurance_cost += float(insured["charges"])
            count += 1
    else:
        for key,value in key_value.items():
#             print(key, value)
            for insured in insurance_list:
                if key == "children":
                    if int(insured[key]) >= int(value):
                        insurance_cost += float(insured["charges"])
                        count += 1
                else:
                    if insured[key] == value:
                        insurance_cost += float(insured["charges"])
                        count += 1
    # Return only if record exists to avoid zero divide error
    if insurance_cost > 0:
        return round((insurance_cost /count), 2)    

# Function to get average BMI.     
# Example usage: get_average_BMI(insurance_list)
def get_average_insurance_BMI(insurance_list):
    bmi = 0
    for insured in insurance_list:
        bmi += float(insured.get("bmi"))
    return round(bmi /len(insurance_list), 2)  

#Function to get youngest smoker
def get_youngest_smoker(insurance_list):
    youngest_smoker = 200
    for insured in insurance_list:
        if insured["smoker"] == "yes" and int(insured["age"]) < youngest_smoker:
            youngest_smoker = int(insured["age"])
    return youngest_smoker

#Function to get oldest smoker
def get_oldest_smoker(insurance_list):
    oldest_smoker = 0
    for insured in insurance_list:
        if insured["smoker"] == "yes" and int(insured["age"]) > oldest_smoker:
            oldest_smoker = int(insured["age"])
    return oldest_smoker

# Function to get smoker count (all, male, female, patient with children)
def get_smoker_count(insurance_list, key_value=None):
    smoker_count = 0
    if not key_value:
        key = 0
    else:
        for key, value in key_value.items():
            key, value   
            
    for insured in insurance_list:
        if insured["smoker"] == "yes":
            if key == 0:
                smoker_count += 1
            elif key == "children" and int(insured[key]) >= int(value):
                smoker_count += 1
            elif insured[key] == value:
                smoker_count += 1
    return smoker_count

# Helper function to get all regions
def get_regions(insurance_list):    
    region_list = []
    for insured in insurance_list:
        if not insured["region"] in region_list:
            region_list.append(insured["region"])
    return region_list

# Helper function to get those with at least one children
# If key_value not mentioned, it gets for the entire patient population otherwise filtered as per key_value
# valid key_value pairs: {"sex":"male"}, {"sex", "female"}, {"smoker":"yes"}, {"smoker":"yes"}, {"region":<any valid region in the insurance_list>}
def get_patients_with_children(insurance_list, key_value=None):
    count = 0
    if not key_value:
        key = 0
    else:
        for key,value in key_value.items():
            key,value
        
    for insured in insurance_list:
        if key == 0:
            count += 1
        else:
            if insured[key] == value:
                count += 1
    return count

# Helper function get total count of children per region
def get_children_count(insurance_list, region):
    count = 0
    for insured in insurance_list:
        if insured["region"] == region:
#             print(insured["region"])
            count += 1
    return count

# Function to get regional level summaries in a dictionary for:
    # Average insurance cost
    # Smokers count in the region
    # Patients with at least one child in the region
    # Average age
    # Total no.of children
def get_region_summary(insurance_list, region):
    region_dict = {"region":region}
    avg_insurance_cost = get_average_insurance_cost(insurance_list, {"region":region})

    region_dict["avg_insurance_cost"] = avg_insurance_cost
    smokers_count = get_smoker_count(insurance_list, {"region":region})
    
    region_dict["smokers_count"] = smokers_count
    with_children = get_patients_with_children(insurance_list, {"region":region})

    region_dict["patients_with_children"] = with_children
    avg_age = get_average_age(insurance_list, {"region":region})

    region_dict["avg_age"] = avg_age
    total_children = get_children_count(insurance_list, region)

    region_dict["total_children"] = total_children
    return region_dict

Check average age for all, male and female

In [3]:
# average_age = get_average_age(insurance_list=insurance_list,key_value=None)    
print("Average age of all: " + str(get_average_age(insurance_list)))
print("Average age of male : " + str(get_average_age(insurance_list=insurance_list, key_value={"sex":"male"})))
print("Average age of female: " + str(get_average_age(insurance_list=insurance_list, key_value={"sex":"female"})))

Average age of all: 39
Average age of male : 38
Average age of female: 39


Test get_average_insurance_cost function by printing average insurance cost for a region

In [4]:
print("Average insurance cost for southwest region: " + str(get_average_insurance_cost(insurance_list, key_value={"region":"southwest"})))

Average insurance cost for southwest region: 12346.94


Average BMI is not a prominent indicator, hence overall average BMI is good enough

In [5]:
print("Average BMI for the entire patient population: " + str(get_average_insurance_BMI(insurance_list=insurance_list)))

Average BMI for the entire patient population: 30.66


Check the age of youngest smoker as smoking can have a significant impact on insurance costs.

In [6]:
print("Youngest smoker: " + str(get_youngest_smoker(insurance_list=insurance_list)))

Youngest smoker: 18


Smoker status alongwith age adds to the insurance cost even higher. It is an important factor too.

In [7]:
print("Oldest smoker: " + str(get_oldest_smoker(insurance_list=insurance_list)))

Oldest smoker: 64


Overall insurance premium costs can vary depending on the no of smokers

In [8]:
print("Smoker count for all patients: " + str(get_smoker_count(insurance_list=insurance_list)))

Smoker count for all patients: 274


Test get_children_count function that will be used in Region level summaries

In [9]:
print(get_children_count(insurance_list, "southeast"))

364


The major contributors to insurance costs are:
* smoker status
* age
* dependent children. 

Only these factors that are included at regional level. The others can be obtained at country level. 

In [10]:
region_list = get_regions(insurance_list)
for region in region_list:
    print(get_region_summary(insurance_list, region))

{'region': 'southwest', 'avg_insurance_cost': 12346.94, 'smokers_count': 58, 'patients_with_children': 325, 'avg_age': 39, 'total_children': 325}
{'region': 'southeast', 'avg_insurance_cost': 14735.41, 'smokers_count': 91, 'patients_with_children': 364, 'avg_age': 38, 'total_children': 364}
{'region': 'northwest', 'avg_insurance_cost': 12417.58, 'smokers_count': 58, 'patients_with_children': 325, 'avg_age': 39, 'total_children': 325}
{'region': 'northeast', 'avg_insurance_cost': 13406.38, 'smokers_count': 67, 'patients_with_children': 324, 'avg_age': 39, 'total_children': 324}


Combine all the analysis in a dictionary so that overall country wide summaries and regional level summaries are accessible:
* The format will be as:
    * {Countrywide: [{Average age of patients}, {Average insurance cost},{Average_BMI}, {Youngest smoker}, {Oldest Smoker}, {Smokers count}], Regionwide: [{Regional summaries}]}

In [11]:
# create average age dictionary
average_age_dict = {"avg_age": {"all": get_average_age(insurance_list), 
                                "male":get_average_age(insurance_list=insurance_list, key_value={"sex":"male"}),
                                "female":get_average_age(insurance_list=insurance_list, key_value={"sex":"female"}),
                                "smoker":get_average_age(insurance_list=insurance_list, key_value={"smoker":"yes"}),
                                "non-smoker":get_average_age(insurance_list=insurance_list, key_value={"smoker":"no"}),
                                "with_children":get_average_age(insurance_list=insurance_list, key_value={"children":"1"})
                                   }}
# print(average_age_dict)

# create average insurance cost dictionary
avg_ins_cost_dic = {"avg_ins_cost":{"all":get_average_insurance_cost(insurance_list),
                                    "male":get_average_insurance_cost(insurance_list, {"sex":"male"}),
                                    "female":get_average_insurance_cost(insurance_list, {"sex":"female"}),
                                    "smoker":get_average_insurance_cost(insurance_list, {"smoker":"yes"}),
                                    "non-smoker":get_average_insurance_cost(insurance_list, {"smoker":"no"}),
                                    "with_children":get_average_insurance_cost(insurance_list, {"children":"1"})
                                               }}
# print(avg_ins_cost_dic)

# create avg_bmi dict
avg_bmi_dict = {"avg_bmi":get_average_insurance_BMI(insurance_list)}
# print(avg_bmi_dict)

# youngest smoker dict
youngest_smoker_dict = {"youngest_smoker":get_youngest_smoker(insurance_list)}
# print(youngest_smoker_dict)

# oldest smoker dict
oldest_smoker_dict = {"oldest_smoker":get_oldest_smoker(insurance_list)}
# print(oldest_smoker_dict)

# smokers count dict
smokers_count = {"smokers":get_smoker_count(insurance_list)}
# print(smokers_count)

# create a list of all the values for country level
country_summary_list = [average_age_dict, avg_ins_cost_dic, avg_bmi_dict, youngest_smoker_dict, oldest_smoker_dict, smokers_count]
# print(country_summary_list)

# create a list of all the values for regional level
region_summary_list = [get_region_summary(insurance_list, region) for region in (get_regions(insurance_list))]
# print(region_summary_list)

# Finally create master summary dictionary that holds country and regional level summaries
final_dict = {"Countrywide":country_summary_list, "Regionwide":region_summary_list}
print(final_dict)

{'Countrywide': [{'avg_age': {'all': 39, 'male': 38, 'female': 39, 'smoker': 38, 'non-smoker': 39, 'with_children': 39}}, {'avg_ins_cost': {'all': 13270.42, 'male': 13956.75, 'female': 12569.58, 'smoker': 32050.23, 'non-smoker': 8434.27, 'with_children': 13949.94}}, {'avg_bmi': 30.66}, {'youngest_smoker': 18}, {'oldest_smoker': 64}, {'smokers': 274}], 'Regionwide': [{'region': 'southwest', 'avg_insurance_cost': 12346.94, 'smokers_count': 58, 'patients_with_children': 325, 'avg_age': 39, 'total_children': 325}, {'region': 'southeast', 'avg_insurance_cost': 14735.41, 'smokers_count': 91, 'patients_with_children': 364, 'avg_age': 38, 'total_children': 364}, {'region': 'northwest', 'avg_insurance_cost': 12417.58, 'smokers_count': 58, 'patients_with_children': 325, 'avg_age': 39, 'total_children': 325}, {'region': 'northeast', 'avg_insurance_cost': 13406.38, 'smokers_count': 67, 'patients_with_children': 324, 'avg_age': 39, 'total_children': 324}]}
