In [2]:
# Questions to analyze
# 1. Cost difference between smokers and non-smokers and its percentage increase
# 2. Total number of male and female individuals, and the percentage increase in cost between the two genders
# 3. Average age of individuals in the dataset 
# 4. Average cost for individuals with 0, 1, 2, 3, 4 or 5 children, and for all individuals with at least one child
# 5. The region having the most patients 


In [3]:
import csv

# Average insurance cost for smokers versus non-smokers, computed to
# show how much this single parameter moves the overall cost.
# Column 4 of each row is compared against 'yes' (smoker flag) and
# column 6 is summed as the cost.

smoker_count = nonsmoker_count = 0
sum_for_smokers = sum_for_nonsmokers = 0
with open('insurance.csv') as insurance_data:
    # Consume the header line so only data rows reach the csv reader.
    headings = insurance_data.readline().strip('\n').split(',')
    reader = csv.reader(insurance_data)
    for row in reader:
        # Any value other than 'yes' is treated as a non-smoker.
        if row[4] != 'yes':
            nonsmoker_count += 1
            sum_for_nonsmokers += float(row[6])
            continue
        smoker_count += 1
        sum_for_smokers += float(row[6])
    # Both averages are rounded to one decimal place; later cells reuse
    # these rounded values.
    avg_cost_for_smokers = round(sum_for_smokers / smoker_count, 1)
    avg_cost_for_nonsmokers = round(sum_for_nonsmokers / nonsmoker_count, 1)
    print(
        'The average cost for smokers is ${}, while the average cost for a non-smoker is ${}'.format(
            avg_cost_for_smokers,
            avg_cost_for_nonsmokers,
        )
    )

The average cost for smokers is $32050.2, while the average cost for a non-smoker is $8434.3


In [4]:
# Evaluation of percentage increase of insurance cost 
# due to smoking

def percentage_increase(m,f):
    """Return how much larger ``m`` is than ``f``, as a percentage of ``f``.

    Args:
        m: The value being compared.
        f: The baseline value the change is measured against.

    Returns:
        ``(m - f) / f * 100`` rounded to two decimal places. The result is
        negative when ``m`` is smaller than ``f``.

    Raises:
        ZeroDivisionError: If ``f`` is zero.
    """
    relative_change = (m - f) / f
    return round(relative_change * 100, 2)

# Rise of the smokers' average cost relative to the non-smokers' average.
percent_increase = percentage_increase(
    avg_cost_for_smokers,
    avg_cost_for_nonsmokers,
)
smoking_summary = 'The percentage increase in insurance cost for smokers in relation to non smokers is {}%'
print(smoking_summary.format(percent_increase))

The percentage increase in insurance cost for smokers in relation to non smokers is 280.0%


In [5]:
# This segment investigates the relationship between the insurance
# cost and the gender of the individual in question.
# Column 1 of each row is compared against 'female' and column 6 is
# summed as the cost.

female_count = male_count = 0
female_sum = male_sum = 0
with open('insurance.csv') as insurance_data:
    # Consume the header line so only data rows reach the csv reader.
    headers = insurance_data.readline().strip('\n').split(',')
    reader = csv.reader(insurance_data)
    for row in reader:
        # Any value other than 'female' is counted as male.
        if row[1] != 'female':
            male_count += 1
            male_sum += float(row[6])
            continue
        female_count += 1
        female_sum += float(row[6])
    average_female = female_sum / female_count
    average_male = male_sum / male_count

# Percentage by which the male average exceeds the female average.
gender_gap = percentage_increase(average_male, average_female)
print('The percentage increase in insurance cost between a female and a male is {}% increase, this would suggest that gender and many other factors will affect the total cost'.format(gender_gap))

The percentage increase in insurance cost between a female and a male is 11.04% increase, this would suggest that gender and many other factors will affect the total cost


In [6]:
# A simple one: the average age of the individuals in the dataset.
# Column 0 of each row is parsed as an integer age.

with open('insurance.csv') as insurance_data:
    # Consume the header line so only data rows reach the csv reader.
    header = insurance_data.readline()
    reader = csv.reader(insurance_data)
    ages = [int(row[0]) for row in reader]

age_sum = sum(ages)
age_count = len(ages)
# int() truncates toward zero, so the fractional part of the mean is
# dropped rather than rounded.
average_age = int(age_sum / age_count)
print('The average age of the individuals in the dataset is {} years of age'.format(average_age))

The average age of the individuals in the dataset is 39 years of age


In [7]:
# For this section I set out to find the average cost of
# insurance for those with differing numbers of children
# due to the information possibly being useful for aspiring
# parents.
#
# Result: child_count_dict maps '0'..'5' to the average cost (rounded to
# one decimal place) of the rows with that many children, plus '>' for
# the average over all rows with at least one child.

# The children values that are tallied; rows with any other value in
# column 3 are ignored.
CHILD_COUNTS = ('0', '1', '2', '3', '4', '5')
cost_totals = dict.fromkeys(CHILD_COUNTS, 0.0)
row_counts = dict.fromkeys(CHILD_COUNTS, 0)

with open('insurance.csv') as insurance_data:
    # Consume the header line so only data rows reach the csv reader.
    header = insurance_data.readline()
    reader = csv.reader(insurance_data)
    for row in reader:
        # Each row is tallied exactly once: column 3 selects the bucket,
        # column 6 is the cost added to it.
        children = row[3]
        if children in cost_totals:
            row_counts[children] += 1
            cost_totals[children] += float(row[6])

# A bucket with no rows raises ZeroDivisionError here.
child_count_dict = {
    children: round(cost_totals[children] / row_counts[children], 1)
    for children in CHILD_COUNTS
}

# Combined average for everyone with one or more children.
parent_buckets = CHILD_COUNTS[1:]
above0 = sum(cost_totals[children] for children in parent_buckets)
parent_rows = sum(row_counts[children] for children in parent_buckets)
child_count_dict['>'] = round(above0 / parent_rows, 1)
print(child_count_dict)
    

{'0': 12366.0, '1': 12731.2, '2': 15073.6, '3': 15355.3, '4': 13850.7, '5': 8786.0, '>': 13949.9}


In [21]:
# The dataset covers a number of regions; this cell sets out to find
# the region with the most individuals.
# Start every per-region tally, and the running maximum, at zero.
southwest_counter = 0
southeast_counter = 0
northwest_counter = 0
northeast_counter = 0
max_region = 0
# Filled later with one entry per region label -> row count.
counterdict = {}

def findkey(dict,value):
    """Return the first key of ``dict`` whose value equals ``value``.

    Keys are checked in the mapping's iteration order, so when several keys
    share the value the earliest one wins. Returns ``None`` when no value
    matches.
    """
    # NOTE: the parameter name shadows the built-in ``dict`` inside this
    # function; it is kept so existing keyword callers continue to work.
    return next((key for key in dict if dict[key] == value), None)

# Maps each region value expected in the CSV's 'region' column to the
# label used as a key in counterdict. The order here fixes the order of
# counterdict, and therefore which region wins a tie.
REGION_LABELS = {
    'southwest': 'Southwest',
    'southeast': 'Southeast',
    'northwest': 'Northwest',
    'northeast': 'Northeast',
}
region_rows = dict.fromkeys(REGION_LABELS, 0)
# Rows whose region is not one of the four expected values. They are
# tracked separately instead of being folded into a real region's count.
unrecognized_rows = 0

with open('insurance.csv') as insurance_data:
    reader = csv.DictReader(insurance_data)
    for data in reader:
        region = data['region']
        if region in region_rows:
            region_rows[region] += 1
        else:
            unrecognized_rows += 1

# Keep the individual counters populated for any code that reads them.
southwest_counter = region_rows['southwest']
southeast_counter = region_rows['southeast']
northwest_counter = region_rows['northwest']
northeast_counter = region_rows['northeast']
counterdict.update({REGION_LABELS[region]: rows for region, rows in region_rows.items()})

# max() returns the first key with the largest count, so ties resolve to
# the region that appears earliest in counterdict.
top_region = max(counterdict, key=counterdict.get)
max_region = counterdict[top_region]

if unrecognized_rows:
    print('Skipped {} row(s) with an unrecognized region'.format(unrecognized_rows))
print('The region with the most individuals in the dataset is the "{}" area'.format(top_region))
        

{'Southwest': 325, 'Southeast': 364, 'Northwest': 325, 'Northeast': 324}
The region with the most individuals in the dataset is the "Southeast" area
