# U.S. Medical Insurance Costs

In [3]:
import pandas as pd
import csv

### Load the file

In [4]:
# Read the insurance dataset into a DataFrame.
# NOTE(review): this is an absolute path on the author's machine; adjust before re-running elsewhere.
insurance = pd.read_csv(
    filepath_or_buffer=r"C:\Users\Mantis\Codecademy Projects\Python Portfolio Project\python-portfolio-project-starter-files\insurance.csv"
)

In [5]:
# Alternative: load the same CSV with the standard library into a dict of columns
# (column name -> list of the raw values read for that column, one entry per data row).
headers = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
insurance_dict = {header: [] for header in headers}

# newline='' is how the csv module documents opening a file for a reader, so that
# line endings (including newlines inside quoted fields) are interpreted by csv itself.
with open(r"C:\Users\Mantis\Codecademy Projects\Python Portfolio Project\python-portfolio-project-starter-files\insurance.csv", 'r', newline='') as insurance_raw:
    insurance_csv = csv.DictReader(insurance_raw)

    for row in insurance_csv:
        # Only the expected columns are copied; a missing one raises KeyError.
        for header in headers:
            insurance_dict[header].append(row[header])

### Check data types and nulls, describe the data

In [6]:
# Preview the first five rows to sanity-check the parsed columns.
insurance.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
# Print the DataFrame summary: column dtypes and non-null counts.
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Test some hypotheses

Some possible ideas for analysis are the following:

- Find out the average age of the patients in the dataset.
- Analyze where a majority of the individuals are from.
- Look at the different costs between smokers vs. non-smokers.
- Figure out what the average age is for someone who has at least one child in this dataset.

In [8]:
# Mean age across every patient in the DataFrame.
insurance.age.mean()

39.20702541106129

In [9]:
# Same average computed from the dict: convert each age to int first, then divide
# the total by the number of entries.
insurance_dict_age = [int(age) for age in insurance_dict['age']]
sum(insurance_dict_age) / len(insurance_dict_age)

39.20702541106129

In [10]:
# Number of patients per region (the region with the most patients answers the question).
insurance['region'].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [11]:
# Same per-region tally from the dict: collect the distinct regions, then pair each
# one with how many times it occurs in the column.
insurance_dict_unique = list(set(insurance_dict['region']))
dict(zip(insurance_dict_unique, map(insurance_dict['region'].count, insurance_dict_unique)))

{'southeast': 364, 'southwest': 325, 'northeast': 324, 'northwest': 325}

In [12]:
# Average charges for smokers vs. non-smokers.
# Select `charges` before aggregating: taking the mean of the whole grouped frame
# also tries to average the text columns (sex, region), which raises TypeError on
# pandas >= 2.0 and does needless work on older versions.
insurance.groupby('smoker')['charges'].mean()

smoker
no      8434.268298
yes    32050.231832
Name: charges, dtype: float64

In [13]:
# Same comparison from the dict: walk the smoker flags and charges together and keep
# a running total and count for each group. Any flag other than 'no' counts as smoker.
sum_smoker = sum_nonsmoker = 0
count_smoker = count_nonsmoker = 0

for smoker_flag, charge_raw in zip(insurance_dict['smoker'], insurance_dict['charges']):
    if smoker_flag != 'no':
        count_smoker += 1
        sum_smoker += float(charge_raw)
    else:
        count_nonsmoker += 1
        sum_nonsmoker += float(charge_raw)

print(f'smoker average charges: {sum_smoker/count_smoker}\nnonsmoker average charges: {sum_nonsmoker/count_nonsmoker}')

smoker average charges: 32050.23183153285
nonsmoker average charges: 8434.268297856199


In [90]:
# Mean age of patients who have at least one child.
insurance.query('children >= 1')['age'].mean()

39.78010471204188

In [93]:
# Same figure from the dict: add up the ages of rows with one or more children
# and divide by how many such rows there are.
sum_many = count_many = 0

for n_children, age_raw in zip(insurance_dict['children'], insurance_dict['age']):
    if int(n_children) < 1:
        continue  # no children: not part of this average
    sum_many += int(age_raw)
    count_many += 1
print(f'average age for people with at least one child is {sum_many/count_many}')

average age for people with at least one child is 39.78010471204188


### Let's create a class to perform simple analyses on the values in the dict (mean, median, and standard deviation)

In [84]:
class InsuranceAnalysis:
    """Simple descriptive statistics over a dict of columns.

    The wrapped dict maps a column name to a list of values. Values that
    ``float`` can parse are analysed as numbers; a column that cannot be parsed
    but holds exactly two distinct values is encoded as 0/1 instead.
    """

    def __init__(self, dic: dict):
        """Store the ``column name -> list of values`` mapping to analyse."""
        self.dic = dic

    def __repr__(self):
        # __repr__ must return a str (returning the dict itself raises TypeError).
        # Only the column names are shown so the repr stays short for large data.
        return f'{type(self).__name__}(columns={list(self.dic)!r})'

    @staticmethod
    def _to_numeric(raw_values):
        """Return ``raw_values`` as numbers, 0/1-encoding a two-valued text column."""
        try:
            return [float(value) for value in raw_values]
        except ValueError:
            # Sort the distinct values so the 0/1 assignment is the same on every
            # run instead of depending on set iteration order.
            categories = sorted(set(raw_values), key=str)
            if len(categories) != 2:
                raise ValueError("Cannot transfort str to int")
            first, second = categories
            print(f'{first}: 0, {second}: 1')
            return [0 if value == first else 1 for value in raw_values]

    def perform_analysis(self, column: str, analysis_type: str):
        """Compute one statistic for one column.

        Args:
            column: key of the column in the wrapped dict.
            analysis_type: one of ``'median'``, ``'mean'`` or ``'std.dev'``
                (sample standard deviation, i.e. divided by ``n - 1``).

        Returns:
            The requested statistic as a number.

        Raises:
            ValueError: if the column or analysis type is unknown, the column
                is neither numeric nor two-valued, the column is empty, or
                ``'std.dev'`` is requested for fewer than two values.
        """
        if column in self.dic:
            self.column = column
        else:
            raise ValueError("No such columns in dict")

        if analysis_type in ['median', 'mean', 'std.dev']:
            self.analysis_type = analysis_type
        else:
            raise ValueError("Unknown type of analysis")

        values = self._to_numeric(self.dic[column])
        size = len(values)

        # Fail with a clear message instead of an opaque ZeroDivisionError/IndexError.
        if size == 0:
            raise ValueError(f"Column '{column}' has no values to analyse")

        if analysis_type == 'mean':
            return sum(values) / size

        if analysis_type == 'median':
            ordered = sorted(values)
            middle = size // 2
            if size % 2 == 0:
                # Even count: average the two central values.
                return (ordered[middle - 1] + ordered[middle]) / 2
            # Odd count: the single central value.
            return ordered[middle]

        # 'std.dev': the n - 1 denominator needs at least two observations.
        if size < 2:
            raise ValueError("std.dev requires at least two values")
        avg = sum(values) / size
        return (sum([(value - avg) ** 2 for value in values]) / (size - 1)) ** 0.5

In [85]:
# Wrap the csv-based dict of columns so the statistics can be requested per column.
test = InsuranceAnalysis(dic=insurance_dict)

In [95]:
# Standard deviation of the charges column via the hand-written class.
test.perform_analysis(column='charges', analysis_type='std.dev')

12110.011236693994

In [94]:
# Cross-check: pandas' standard deviation of the same column.
insurance['charges'].std()

12110.011236693994