# Medical Cost Project
***
***

# Goals
***
- Draw insights about what drives medical charges
- Create a machine learning model that can effectively predict medical costs

# Setup
***

In [43]:
# establishing environment
import sklearn
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Acquire
***

In [51]:
# acquiring data from local csv
df = pd.read_csv('insurance.csv')

In [52]:
# previewing df
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [53]:
# checking for null, data types, etc.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


__The following changes will need to be made to the data__


- Scale non-target variable numerical columns
    - age
    - bmi
    - children


- Create boolean columns for categorical variables
    - sex
    - smoker
    - region
    
    
- No nulls to address


- Data types are appropriate for the operations I intend to perform on them


- Split data in train, validate, test sets

### Creating boolean columns for categorical variables

In [55]:
# creating dummy columns
df_dummies = pd.get_dummies(df, columns = ['sex', 'region' , 'smoker'], prefix = ['is','region', 'smoker'])

# dropping these columns since their information is stored in counterpart column since they are binary values in this case
# ie. only male and female in data, if patient not female, must be male
# ie. if patient is a smoker, they must not be a non-smoker
df_dummies = df_dummies.drop(columns = ['is_male', 'smoker_no'])

# renaming smoker column to match other binary column
df_dummies.rename(columns = {'smoker_yes' : 'is_smoker'}, inplace=True)

# resetting df indices otherwise merge causes extra rows to be created
df_dummies.reset_index(inplace = True, drop = True)
df.reset_index(inplace = True, drop = True)

# merging dummy df with main df
df = df.merge(df_dummies, how='inner')

df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,is_female,region_northeast,region_northwest,region_southeast,region_southwest,is_smoker
0,19,female,27.900,0,yes,southwest,16884.92400,1,0,0,0,1,1
1,18,male,33.770,1,no,southeast,1725.55230,0,0,0,1,0,0
2,28,male,33.000,3,no,southeast,4449.46200,0,0,0,1,0,0
3,33,male,22.705,0,no,northwest,21984.47061,0,0,1,0,0,0
4,32,male,28.880,0,no,northwest,3866.85520,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,50,male,30.970,3,no,northwest,10600.54830,0,0,1,0,0,0
1336,18,female,31.920,0,no,northeast,2205.98080,1,1,0,0,0,0
1337,18,female,36.850,0,no,southeast,1629.83350,1,0,0,1,0,0
1338,21,female,25.800,0,no,southwest,2007.94500,1,0,0,0,1,0


### Scale appropriate numerical columns

In [56]:
# creating scaler object
scaler = sklearn.preprocessing.MinMaxScaler()

# fitting scaler to various columns and adding scaled versions of each to DF
df['age_s'] = scaler.fit_transform(df[['age']])
df['bmi_s'] = scaler.fit_transform(df[['bmi']])
df['children_s'] = scaler.fit_transform(df[['children']])

In [57]:
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,is_female,region_northeast,region_northwest,region_southeast,region_southwest,is_smoker,age_s,bmi_s,children_s
0,19,female,27.900,0,yes,southwest,16884.92400,1,0,0,0,1,1,0.021739,0.321227,0.0
1,18,male,33.770,1,no,southeast,1725.55230,0,0,0,1,0,0,0.000000,0.479150,0.2
2,28,male,33.000,3,no,southeast,4449.46200,0,0,0,1,0,0,0.217391,0.458434,0.6
3,33,male,22.705,0,no,northwest,21984.47061,0,0,1,0,0,0,0.326087,0.181464,0.0
4,32,male,28.880,0,no,northwest,3866.85520,0,0,1,0,0,0,0.304348,0.347592,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,50,male,30.970,3,no,northwest,10600.54830,0,0,1,0,0,0,0.695652,0.403820,0.6
1336,18,female,31.920,0,no,northeast,2205.98080,1,1,0,0,0,0,0.000000,0.429379,0.0
1337,18,female,36.850,0,no,southeast,1629.83350,1,0,0,1,0,0,0.000000,0.562012,0.0
1338,21,female,25.800,0,no,southwest,2007.94500,1,0,0,0,1,0,0.065217,0.264730,0.0
