# Factors affecting medical charges

Hey Everyone !! Let's explore the factors affecting medical charges and see if we find something interesting. 


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

Reading the dataset first and printing the first five rows

In [None]:
# Load the insurance dataset from the Kaggle input directory.
DATASET_PATH = "/kaggle/input/insurance/insurance.csv"
data = pd.read_csv(DATASET_PATH)

# Preview the first rows of the frame (the notebook renders this expression).
data.head()

Checking whether there are any null values in the CSV

In [None]:
# Count the missing entries in every column; a column of zeros means the
# dataset is complete.
null_counts = data.isnull().sum()
null_counts

So the dataset doesn't contain any null values. Now let's check the content of each column for outliers and datatypes. 

In [None]:
# Print how often each distinct value occurs, one column at a time, to get a
# feel for the value ranges and spot unexpected entries.
for col in data.columns:
    value_frequencies = data[col].value_counts()
    print(value_frequencies)

I am going to map the columns that contain string data to integers so that it will be easy to use regression models on the dataset. 

In [None]:
# Integer codes for the three string-valued columns.
smoker = {"yes": 1, "no": 0}
region = {"southwest": 1, "southeast": 2, "northeast": 3, "northwest": 4}
gender = {"male": 1, "female": 2}

# Replace each string column in place with its integer code.
for column_name, codes in (("smoker", smoker), ("region", region), ("sex", gender)):
    data[column_name] = data[column_name].map(codes)

I am also defining some colors which I am going to use in the graphs.

In [None]:
# Shared palette of green shades, indexed by position in the plots below.
colors = [
    "#31DFA2",
    "#58D68D",
    "#49c99e",
    "#45B39D",
    "#138D75",
    "#167856",
]

Next we will plot a correlation heatmap to find interdependent features. We will use Spearman's rank correlation, which captures monotonic relationships (including non-linear ones) that Pearson's linear correlation can miss.

In [None]:
# Pairwise Spearman (rank) correlation between all columns.
spearman_corr = data.corr(method='spearman')

# sns.heatmap returns the Axes it drew on, so the title can be set on it
# directly.
spearman_heatmap = sns.heatmap(spearman_corr)
spearman_heatmap.set_title("Spearman Non-Linear Correlation")

## Age & Smoker vs Charges ##  

From the above graph, we can see that age and smoker have a strong correlation with charges. Let's examine this by finding the minimum, maximum and average charges for each age.

In [None]:
# Plot the distribution of the age column.
age_values = data['age']
sns.distplot(age_values, color=colors[0])

In [None]:
# Summarise charges (min, max, mean) for every distinct age, separately for
# smokers and non-smokers, then draw the six statistics as bar charts.

# distinct values of the age column
age_slot = data['age'].unique()


def _charge_stats(charges):
    """Return [min, max, mean] of a Series of charges.

    For an empty Series (no rows for that age/group) pandas returns NaN for
    each statistic, so the corresponding bar is simply not drawn.
    """
    return [charges.min(), charges.max(), charges.mean()]


# Split the frame once per group instead of re-filtering it for every
# statistic; `smoker` was encoded earlier as 1 (yes) / 0 (no).
smoker_rows = data[data['smoker'] == 1]
nonsmoker_rows = data[data['smoker'] == 0]

# two dictionaries, {age: [min, max, mean]}: one for smokers, one for
# non-smokers
smoker = {}
nonsmoker = {}

for slot in age_slot:
    smoker[slot] = _charge_stats(
        smoker_rows[smoker_rows['age'] == slot]["charges"])
    nonsmoker[slot] = _charge_stats(
        nonsmoker_rows[nonsmoker_rows['age'] == slot]["charges"])

# plotting graphs for both: 3 statistics x 2 groups = 6 panels
fig, axs = plt.subplots(3, 2)
fig.set_size_inches(20, 15)

# One grid row per statistic; the row index is also the position of that
# statistic inside each [min, max, mean] list.
stat_panels = (('min', colors[0]), ('max', colors[2]), ('mean', colors[3]))
# One grid column per group.
group_panels = (('Smoker', smoker), ('Non-Smoker', nonsmoker))
last_row = len(stat_panels) - 1

for row_index, (stat_name, bar_color) in enumerate(stat_panels):
    for col_index, (group_name, stats_by_age) in enumerate(group_panels):
        ax = axs[row_index][col_index]
        ax.bar(
            list(stats_by_age.keys()),
            [stats[row_index] for stats in stats_by_age.values()],
            color=bar_color,
        )
        ax.set_ylabel('Charges', fontsize=14)
        # The title is built from the group actually plotted, so the
        # smoker-mean panel is no longer mislabelled as 'Non-Smoker'.
        ax.set_title(group_name + ' ' + stat_name + ' Charge', fontsize=16)
        if row_index == last_row:
            # only the bottom row carries the x-axis label
            ax.set_xlabel('Age', fontsize=14)

fig.suptitle('Smoker vs Non-Smoker', fontsize=20)
plt.show()


**Conclusion :** As we can see, charges for smokers are much higher than for non-smokers.

***

Apart from age and smoker, let's examine other columns too.

## Body Mass Index (BMI) vs Charges

Following data shows how weight categories are distributed over BMI :

1. BMI Less than 18.5 -> **Underweight**

2. BMI between 18.5 and 24.9 -> **Healthy weight**

3. BMI between 25 and 29.9 -> **Overweight**

4. BMI of 30 or more -> **Obese**

In [None]:
# Split charges by BMI weight category and plot one distribution per
# category.
#
# The categories use half-open intervals so that every BMI value falls into
# exactly one of them: a BMI of exactly 18.5 is no longer counted as both
# underweight and healthy, and values such as 24.95 or 29.95 are no longer
# dropped from every category.
bmi = data['bmi']
charges = data['charges']

underweight = charges[bmi < 18.5]
healthy = charges[(bmi >= 18.5) & (bmi < 25)]
overweight = charges[(bmi >= 25) & (bmi < 30)]
obese = charges[bmi >= 30]

fig, axs = plt.subplots(2, 2)
fig.set_size_inches(20, 15)

# (charges, panel title, colour) in row-major panel order
bmi_panels = (
    (underweight, 'Underweight', colors[0]),
    (healthy, 'Healthy', colors[1]),
    (overweight, 'Overweight', colors[2]),
    (obese, 'Obese', colors[4]),
)

# axs.flat walks the 2x2 grid row by row, matching the panel order above.
for ax, (category_charges, panel_title, panel_color) in zip(axs.flat, bmi_panels):
    sns.distplot(category_charges, ax=ax, color=panel_color)
    ax.set_title(panel_title, fontsize=16)

fig.suptitle('BMI vs Charges', fontsize=20)
plt.show()

**Conclusion :** From the graph, it's evident that obese people pay more than the other weight categories. These people have a BMI of 30 or more.

***


## Number of children vs Charges


In [None]:
# One distribution plot of charges per child count, 0 through 5, laid out on
# a 3x2 grid (six panels, one colour from `colors` each).
fig, axs = plt.subplots(3, 2)
fig.set_size_inches(20, 15)

num_child = 0

# axs.flat visits the panels row by row: [0][0], [0][1], [1][0], ...
for ax in axs.flat:
    rows_for_count = data[data['children'] == num_child]
    sns.distplot(rows_for_count["charges"], ax=ax, color=colors[num_child])
    ax.set_title('Number of children : ' + str(num_child), fontsize=16)
    num_child += 1

fig.suptitle('Number of Children vs Charges', fontsize=20)
plt.show()

The distribution shows that people with 5 children pay less than other people. Let's examine it further.

In [None]:
# Show every row whose `children` value is 5.
has_five_children = data['children'] == 5
data[has_five_children]

Big question .. How the hell a person can have 5 children at the age of 19 and 20 .. 

# 🤔🤨 # 

??? Guess it's a data entry error.

# Region vs Charges

In [None]:
# One distribution plot of charges per region code (1-4) on a 2x2 grid.

# Display names ordered by region code: code k is titled region_string[k-1].
region_string = ['Southwest', 'Southeast', 'Northeast', 'Northwest']

fig, axs = plt.subplots(2, 2)
fig.set_size_inches(20, 15)

region = 1

# axs.flat visits the panels row by row: [0][0], [0][1], [1][0], [1][1]
for ax in axs.flat:
    rows_in_region = data[data['region'] == region]
    sns.distplot(rows_in_region["charges"], ax=ax, color=colors[region])
    ax.set_title('Region : ' + str(region_string[region - 1]), fontsize=16)
    region += 1

fig.suptitle('Region vs Charges', fontsize=20)
plt.show()

**Conclusion :** All the regions have almost the same charges.

# Selecting regression model for prediction

In [None]:
# importing all the necessary libraries

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_classification

Separating the independent attributes from the dependent (target) attribute 

In [None]:
# Separate the features from the target column.
X = data.drop(['charges'], axis = 1) # independent attributes
Y = data.charges # target or dependent attribute

# Fix the shuffle seed so the split, and therefore every score computed from
# it, is reproducible across runs; this matches the seed used for the
# polynomial-features split later in the notebook.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0)

Trying all the models to determine the best regression model.

In [None]:
# Fit three regressors on the training split; `fit` returns the estimator
# itself, so construction and fitting can be chained.
linear = LinearRegression().fit(x_train, y_train)
decision_tree = DecisionTreeRegressor(random_state=0).fit(x_train, y_train)
# the forest is limited to shallow trees (max_depth=2)
random_forest = RandomForestRegressor(max_depth=2, random_state=0).fit(x_train, y_train)

# Report each model's score on the held-out test split.
fitted_models = (
    ("Linear Regressor Score : ", linear),
    ("Decision Tree Regressor Score : ", decision_tree),
    ("Random Forest Regressor : ", random_forest),
)
for label, model in fitted_models:
    print(label, model.score(x_test, y_test))


Adding polynomial features to increase the accuracy of the linear model

In [None]:
# Expand the features with all degree-2 polynomial terms, then fit a plain
# linear regression on the expanded matrix.
quad = PolynomialFeatures(degree=2)
x_quad = quad.fit_transform(X)

# Seeded split of the expanded features.
X_train, X_test, Y_train, Y_test = train_test_split(x_quad, Y, random_state=0)

poly_linear = LinearRegression()
poly_linear.fit(X_train, Y_train)

poly_score = poly_linear.score(X_test, Y_test)
print("Polynomial Linear Regressor : ", poly_score)

Seems like polynomial features with linear regression give the best score: an R² of about 0.8849 (88.49 %). Thank you !! 

# 😀😁