In [1]:
import pandas as pd
import numpy as np

In [2]:
from scipy.stats import norm, skew, pearsonr
from scipy import stats

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
from plotly import tools
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected= True)

In [46]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.metrics import r2_score

In [9]:
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

In [10]:
# Load the insurance dataset; the relative path means 'insurance.csv' must sit
# in the notebook's working directory.
df = pd.read_csv('insurance.csv')

In [11]:
# Sanity check: (row count, column count) of the frame just loaded.
df.shape

(1338, 7)

In [12]:
# Preview the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [13]:
# Column dtypes and non-null counts, to spot missing values and text columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [14]:
# Names of the object-dtype (text) columns, i.e. the candidates for conversion
# to categoricals.
df.select_dtypes(['object']).columns

Index(['sex', 'smoker', 'region'], dtype='object')

In [16]:
# Re-type the three text columns as pandas categoricals in a single pass.
obj_cats = ['sex', 'smoker', 'region']

df = df.astype({column_name: 'category' for column_name in obj_cats})

In [17]:
# Re-inspect dtypes to confirm the text columns are now 'category'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       1338 non-null   int64   
 1   sex       1338 non-null   category
 2   bmi       1338 non-null   float64 
 3   children  1338 non-null   int64   
 4   smoker    1338 non-null   category
 5   region    1338 non-null   category
 6   charges   1338 non-null   float64 
dtypes: category(3), float64(2), int64(2)
memory usage: 46.2 KB


In [18]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [None]:
#Plotting Histogram using Plotly

In [23]:
# Compare the raw distribution of charges with its natural-log transform,
# drawn as two stacked histograms in one figure.
charge_dist = df['charges'].values

logcharges = np.log(df['charges'])

trace0 = go.Histogram(
    x=charge_dist,
    histnorm='probability',
    name='Charge Distribution',
    marker=dict(color='#FA5858'),
)

trace1 = go.Histogram(
    x=logcharges,
    histnorm='probability',
    name='Log Charge Distribution',
    marker=dict(color='#58FA82'),
)

# subplot_titles expects one title per subplot; a bare string would be
# consumed one character per subplot instead of as a single title.
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=('Charge Distribution', 'Log Charge Distribution'),
    print_grid=False,
)

# add_trace(row=, col=) is the supported replacement for append_trace.
fig.add_trace(trace0, row=1, col=1)
fig.add_trace(trace1, row=2, col=1)

fig.update_layout(showlegend=True, title='Charges Dist', bargap=0.05)

iplot(fig, filename='Custom-sized-subplot')



In [24]:
# Create a new age-category column derived from age

In [None]:
#plotting the pie chart using age categories

In [27]:
# Bucket every customer into an age band and chart how many fall in each band.
# Bands: 18-35 -> 'Young adult', 36-55 -> 'Senior adult', over 55 -> 'Elder';
# ages below 18 are left missing.
# Building the column in one step avoids first creating a float NaN column and
# then writing strings into it (an incompatible-dtype assignment).
df['age_cat'] = pd.cut(
    df['age'],
    bins=[18, 35, 55, np.inf],
    labels=['Young adult', 'Senior adult', 'Elder'],
    include_lowest=True,  # keep age == 18 inside the first band
).astype(object)

# Read labels and counts from the SAME value_counts() result so each slice is
# paired with its own label. unique() returns order of appearance while
# value_counts() sorts by frequency, so the two are not guaranteed to line up.
age_cat_counts = df['age_cat'].value_counts()
labels = age_cat_counts.index.tolist()
amount = age_cat_counts.tolist()

colors = ['#ff9999', '#b3d9ff', '#e6ffb3']

trace = go.Pie(labels=labels,
               values=amount,
               hoverinfo='label + percent',
               textinfo='value',
               textfont=dict(size=20),
               marker=dict(colors=colors, line=dict(color='#000000', width=2)))

data = [trace]
layout = go.Layout(title='Amount by age cat')

fig = go.Figure(data=data, layout=layout)
iplot(fig, filename='Basic Pie Chart')

In [None]:
#plotting heatmap for correlation

In [29]:
# Pairwise correlation between the numeric columns, shown as a heatmap.
# The frame still holds category/object columns at this point, so restrict to
# numeric dtypes explicitly: recent pandas no longer drops non-numeric columns
# silently in DataFrame.corr() and raises instead.
corr = df.select_dtypes(include='number').corr()

hm = go.Heatmap(
    z=corr.values,
    x=corr.columns.tolist(),  # matrix columns run along the x axis
    y=corr.index.tolist(),    # matrix rows run along the y axis
)

data = [hm]
layout = go.Layout(title='Correlation Heatmap')

fig = go.Figure(data=data, layout=layout)

iplot(fig, filename='Heatmap')

In [None]:
# Use LabelEncoder to convert each categorical column to integer codes

In [31]:
# Replace 'sex' with integer codes; `le` is left bound to the encoder fitted
# on this column.
le = LabelEncoder()
df['sex'] = le.fit_transform(df['sex'])

In [35]:
# Replace 'region' with integer codes; `le` is rebound to a fresh encoder
# fitted on this column.
le = LabelEncoder()
df['region'] = le.fit_transform(df['region'])

In [36]:
# Replace 'smoker' with integer codes; `le` is rebound to a fresh encoder
# fitted on this column.
le = LabelEncoder()
df['smoker'] = le.fit_transform(df['smoker'])

In [33]:
#le.classes_

array(['female', 'male'], dtype=object)

In [34]:
#le.inverse_transform([0,1])

array(['female', 'male'], dtype=object)

In [37]:
# Preview the frame after encoding: sex, smoker and region now hold integers.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,age_cat
0,19,0,27.9,0,1,3,16884.924,Young adult
1,18,1,33.77,1,0,2,1725.5523,Young adult
2,28,1,33.0,3,0,2,4449.462,Young adult
3,33,1,22.705,0,0,1,21984.47061,Young adult
4,32,1,28.88,0,0,1,3866.8552,Young adult


In [39]:
# Target is the billed charge; the features are every remaining column except
# the derived age band.
y = df['charges']
X = df.drop(columns=['charges', 'age_cat'])

In [40]:
# Expand the features with all polynomial terms up to degree 2.
quad = PolynomialFeatures(degree=2)
X_quad = quad.fit(X).transform(X)

In [48]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_quad,
    y,
    test_size=0.33,
    random_state=42,
)

In [49]:
# Ordinary least-squares model; it is fitted on the degree-2 expanded features,
# so the overall fit is a polynomial regression.
plr = LinearRegression()

In [50]:
# Fit on the training split only; the test split stays unseen for evaluation.
plr.fit(X_train,y_train)

LinearRegression()

In [51]:
# Predicted charges for the held-out rows.
y_pred = plr.predict(X_test)

In [52]:
# R^2 of the predictions against the true charges on the held-out split.
r2_score(y_test, y_pred)

0.8548187870064636

In [54]:
# Wrap the expanded matrix in a DataFrame to see how many polynomial feature
# columns were generated (the labels are positional integers, not names).
pd.DataFrame(X_quad).columns

RangeIndex(start=0, stop=28, step=1)