In [2]:
import pandas as pd
import numpy as np

In [15]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the medical-insurance dataset.
# The raw.githubusercontent.com endpoint serves the CSV bytes themselves; the
# github.com/<user>/<repo>/blob/... address is the HTML file viewer and cannot
# be parsed as CSV.
med_df = pd.read_csv('https://raw.githubusercontent.com/S-Sanjai/ML-fundamentals/main/Projects/medical-insurance/data/medical.csv')

In [17]:
# Semi-transparent (alpha 0.6) pastel palette used by the Plotly charts below.
# The mixed-case names are kept as-is because later cells refer to them.
pastel_red    = 'rgba(255, 99, 132, 0.6)'
pastel_green  = 'rgba(144, 238, 144, 0.6)'
Pastel_yellow = 'rgba(255, 223, 102, 0.6)'
Pastel_blue   = 'rgba(173, 216, 230, 0.6)'
Pastel_purple = 'rgba(218, 112, 214, 0.6)'
Pastel_orange = 'rgba(255, 179, 102, 0.6)'
Pastel_gray   = 'rgba(200, 200, 200, 0.6)'

In [18]:
# Print column names, dtypes and non-null counts for the loaded frame.
med_df.info()

In [19]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
med_df.describe()

In [20]:
# Global styling for the static (matplotlib/seaborn) plots: seaborn grid style
# first, then matplotlib's dark theme, then explicit rcParams overrides.
sns.set_style('darkgrid')
plt.style.use('dark_background')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # RGBA hex with alpha 00
})

In [21]:
# Summary statistics of the age column.
med_df['age'].describe()

<b>To see the AGE distribution</b>

In [22]:
# Histogram of age with a box-plot marginal, drawn with the dark Plotly template.
# nbins=47 presumably targets one bin per distinct age -- TODO confirm against
# the min/max reported by med_df.age.describe().
graph = px.histogram(
    med_df,
    x='age',
    marginal='box',
    nbins=47,
    title='<b>Distribution of Age</b>',
).update_layout(bargap=0.1, template='plotly_dark')
graph.show()


<b>To see the BMI distribution</b>

In [23]:
# Histogram of BMI with a box-plot marginal, drawn in pastel red.
graph = px.histogram(
    med_df,
    x='bmi',
    marginal='box',
    color_discrete_sequence=[pastel_red],
    title='<b>Distribution of BMI</b>',
).update_layout(bargap=0.1, template='plotly_dark')
graph.show()


In [24]:
# Histogram of annual charges with a box-plot marginal, drawn in pastel green.
graph = px.histogram(
    med_df,
    x='charges',
    marginal='box',
    color_discrete_sequence=[pastel_green],
    title='<b>Annual Medical Charges</b>',
).update_layout(bargap=0.1, template='plotly_dark')
graph.show()

<b>To see the Annual Medical Charges distribution (Split:Smokers)</b>

In [25]:
# Histogram of annual charges, with one colour series per smoker value.
graph = px.histogram(
    med_df,
    x='charges',
    color='smoker',
    marginal='box',
    color_discrete_sequence=[pastel_red, pastel_green],
    title='<b>Annual Medical Charges</b>',
).update_layout(bargap=0.1, template='plotly_dark')
graph.show()

<b>EXERCISE</b>

In [26]:
# Histogram of annual charges, with one colour series per sex value.
graph = px.histogram(
    med_df,
    x='charges',
    color='sex',
    marginal='box',
    color_discrete_sequence=[pastel_red, Pastel_blue],
    title='<b>Annual Medical Charges (Split:Sex)</b>',
).update_layout(bargap=0.1, template='plotly_dark')
graph.show()

In [27]:
# Histogram of annual charges, with one colour series per region value.
graph = px.histogram(
    med_df,
    x='charges',
    color='region',
    marginal='box',
    color_discrete_sequence=[pastel_red, pastel_green, Pastel_yellow, Pastel_purple],
    title='<b>Annual Medical Charges (Split:Region)</b>',
).update_layout(bargap=0.1, template='plotly_dark')
graph.show()

SMOKERS

In [28]:
# Number of rows for each smoker value.
med_df['smoker'].value_counts()

In [29]:
# Row counts per smoker value, with bars coloured by sex.
# Left as the cell's final expression so the figure is rendered as its output.
px.histogram(
    med_df,
    x='smoker',
    color='sex',
    color_discrete_sequence=[pastel_red, Pastel_blue],
    template='plotly_dark',
    title='<b>SMOKERS</b>',
)
             

<b>Charges vs Other Variables</b>

In [30]:
# Charges against age, coloured by smoker value; sex is added to the hover data.
graph = px.scatter(
    med_df,
    x='age',
    y='charges',
    color='smoker',
    color_discrete_sequence=[pastel_red, pastel_green],
    opacity=0.8,
    hover_data=['sex'],
    title='<b>Age vs Charges</b>',
    template='plotly_dark',
).update_traces(marker_size=5)
graph.show()

In [31]:
# Charges against BMI, coloured by smoker value; sex is added to the hover data.
graph = px.scatter(
    med_df,
    x='bmi',
    y='charges',
    color='smoker',
    color_discrete_sequence=[pastel_red, pastel_green],
    opacity=0.8,
    hover_data=['sex'],
    title='<b>BMI vs Charges</b>',
    template='plotly_dark',
).update_traces(marker_size=5)
graph.show()

In [32]:
# Violin plot of charges per sex, with one colour series per smoker value.
graph = px.violin(
    med_df,
    x='sex',
    y='charges',
    color='smoker',
    color_discrete_sequence=[pastel_red, pastel_green],
    hover_data=['sex'],
    title='<b>Sex vs Charges</b>',
    template='plotly_dark',
).update_traces(marker_size=5)
graph.show()

In [33]:
# Violin plot of charges per smoker value (the same column drives x and colour).
graph = px.violin(
    med_df,
    x='smoker',
    y='charges',
    color='smoker',
    color_discrete_sequence=[pastel_red, pastel_green],
    hover_data=['sex'],
    title='<b>Smoker vs Charges</b>',
    template='plotly_dark',
).update_traces(marker_size=5)
graph.show()

In [34]:
# Row counts per region, with one colour series per smoker value.
# The title describes what is actually plotted (region on x, smoker as colour).
graph = px.histogram(med_df,
                     x='region',
                     color='smoker',
                     color_discrete_sequence=[pastel_red, pastel_green],
                     opacity=0.8,
                     hover_data=['sex'],
                     title='<b>Smokers by Region</b>',
                     template='plotly_dark')
graph.show()

<B>CORRELATION</B>

In [35]:
# Correlation coefficient between charges and age.
med_df['charges'].corr(med_df['age'])

In [36]:
# Correlation coefficient between charges and BMI.
med_df['charges'].corr(med_df['bmi'])

In [37]:
# Encode the smoker column as 0/1 so it can be correlated with charges.
smoker_val = {'no': 0, 'yes': 1}
smoker_num = med_df['smoker'].map(smoker_val)
med_df['charges'].corr(smoker_num)

In [38]:
# Store the 0/1 smoker encoding as a new column on med_df, then build the
# correlation matrix over the numeric columns only.
med_df['smoker_num'] = med_df.smoker.map({'no': 0, 'yes': 1})
med_df_numeric = med_df.select_dtypes(include='number')
med_df_numeric.corr()


In [39]:
# Annotated heatmap of the correlation matrix of the numeric columns.
sns.heatmap(
    med_df_numeric.corr(),
    annot=True,
    cmap='Reds',
)
plt.title('Correlation Matrix')

<b>Linear Regression using a Single Feature:</b>
The correlation analysis above shows that 'smoker' and 'age' have the strongest correlation with 'charges'. To build a single-feature linear regression, we first estimate 'charges' from 'age' for non-smokers only, and later repeat the same steps for smokers.

In [40]:
# Keep only the rows where smoker == 'no' and scatter their charges against age.
non_smoker_df = med_df.loc[med_df['smoker'] == 'no']
plt.style.use('dark_background')
plt.title('Age vs Charges (non-smokers)')
sns.scatterplot(x='age', y='charges', data=non_smoker_df)

In [41]:
def estimate_charges(age, w, b):
    """Evaluate the linear model ``w * age + b``.

    Args:
        age: Input value(s) the weight is multiplied with.
        w: Weight (slope) applied to ``age``.
        b: Bias (intercept) added to the product.

    Returns:
        The result of ``w * age + b``.
    """
    weighted_age = w * age
    return weighted_age + b

In [42]:
# Age column for the rows where smoker == 'no'.
ages = med_df.loc[med_df['smoker'] == 'no', 'age']
ages

In [43]:
# Assumed starting values for the weight (w) and base (b).
w, b = 50, 100
estimate_charges(ages, w, b)

In [44]:
# Actual charges of the non-smoker rows, for comparison with the estimates.
non_smoker_df['charges']

In [45]:
def rmse(t, p):
    """Root-mean-square error between targets and predictions.

    Args:
        t: Target values.
        p: Predicted values; must support ``t - p``.

    Returns:
        The square root of the mean of the squared differences ``t - p``.
    """
    residuals = t - p
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

In [46]:
def try_parameters(w, b):
    """Plot the line ``w * age + b`` over the non-smokers' actual charges.

    Reads ``non_smoker_df`` from the notebook namespace and draws onto the
    current matplotlib figure.

    Args:
        w: Weight passed to ``estimate_charges``.
        b: Bias passed to ``estimate_charges``.
    """
    x_age = non_smoker_df['age']
    y_actual = non_smoker_df['charges']
    y_line = estimate_charges(x_age, w, b)

    # The line is drawn before the points, so the legend labels below are
    # listed in that same order.
    plt.plot(x_age, y_line, 'r', alpha=0.9)
    plt.scatter(x_age, y_actual, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

In [47]:
# Try a hand-picked weight and bias.
try_parameters(w=290, b=-3300)

In [48]:
# Actual charges and the model's estimates for the non-smoker rows.
w, b = 290, -3300
targets = non_smoker_df.charges
predicted = estimate_charges(non_smoker_df['age'], w, b)

In [49]:
# RMSE of the hand-picked line against the actual charges.
rmse(t=targets, p=predicted)

In [50]:
def try_parameters(w, b):
    """Plot the line ``w * age + b`` over the non-smokers' charges and print its RMSE.

    Reads ``non_smoker_df`` from the notebook namespace, draws onto the current
    matplotlib figure, and prints the RMSE between actual and estimated charges.

    Args:
        w: Weight passed to ``estimate_charges``.
        b: Bias passed to ``estimate_charges``.
    """
    x_age = non_smoker_df['age']
    y_actual = non_smoker_df['charges']
    y_line = estimate_charges(x_age, w, b)

    # The line is drawn before the points, so the legend labels below are
    # listed in that same order.
    plt.plot(x_age, y_line, 'r', alpha=0.9)
    plt.scatter(x_age, y_actual, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

    print("RMSE Loss: ", rmse(y_actual, y_line))
    