In [None]:
import pandas as pd
import numpy as np

In [None]:
import plotly.express as px
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Raw CSV of the medical-insurance dataset, read directly from GitHub.
MEDICAL_CSV_URL = 'https://github.com/S-Sanjai/ML-fundamentals/blob/main/Projects/medical-insurance/data/medical.csv?raw=true'
med_df = pd.read_csv(MEDICAL_CSV_URL)

In [None]:
# Encode the `smoker` column as an integer flag: 1 for 'yes', 0 for anything else.
smoker_num = med_df['smoker'].eq('yes').astype('int64')
med_df['smoker_num'] = smoker_num

In [None]:
def rmse(target_val, predicted_val):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        target_val: Observed values (array-like supporting subtraction).
        predicted_val: Predicted values, broadcastable against ``target_val``.

    Returns:
        The square root of the mean of the squared residuals.
    """
    residuals = target_val - predicted_val
    mean_squared_error = np.mean(np.square(residuals))
    return np.sqrt(mean_squared_error)

In [None]:
# Unfitted linear-regression estimator; later cells fit this same object.
model = LinearRegression()

In [None]:
# Keep only non-smokers; `age` is the single input feature, `charges` the target.
non_smoker_df = med_df.loc[med_df['smoker'] == 'no']
inputs = non_smoker_df.loc[:, ['age']]
targets = non_smoker_df['charges']
print('inputs.shape :', inputs.shape)
print('targets.shape :', targets.shape)

In [None]:
# Fit the shared `model` on the current `inputs` / `targets`.
model.fit(inputs, targets)


In [None]:
# Predict charges for three sample ages. The model was fitted on a DataFrame
# with an `age` column, so the query uses the same column name; a bare ndarray
# makes scikit-learn warn that X does not have valid feature names.
sample_ages = pd.DataFrame({'age': [23, 40, 70]})
model.predict(sample_ages)

In [None]:
# In-sample predictions for the rows the model was just fitted on.
predictions = model.predict(inputs)

In [None]:
# Training RMSE of the fitted model, followed by its coefficient(s) and intercept.
print(rmse(targets, predictions))
print(model.coef_,model.intercept_)

In [None]:
# Global plot styling: seaborn grid, dark matplotlib theme, then font size,
# default figure size and a fully transparent figure background.
sns.set_style('darkgrid')
plt.style.use('dark_background')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

In [None]:
def try_parameters(w, b):
    """Plot the line ``charges = w * age + b`` over the non-smoker data and print its RMSE.

    Args:
        w: Slope of the line; a scalar or a one-element array such as ``model.coef_``.
        b: Intercept of the line; a scalar such as ``model.intercept_``.
    """
    ages = non_smoker_df.age
    target = non_smoker_df.charges

    # Build the estimate from the arguments so the plot and the loss reflect
    # `w` and `b` rather than whatever the notebook-global `predictions` holds.
    slope = np.asarray(w, dtype=float).reshape(-1)[0]
    intercept = np.asarray(b, dtype=float).reshape(-1)[0]
    estimate = slope * ages + intercept

    plt.plot(ages, estimate, 'r', alpha=0.9)
    plt.scatter(ages, target, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

    loss = rmse(target, estimate)
    print("RMSE Loss: ", loss)

try_parameters(model.coef_,model.intercept_)

In [None]:
# Keep only smokers; `age` is the single input feature, `charges` the target.
smokers_df = med_df.loc[med_df['smoker'] == 'yes']
inputs_smokers = smokers_df.loc[:, ['age']]
targets_smokers = smokers_df['charges']

In [None]:
# Re-fits the shared `model` object on the smoker rows; the coefficients from
# the earlier non-smoker fit are overwritten.
model.fit(inputs_smokers,targets_smokers)

In [None]:
# In-sample predictions for smokers; rebinds the notebook-global `predictions`.
predictions = model.predict(inputs_smokers)

In [None]:
# Training RMSE of the age-only model on smokers (shown as the cell output).
rmse(targets_smokers, predictions)

In [None]:
# Coefficient(s) and intercept of the most recent fit of `model`.
model.coef_, model.intercept_

In [None]:
def try_parameters(w, b):
    """Plot the line ``charges = w * age + b`` over the smoker data and print its RMSE.

    Args:
        w: Slope of the line; a scalar or a one-element array such as ``model.coef_``.
        b: Intercept of the line; a scalar such as ``model.intercept_``.
    """
    ages = smokers_df.age
    target = smokers_df.charges

    # Build the estimate from the arguments so the plot and the loss reflect
    # `w` and `b` rather than whatever the notebook-global `predictions` holds.
    slope = np.asarray(w, dtype=float).reshape(-1)[0]
    intercept = np.asarray(b, dtype=float).reshape(-1)[0]
    estimate = slope * ages + intercept

    plt.plot(ages, estimate, 'r', alpha=0.9)
    plt.scatter(ages, target, s=8, alpha=0.8)
    plt.xlabel('Age')
    plt.ylabel('Charges')
    plt.legend(['Estimate', 'Actual'])

    loss = rmse(target, estimate)
    print("RMSE Loss: ", loss)

try_parameters(model.coef_,model.intercept_)

<b> Linear Regression Using Multiple Variables: </b> <br>
Now we are going to add another feature to our model, on top of the existing one, to see whether the model performs better or worse. In this case we add 'BMI' as a feature. <br>
<b>New relationship with charges:</b> <br>
<i>charges = w1 X age + w2 X bmi + b</i>

In [None]:
# Two-feature model for non-smokers: charges ~ age + bmi.
inputs = non_smoker_df.loc[:, ['age', 'bmi']]
targets = non_smoker_df['charges']

# `fit` returns the estimator itself, so fitting and predicting can be chained.
predictions = model.fit(inputs, targets).predict(inputs)

loss = rmse(targets, predictions)
print("RMSE Loss: ", loss)

In [None]:
# Correlation of charges with age, and of charges with bmi, among non-smokers.
non_smoker_df.charges.corr(non_smoker_df.age), non_smoker_df.charges.corr(non_smoker_df.bmi)

In [None]:
# Coefficients and intercept of the most recent fit of `model`.
model.coef_, model.intercept_

So, as the change in RMSE (~$0.20) shows, BMI is not a strong variable for predicting the charges, especially for non-smokers. 'bmi' also has a very weak correlation with charges <b>(0.0840)</b> compared to 'age' <b>(0.6279)</b>.<br> The weight of 'bmi' (<i>W2</i>) is also much smaller than <i>W1</i>.

In [None]:
# Single-feature model for non-smokers using bmi only.
model = LinearRegression()
model.fit(non_smoker_df[['bmi']], non_smoker_df['charges'])
predictions = model.predict(non_smoker_df[['bmi']])
loss = rmse(non_smoker_df['charges'], predictions)
print("RMSE Loss: ", loss)

so here as we can see the RMSE loss (5969.7725) is higher than the previous one(4662.5057), which means that the model is not as good as the previous one.

In [None]:
# Interactive 3D scatter of charges against age and bmi for non-smokers,
# drawn with small, semi-transparent markers.
fig = px.scatter_3d(non_smoker_df, x='age', y='bmi', z='charges')
fig.update_traces(marker=dict(size=3, opacity=0.5))
fig.show()

Now we will add another feature, `children`, to our model:

$charges = w_1 \times age + w_2 \times bmi + w_3 \times children + b$

In [None]:
# Correlation between charges and number of children among non-smokers.
non_smoker_df.charges.corr(non_smoker_df.children)

In [None]:
# Strip plot of charges per number of children. The frame contains only
# non-smokers, so `color='smoker'` yields a single colour group.
graph = px.strip(non_smoker_df, x='children', y='charges', color='smoker')
graph.show()

As we can see here, there is a somewhat linear relationship between the number of children and the charges, but it is not very strong.

In [None]:
# Three-feature model for non-smokers: charges ~ age + bmi + children.
inputs = non_smoker_df[['age', 'bmi', 'children']]
targets = non_smoker_df['charges']

model = LinearRegression()
model.fit(inputs, targets)

predictions = model.predict(inputs)

loss = rmse(targets, predictions)
print("RMSE Loss: ", loss)

As we can see, the correlation between the number of children and the charges is not very strong, but it is still a little higher than the correlation between BMI and the charges.

<b>We will try the model on the smokers dataset as well</b>

In [None]:
# Same three-feature model, fitted on the smoker rows with its own estimator.
inputs_smokers = smokers_df[['age', 'bmi', 'children']]
targets_smokers = smokers_df['charges']

model_smokers = LinearRegression()
model_smokers.fit(inputs_smokers, targets_smokers)
predictions_smokers = model_smokers.predict(inputs_smokers)

loss_smokers = rmse(targets_smokers, predictions_smokers)
print("RMSE Loss Smokers: ", loss_smokers)

So, as we can see, the model for smokers is worse than the one for non-smokers.
But when we look at the correlation with the variables, we can say that BMI and age are the reasons why there are two clusters in the graph.

In [None]:
# Correlation of charges with children, age and bmi (in that order) among smokers.
smokers_df.charges.corr(smokers_df.children), smokers_df.charges.corr(smokers_df.age), smokers_df.charges.corr(smokers_df.bmi)

In [None]:
# Interactive 3D scatter of charges against age and bmi for smokers,
# drawn with small, semi-transparent markers.
graph = px.scatter_3d(smokers_df, x='age', y='bmi', z='charges')
graph.update_traces(marker=dict(size=3, opacity=0.5))
graph.show()

Even though the model has a lower RMSE for non-smokers, the model for smokers has a higher RMSE. This is because the charges for smokers are generally higher and more variable than those for non-smokers, leading to a larger error in predictions.

Now let's use a single model for the entire dataset, including both smokers and non-smokers.

In [None]:
# One model for the whole dataset (smokers and non-smokers together),
# still using only the numeric features.
inputs = med_df[['age', 'bmi', 'children']]
targets = med_df['charges']

model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)

loss = rmse(targets, predictions)
print("RMSE Loss All: ", loss)

So we can see that the model is not performing well on the smokers data, and the overall model is also not performing well. This indicates that there are likely other factors influencing the charges that are not captured by the current features.

<b>Using Categorical Features for Machine Learning</b><br>
So far we've been using only numeric columns, since we can only perform computations with numbers. If we could use categorical columns like "smoker", we can train a single model for the entire dataset.
<br>
To use the categorical columns, we simply need to convert them to numbers. There are three common techniques for doing this:
<br><b>1. </b>
<i>If a categorical column has just two categories (it's called a binary category), then we can replace their values with 0 and 1.
<br><b>2. </b>
If a categorical column has more than 2 categories, we can perform one-hot encoding i.e. create a new column for each category with 1s and 0s.
<br><b>3. </b>
If the categories have a natural order (e.g. cold, neutral, warm, hot), then they can be converted to numbers (e.g. 1, 2, 3, 4) preserving the order. These are called ordinals</i>

<b>Binary categories:</b> <br>
<br>
The "smoker" column is binary, so we can use it as a feature in our model.<br>
We can convert it to a numerical feature by mapping 'yes' to 1 and 'no' to 0.

In [None]:
# Inspect the 0/1 smoker flag derived earlier from med_df['smoker'].
smoker_num

In [None]:
# Inspect the frame, which now carries the added `smoker_num` column.
med_df

In [None]:
# Correlation between charges and the numeric smoker flag over the whole dataset.
med_df.charges.corr(med_df.smoker_num)

We can now use the `smoker_num` column for linear regression.

$charges = w_1 \times age + w_2 \times bmi + w_3 \times children + w_4 \times smoker + b$

In [None]:
# Whole-dataset model with the binary smoker flag added to the numeric features.
feature_frame = med_df.loc[:, ['age', 'bmi', 'children', 'smoker_num']]
inputs = feature_frame
targets = med_df['charges']

model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)

loss = rmse(targets, predictions)
print("RMSE Loss All with Smoker: ", loss)

The loss reduces from `11355` to `6056`, almost by 50%! This is an important lesson: never ignore categorical data.


Let's try adding the "sex" column as well.

$charges = w_1 \times age + w_2 \times bmi + w_3 \times children + w_4 \times smoker + w_5 \times sex + b$

In [None]:
# Encode the `sex` column as 0 (female) / 1 (male), then show its correlation
# with charges as the cell output.
sex_code = dict(female=0, male=1)
med_df['sex_code'] = med_df['sex'].map(sex_code)
med_df['charges'].corr(med_df['sex_code'])

so there is no strong correlation between '`sex`' and '`charges`'

In [None]:
# Whole-dataset model with both binary flags (smoker and sex) added.
inputs = med_df.loc[:, ['age', 'bmi', 'children', 'smoker_num', 'sex_code']]
targets = med_df['charges']

model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)

loss = rmse(targets, predictions)
print("RMSE Loss", loss)

As we expected, the reduction in `RMSE` is not significant, but the model is now more complex and can be used to predict charges for both `smokers` and `non-smokers`.


### One-hot Encoding

The "region" column contains 4 values, so we'll need to use one-hot encoding and create a new column for each region.

![](https://i.imgur.com/n8GuiOO.png)

In [None]:
from sklearn import preprocessing
# Fit a one-hot encoder on the `region` column (`fit` returns the encoder),
# then show the categories it learned.
enc = preprocessing.OneHotEncoder().fit(med_df[['region']])
enc.categories_

In [None]:
# Encode `region` and convert the transform result to a dense array via `.toarray()`.
onehot = enc.transform(med_df[['region']]).toarray()
onehot

In [None]:
# Name each one-hot column after the category it actually encodes.
# The columns of `onehot` follow the order of `enc.categories_[0]`, so the
# names are derived from it; a hand-written list in any other order would
# attach region labels to the wrong columns.
region_cols = [f'region_{name}' for name in enc.categories_[0]]
med_df[region_cols] = onehot
med_df

Let's include the region columns into our linear regression model.

$charges = w_1 \times age + w_2 \times bmi + w_3 \times children + w_4 \times smoker + w_5 \times sex + w_6 \times region + b$

In [None]:
# Whole-dataset model using numeric, binary and one-hot region features.
input_cols = [
    'age', 'bmi', 'children',
    'smoker_num', 'sex_code',
    'region_northeast', 'region_southeast', 'region_southwest', 'region_northwest',
]
inputs = med_df[input_cols]
targets = med_df['charges']

model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)

loss = rmse(targets, predictions)
print("RMSE Loss with OneHot Encoding: ", loss)

Once again, the RMSE loss is not significantly improved by adding the region feature

In [None]:
# Two sample customers, with values listed in `input_cols` order. Wrapping the
# rows in a DataFrame labelled with `input_cols` keeps every value tied to its
# column and matches the named columns the model was fitted on, so scikit-learn
# does not warn about missing feature names.
sample_customers = pd.DataFrame(
    [
        [23, 22, 0, 0, 0, 0, 1, 0, 0],
        [40, 30, 2, 1, 1, 0, 0, 0, 1],
    ],
    columns=input_cols,
)
print(model.predict(sample_customers.iloc[[0]]))
print(model.predict(sample_customers.iloc[[1]]))

## Model Improvements

Let's discuss and apply some more improvements to our model.

### Feature Scaling

Recall that due to regulatory requirements, we also need to explain the rationale behind the predictions of our model. 

$charges = w_1 \times age + w_2 \times bmi + w_3 \times children + w_4 \times smoker + w_5 \times sex + w_6 \times region + b$

To compare the importance of each feature in the model, our first instinct might be to compare their weights. 

In [None]:
# Coefficients and intercept of the most recent fit of `model`.
model.coef_, model.intercept_

#### To visualize the coefficients better, we will put them in a Pandas DataFrame

In [None]:
# Tabulate each feature's coefficient, with the intercept as the last row.
weights = pd.DataFrame({
    'feature': np.append(input_cols, 'Intercept'),
    'weight': np.append(model.coef_, model.intercept_),
})
weights

While it seems like BMI and the "northeast" have a higher weight than age, keep in mind that the range of values for BMI is limited (15 to 40) and the "northeast" column only takes the values 0 and 1.

Because different columns have different ranges, we run into two issues:

1. We can't compare the weights of different columns to identify which features are important
2. A column with a larger range of inputs may disproportionately affect the loss and dominate the optimization process.

For this reason, it's common practice to scale (or standardize) the values in numeric columns by subtracting the mean and dividing by the standard deviation.

![](https://i.imgur.com/dT5fLFI.png)

We can apply scaling using the StandardScaler class from `scikit-learn`.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit a standard scaler on the numeric columns (`fit` returns the scaler),
# then display it as the cell output.
num_cols = ['age', 'bmi', 'children']
scaler = StandardScaler().fit(med_df[num_cols])
scaler

In [None]:
# Per-column mean and variance the scaler learned for `num_cols`.
scaler.mean_, scaler.var_

In [None]:
# Apply the fitted scaler to the numeric columns.
scaled_inputs = scaler.transform(med_df[num_cols])
scaled_inputs

In [None]:
# Binary and one-hot columns, left unscaled.
# NOTE: the region order here differs from `input_cols`; anything that labels
# the scaled model's coefficients must use `num_cols + cat_cols`.
cat_cols = [
    'smoker_num', 'sex_code',
    'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest',
]
categorical_data = med_df[cat_cols].to_numpy()
categorical_data

In [None]:
# Feature matrix = [scaled numeric columns | categorical columns], side by side.
inputs = np.hstack((scaled_inputs, categorical_data))
targets = med_df['charges']

model = LinearRegression()
model.fit(inputs, targets)
predictions = model.predict(inputs)

loss = rmse(targets, predictions)
print("RMSE Loss with StandardScaler: ", loss)

We can now compare the weights in the formula:

$charges = w_1 \times age + w_2 \times bmi + w_3 \times children + w_4 \times smoker + w_5 \times sex + w_6 \times region + b$

In [None]:
# The model was trained on [scaled num_cols | cat_cols], so the coefficients
# are labelled in exactly that order. `input_cols` lists the region columns in
# a different order and would attach the region weights to the wrong names.
scaled_feature_names = num_cols + cat_cols
weights = (
    pd.DataFrame(model.coef_, index=scaled_feature_names, columns=['weight'])
    .sort_values(by='weight', ascending=False)
)
weights

As you can see now, the most important features are:

1. `Smoker`
2. `Age`
3. `BMI`

In [None]:
# The current model was trained on standardized numeric features followed by
# `cat_cols`, so new rows must go through the same scaler and column order.
# Feeding raw ages/BMIs would be read as values many standard deviations out.
# These are the same two customers as in the unscaled prediction cell
# (region_southeast and region_northwest respectively, by column label).
sample_customers = pd.DataFrame(
    [
        dict(age=23, bmi=22, children=0, smoker_num=0, sex_code=0, region_southeast=1),
        dict(age=40, bmi=30, children=2, smoker_num=1, sex_code=1, region_northwest=1),
    ],
    columns=num_cols + cat_cols,
).fillna(0)  # region columns not named for a customer are 0

sample_inputs = np.concatenate(
    (scaler.transform(sample_customers[num_cols]), sample_customers[cat_cols].to_numpy()),
    axis=1,
)
print(model.predict(sample_inputs[[0]]))
print(model.predict(sample_inputs[[1]]))

##### This says that standardization did not improve the model performance, but it optimizes the dataset to understand the relative importance of each feature better.

## Creating a Test Set

##### Models like the one we've created in this tutorial are designed to be used in the real world. It's common practice to set aside a small fraction of the data (e.g. 10%) just for testing and reporting the results of the model.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# NOTE(review): `inputs` was built from a scaler fitted on the full dataset, so
# the test rows influenced the scaling statistics — consider fitting the scaler
# on the training split only.
train_inputs, test_inputs, train_targets, test_targets = train_test_split(inputs, targets, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# NOTE(review): prints the full docstring of train_test_split; exploratory
# leftover — consider removing it from the final notebook.
help(train_test_split)

In [None]:
# Fit on the training split only, then score on the held-out test split.
model = LinearRegression()
model.fit(train_inputs, train_targets)
predictions_test = model.predict(test_inputs)

loss = rmse(test_targets, predictions_test)
print("RMSE Loss on Test Set: ", loss)

In [None]:
# Score the same model on the rows it was trained on, for comparison with the test loss.
predictions_train = model.predict(train_inputs)
train_loss = rmse(train_targets, predictions_train)
print("RMSE Loss on Train Set: ", train_loss)