In [None]:
# Raw CSV of the medical-charges dataset, hosted in the JovianML/opendatasets GitHub repository
medical_charge_url = 'https://raw.githubusercontent.com/JovianML/opendatasets/master/data/medical-charges.csv'

In [None]:
from urllib.request import urlretrieve
import sys
sys.executable

In [None]:
# Fetch the CSV at medical_charge_url and save it as 'medical-charge.csv'
# in the current working directory.
urlretrieve(url=medical_charge_url, filename='medical-charge.csv')

In [None]:
import numpy as np, pandas as pd, seaborn as sns

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px

#Make sure charts display inside notebook
%matplotlib inline

In [None]:
#OneHotEncoder for categorical data
from sklearn.preprocessing import OneHotEncoder

#Using a Linear Regression model for a continuous target
from sklearn.linear_model import LinearRegression
# One shared estimator instance; later cells call fit() on it twice
# (first with a single feature, then with several features).
model = LinearRegression()

#Example (Single/Multiple Feature)
# features = dataFrame[[feature 1, feature 2, ..., feature n]]
# target = dataFrame[target]
# model = LinearRegression().fit(features, target)
# predictions = model.predict(features)

# loss = rmse(target, predictions)
# display loss

# from sklearn.model_selection import train_test_split as tts
# X_train, X_test, Y_train, Y_test = tts(arrays, test size, train size)

In [None]:
# Load the downloaded CSV into a DataFrame
medical_charges = pd.read_csv(filepath_or_buffer='medical-charge.csv')

In [None]:
# Summary statistics of the dataset
medical_charges.describe()

In [None]:
# Summary statistics of the age column only
medical_charges['age'].describe()

In [None]:
# Preview the first rows of the dataset
medical_charges.head()

In [None]:
# Display the whole DataFrame (the notebook renders it with its default truncation)
medical_charges

In [None]:
# Column names, non-null counts and dtypes
medical_charges.info()

In [None]:
# Correlation between charges and age over the full dataset
medical_charges.charges.corr(medical_charges.age)

In [None]:
# Distribution of BMI, with a box-plot marginal above the histogram
fig = px.histogram(medical_charges, x='bmi', marginal='box', title="BMI")
fig.update_layout(bargap=0.1).show()

In [None]:
# Distribution of the number of children, split by smoker status,
# with a box-plot marginal above the histogram.
fig = px.histogram(medical_charges,
                   x = "children",
                   marginal='box',
                   color = "smoker",
                   title = "Children")
fig.update_layout(bargap = 0.1)
fig.show()

In [None]:
# Distribution of charges, coloured by smoker status, with a box-plot marginal
fig = px.histogram(
    medical_charges,
    x="charges",
    color="smoker",
    marginal='box',
    title="Charges",
)
fig.update_layout(bargap=0.1).show()

In [None]:
# Counts per smoker category, coloured by sex.
# The x axis is the smoker column, so the title names that column rather than charges.
fig = px.histogram(medical_charges,
                   x = "smoker",
                   marginal='box',
                   color = "sex",
                   title = "Smokers by sex")
fig.update_layout(bargap = 0.1)
fig.show()

In [None]:
# Distribution of age using 47 bins, with a box-plot marginal
fig = px.histogram(
    medical_charges,
    x='age',
    nbins=47,
    marginal='box',
    title="Age_hist",
)
fig.update_layout(bargap=0.1).show()

In [None]:
# Charges against age, coloured by smoker status; sex is shown on hover.
# hover_data is passed as a list: older Plotly releases iterate a bare string
# character by character and fail to find the columns 's', 'e', 'x'.
fig = px.scatter(medical_charges,
                 x = 'age',
                 y = 'charges',
                 hover_data = ['sex'],
                 opacity=0.8,
                 color = 'smoker',
                 title= 'Age against charges')
fig.show()

In [None]:
# Charges against BMI, coloured by smoker status; sex is shown on hover.
# hover_data is passed as a list so it also works on Plotly releases that
# do not accept a bare string.
fig = px.scatter(medical_charges,
                 x = 'bmi',
                 y = 'charges',
                 hover_data = ['sex'],
                 opacity=0.8,
                 color = 'smoker',
                 title= 'BMI against charges')
fig.show()

In [None]:
# Encode the two binary categorical columns as 0/1.
# Values that are not keys of the lookup (including already-encoded 0/1)
# pass through unchanged, so re-running this cell leaves the columns intact
# instead of turning them into NaN as a plain dict .map() would.
smoker_codes = {'yes': 1, 'no': 0}
sex_codes = {'male': 1, 'female': 0}
medical_charges['smoker'] = medical_charges['smoker'].map(lambda value: smoker_codes.get(value, value))
medical_charges['sex'] = medical_charges['sex'].map(lambda value: sex_codes.get(value, value))

In [None]:
#Encoding the categorical region column as region_northwest, region_southeast, etc.
# ohe = OneHotEncoder(sparse_output = False)
# region_enc = ohe.fit_transform(medical_charges[['region']])

# region_df = pd.DataFrame(region_enc, columns = ohe.get_feature_names_out(None))
# region_df
# medical_charges.region = medical_charges.region.map({
#                 "northeast":0, 
#                 "northwest":1,
#                 "southeast":2,
#                 "southwest":3})

In [None]:
#Shortcuts to the columns of the full dataset (smokers and non-smokers together)
total_target = medical_charges['charges']
total_age = medical_charges['age']
total_bmi = medical_charges['bmi']
total_children = medical_charges['children']
total_smoker = medical_charges['smoker']
total_sex = medical_charges['sex']
total_region = medical_charges['region']

In [None]:
# Keep only the rows whose smoker value equals 0, then expose the
# columns of that subset under short names used by the later cells.
non_smoker_df = medical_charges.loc[medical_charges['smoker'].eq(0)]
target = non_smoker_df['charges']
age = non_smoker_df['age']
bmi = non_smoker_df['bmi']
children = non_smoker_df['children']
sex = non_smoker_df['sex']
region = non_smoker_df['region']

In [None]:
# X_train, X_test, Y_train, Y_test = 

In [None]:
# Correlation between charges and age within the non-smoker subset.
# (The full matrix would be non_smoker_df.select_dtypes(include="number").corr().)
target.corr(other=age)

In [None]:
#Correlation heatmap for the non-smoker subset.
#smoker (always 0 in this subset) and region are dropped before computing correlations.
sns.heatmap(
    data=non_smoker_df.drop(["smoker", "region"], axis=1).corr(),
    annot=True,
    cmap='Reds',
)
plt.title("Heatmap")

In [None]:
# Age against charges for non-smokers, drawn with small semi-transparent markers
sns.scatterplot(
    data=non_smoker_df,
    x='age',
    y='charges',
    alpha=0.7,
    s=15,
)
plt.show()


In [None]:
def predict_charges(age, weight, bias):
    """Return the linear estimate ``weight * age + bias``.

    Args:
        age: Value(s) to predict for; a scalar or anything supporting
            element-wise arithmetic.
        weight: Slope applied to ``age``.
        bias: Constant added after scaling.

    Returns:
        The result of ``weight * age + bias``, with the same shape as ``age``.
    """
    scaled = weight * age
    return scaled + bias
target

In [None]:
# First hand-picked weight and bias for the age -> charges line
w, b = 250, 300
estimate = predict_charges(age=age, weight=w, bias=b)
estimate

In [None]:
# Red line: the hand-made estimate; points: the actual charges
plt.plot(age, estimate, 'r')
plt.scatter(x=age, y=target, alpha=0.8)

In [None]:
def try_prediction(weight, bias):
    """Draw the line ``weight * age + bias`` in red over the age/charges scatter.

    Reads the notebook-level ``age`` and ``target`` series.
    """
    fitted_line = predict_charges(age, weight, bias)
    plt.plot(age, fitted_line, 'r')
    plt.scatter(x=age, y=target, alpha=0.8)

In [None]:
# Plot the line with weight 250 and bias 300 over the non-smoker data
try_prediction(250,300)

In [None]:
#People judge a fit by looking at the trend, but the machine cannot see the plot,
#so the quality of a model is expressed as a numeric error it can minimise.
#Linear regression is trained on MSE (mean squared error); RMSE (root mean
#squared error) is used for testing and reporting.
def rmse(target, prediction):
    """Return the root-mean-square error between ``target`` and ``prediction``.

    Both arguments must support element-wise subtraction (NumPy arrays,
    pandas Series, or scalars).
    """
    squared_error = np.square(target - prediction)
    mean_squared_error = np.mean(squared_error)
    return np.sqrt(mean_squared_error)

In [None]:
# Second hand-picked guess for weight and bias
w, b = 50, 100

In [None]:
# Plot the line for the current w and b over the non-smoker data
try_prediction(w,b)

In [None]:
# RMSE of the hand-picked line against the actual non-smoker charges
prediction = predict_charges(age, weight=w, bias=b)
rmse(target=target, prediction=prediction)

In [None]:
def try_prediction(weight, bias):
    """Plot the line ``weight * age + bias`` over the data and print its RMSE.

    Reads the notebook-level ``age`` and ``target`` series and replaces the
    earlier ``try_prediction`` that only plotted.
    """
    predicted = weight * age + bias
    plt.plot(age, predicted, 'r')
    plt.scatter(x=age, y=target, alpha=0.8)
    print("RMSE is: ", rmse(target, predicted))

In [None]:
# Trial: weight 50, bias 100
try_prediction(50,100)

In [None]:
# Trial: steeper slope (weight 200), same bias 100
try_prediction(200,100)

In [None]:
# Trial: weight 250, bias 500
try_prediction(250,500)

In [None]:
# Trial: weight 250 with a negative bias of -2000
try_prediction(250,-2000)

In [None]:
# Trial: weight 260, bias -2000
try_prediction(260,-2000)

In [None]:
# Trial: weight 270, bias -2000
try_prediction(270,-2000)

In [None]:
# Trial: weight 269, bias -2000
try_prediction(269,-2000)

In [None]:
# Trial: weight 267, bias -2000
try_prediction(267,-2000)

In [None]:
# Trial: weight 265, bias -2000
try_prediction(265,-2000)

In [None]:
#Single Feature Linear Regression Machine Learning Model

In [None]:
#Trial and error has brought us close to the lowest error we can reach by hand:
#around 4662$ of loss for now.
#Now let's use the linear regression model from the scikit-learn library to train a model.
help(model.fit)
#As the help text shows, X is the 2-D feature matrix (one row per sample, one column per feature),
#while y holds the real target values (a simple 1-D array here).

In [None]:
input = non_smoker_df[['age']]
type(input)

In [None]:
model.fit(input, target)

In [None]:
#Now we have successfully fitted data into the regression model
#It's time to use model for prediction
model.predict(np.array([[23],
                       [37],
                       [61]]
                    ))

In [None]:
#Let's check the rmse of the trained model now
prediction = model.predict(input)
rmse(target, prediction)

In [None]:
#Weight(s) and intercept learned by the fitted model
model_w, model_b = model.coef_, model.intercept_

In [None]:
# Plot the line defined by the learned parameters and print its RMSE
try_prediction(model_w, model_b)

In [None]:
# Display the learned weight(s) and intercept
model_w, model_b

In [None]:
# Display the model's predictions next to the actual charges
prediction, target

In [None]:
#Here we have successfully trained our first linear regression model for a single feature

In [None]:
#Multiple Feature Linear Regression Machine Learning Model

In [None]:
# Feature matrix with four columns (in this order) and the charges target,
# both taken from the non-smoker subset.
inputs = non_smoker_df.loc[:, ['age', 'bmi', 'children', 'sex']]
target = non_smoker_df.charges

In [None]:
# Refit the shared model on the four features, then predict on the same rows
prediction = model.fit(inputs, target).predict(inputs)
prediction

In [None]:
# Display the non-smoker subset
non_smoker_df

In [None]:
# RMSE of the multi-feature model. The arguments are passed as (prediction, target),
# the reverse of rmse's (target, prediction) signature; the difference is squared,
# so the result is the same.
rmse(prediction, target)

In [None]:
def try_prediction(w1,w2,w3,w4, bias):
    """Plot multi-feature predictions against age and print their RMSE.

    The weights are applied in the column order used to fit the model
    (``inputs`` = age, bmi, children, sex), so they can be unpacked straight
    from ``model.coef_``.

    Args:
        w1: Weight for ``age``.
        w2: Weight for ``bmi``.
        w3: Weight for ``children``.
        w4: Weight for ``sex``.
        bias: Intercept added to the weighted sum.

    Reads the notebook-level ``age``, ``bmi``, ``children``, ``sex`` and
    ``target`` series.
    """
    prediction = w1*age + w2*bmi + w3*children + w4*sex + bias
    plt.scatter(age, target, alpha = 0.8)
    # Predictions depend on four features, so they are not a single line in age;
    # draw them as red points instead of joining unsorted values with a line.
    plt.plot(age, prediction, 'r.')
    loss = rmse(target, prediction)
    print("RMSE is: ", loss)
# Display the weights and intercept of the multi-feature fit
model.coef_, model.intercept_

In [None]:
# Unpack the four learned weights; the model was fitted on the columns
# age, bmi, children, sex, in that order.
w1, w2, w3, w4 = model.coef_
# Plot the resulting predictions and print their RMSE
try_prediction(w1,w2,w3,w4, model.intercept_)