# ML Linear Regression

In [26]:
import pandas as pd

# Directory holding the scaled train/test CSVs. Kept in a single constant so
# the location only has to be changed in one place when the project moves.
MODELS_DIR = "/workspaces/linear-regression-project-tutorial-pilarzarco/models"

# Load the scaled train and test splits.
train_data = pd.read_csv(f"{MODELS_DIR}/model_scal_train.csv")
test_data = pd.read_csv(f"{MODELS_DIR}/model_scal_test.csv")

# Last expression of the cell: the notebook renders the first rows of the train split.
train_data.head()

Unnamed: 0,age,bmi,children,smoker,charges
0,0.108696,0.230024,0.0,1.0,0.020339
1,0.065217,0.26325,0.4,1.0,0.034446
2,0.73913,0.580172,0.4,1.0,0.516374
3,0.978261,0.686306,0.0,1.0,0.19607
4,0.630435,0.286252,0.4,1.0,0.137177


In [27]:
# Report (rows, columns) for the train split and then the test split, one per line.
print(train_data.shape, test_data.shape, sep="\n")

(1069, 5)
(268, 5)


In [28]:
# Training split: every column except "charges" is a feature; "charges" is the target.
X_train = train_data.drop(columns=["charges"])
y_train = train_data["charges"]

# Test split: same separation of features and target.
X_test = test_data.drop(columns=["charges"])
y_test = test_data["charges"]

In [29]:
# Build an ordinary least-squares regressor and fit it on the training split
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
# Echo the fitted estimator so the cell still displays it.
model

In [30]:
# Show the parameters the model learned from the training data.
print(f"Intercept (a): {model.intercept_}")
# coef_ holds one weight per feature column, so the label is derived from its
# length instead of being hard-coded to two coefficients.
print(f"Coefficients (b1..b{len(model.coef_)}): {model.coef_}")

Intercep (a): 0.3195827183308946
Coefficients (b1, b2): [ 0.1829699   0.18111814  0.04293572 -0.36780589]


In [31]:
# Run the fitted model over the held-out test features.
y_pred = model.predict(X=X_test)
# Bare name as the last expression: the notebook echoes the predicted values.
y_pred

array([ 0.10702315,  0.07685187,  0.2083241 ,  0.48685684,  0.12995459,
        0.19726353,  0.45755859,  0.00832554,  0.1556988 ,  0.16287349,
        0.14781084,  0.51037008,  0.4699569 ,  0.25944432,  0.14442757,
        0.13842465,  0.05134994,  0.49351746,  0.03848664,  0.0705325 ,
        0.04276279,  0.45855586,  0.22245537,  0.47182868,  0.47460549,
        0.06698449,  0.55116209,  0.5686984 ,  0.15588314,  0.20127518,
        0.0770878 ,  0.19051001, -0.00377298,  0.17967676,  0.61602286,
        0.18050297,  0.06028104,  0.0447116 ,  0.48287007,  0.13064786,
        0.08387671,  0.45610213,  0.54228697,  0.17124485,  0.09976616,
        0.03950023,  0.07028081,  0.12772953,  0.04874519,  0.13229562,
        0.09175793,  0.1653832 ,  0.47297483,  0.04437078,  0.16108797,
        0.14414371,  0.14902535,  0.02014774,  0.48426463,  0.13172362,
        0.2337337 ,  0.11691   ,  0.18103216,  0.00130779,  0.25281818,
        0.14612565,  0.14268127,  0.47671561,  0.38076578,  0.23

In [32]:
# Score the test-set predictions against the true targets
from sklearn.metrics import mean_squared_error, r2_score

# Compute both metrics first, then report them.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(f"MSE: {test_mse}")
print(f"R2 Score: {test_r2}")

MSE: 0.009150574110332004
R2 Score: 0.8045531086669286


In [33]:
# RMSE: square root of the test-set MSE, which puts the error back in the
# same units as the target column.
import math

squared_error = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(squared_error)
print(f"RMSE: {rmse}")

RMSE: 0.09565863322425218


In [34]:
# Save model
from pickle import dump

# Use a context manager so the file handle is flushed and closed as soon as
# the pickle is written, instead of leaking an open handle in the kernel.
with open("/workspaces/linear-regression-project-tutorial-pilarzarco/models/model_LR", "wb") as model_file:
    dump(model, model_file)