In [1]:
import pandas as pd

In [2]:
# Load the insurance dataset into a DataFrame.
# NOTE(review): this is a hard-coded absolute path; consider a configurable data directory.
df = pd.read_csv("/data/insurance.csv")

In [4]:
df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Name of the column the model will predict.
target = "charges"

In [6]:
# Response vector: the target column of the raw frame.
y = df[target]

In [7]:
# Feature frame: every column of df except the target.
X = df.drop(columns=target)

In [10]:
X.isna().sum()

age         0
gender      0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [11]:
X.head()

Unnamed: 0,age,gender,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


In [17]:
# One-hot encode the categorical columns (gender, smoker, region).
# drop_first=True omits the first level of each category, so each remaining
# indicator column is read relative to that omitted baseline level.

X_dummy = pd.get_dummies(X, drop_first=True)
X_dummy.head()

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,32,28.88,0,1,0,1,0,0


In [29]:
# Keep the encoded column names so the scaled NumPy arrays can be re-labelled later.
features = X_dummy.columns
features

Index(['age', 'bmi', 'children', 'gender_male', 'smoker_yes',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [18]:
from sklearn import model_selection

In [19]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
split_parts = model_selection.train_test_split(
    X_dummy,
    y,
    test_size=0.3,
    random_state=1,
)
X_train, X_test, y_train, y_test = split_parts

In [20]:
X_train

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
744,50,26.410,0,1,0,1,0,0
363,21,26.400,1,0,0,0,0,1
10,25,26.220,0,1,0,0,0,0
970,50,28.160,3,0,0,0,1,0
634,51,39.700,1,1,0,0,0,1
554,25,41.325,0,0,0,0,0,0
314,27,31.400,0,0,1,0,0,1
419,63,26.980,0,0,1,1,0,0
525,18,33.880,0,0,0,0,1,0
1041,18,23.085,0,1,0,0,0,0


In [22]:
# Z transformation: z = (x - x_mean) / x_std, applied to each feature x.
# After the transformation, each feature has mean ~ 0 and standard deviation ~ 1.

In [23]:
from sklearn import preprocessing

In [24]:
# Learn the per-column mean and standard deviation from the training split only,
# so the test split does not influence the scaling parameters.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [33]:
# Apply the scaling parameters learned from the training split to both splits.
X_train_std, X_test_std = (
    scaler.transform(part) for part in (X_train, X_test)
)

In [30]:
pd.DataFrame(X_train_std, columns=features).head()

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,0.797152,-0.702114,-0.904002,0.976766,-0.509664,1.788102,-0.600387,-0.58722
1,-1.271085,-0.703758,-0.085679,-1.023787,-0.509664,-0.559252,-0.600387,1.702939
2,-0.985811,-0.73334,-0.904002,0.976766,-0.509664,-0.559252,-0.600387,-0.58722
3,0.797152,-0.41451,1.550967,-1.023787,-0.509664,-0.559252,1.665591,-0.58722
4,0.868471,1.482037,-0.085679,0.976766,-0.509664,-0.559252,-0.600387,1.702939


In [32]:
pd.DataFrame(X_train_std, columns=features).describe()

Unnamed: 0,age,bmi,children,gender_male,smoker_yes,region_northwest,region_southeast,region_southwest
count,936.0,936.0,936.0,936.0,936.0,936.0,936.0,936.0
mean,-1.791065e-16,-4.635418e-16,-2.910332e-16,1.7673420000000003e-17,8.979047e-17,-1.088873e-16,4.886879e-17,-2.3722710000000002e-17
std,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535,1.000535
min,-1.485041,-2.419524,-0.9040023,-1.023787,-0.5096643,-0.5592522,-0.6003875,-0.5872202
25%,-0.9144925,-0.7247116,-0.9040023,-1.023787,-0.5096643,-0.5592522,-0.6003875,-0.5872202
50%,-0.05867016,-0.03076297,-0.08567913,0.9767656,-0.5096643,-0.5592522,-0.6003875,-0.5872202
75%,0.8684707,0.6631857,0.732644,0.9767656,-0.5096643,-0.5592522,1.665591,1.702939
max,1.795612,3.689196,3.187613,0.9767656,1.962076,1.788102,1.665591,1.702939


In [34]:
from sklearn import linear_model

In [35]:
# Ordinary least-squares linear regression with default settings.
est = linear_model.LinearRegression()

In [37]:
# Fit the regression on the standardized training features.
est.fit(X_train_std, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [40]:
# In-sample predictions, used to measure the training error.
y_train_pred = est.predict(X_train_std)

In [41]:
# Predictions on the held-out split, used to measure the test error.
y_test_pred = est.predict(X_test_std)

In [43]:
# Mean squared error on each split: average of the squared residuals.
train_err = y_train_pred - y_train
test_err = y_test_pred - y_test
mse_train = (train_err ** 2).mean()
mse_test = (test_err ** 2).mean()

In [48]:
# Root mean squared error: square root of the mean squared residual,
# which puts the error back in the units of the target.
sq_err_train = (y_train_pred - y_train) ** 2
sq_err_test = (y_test_pred - y_test) ** 2
rmse_train = sq_err_train.mean() ** 0.5
rmse_test = sq_err_test.mean() ** 0.5

In [45]:
mse_train/ mse_test

0.992256411574651

In [47]:
mse_train

36476790.76410685

In [49]:
rmse_train, rmse_test

(6039.601871324537, 6063.122656850451)

In [50]:
y_test.std()

11919.315461904513

In [51]:
y_train.mean()

13276.698553898494

In [56]:
# Total sum of squares: squared deviation of each target value from the mean
# of its own split.
dev_train = y_train - y_train.mean()
dev_test = y_test - y_test.mean()
SST_train = (dev_train ** 2).sum()
SST_test = (dev_test ** 2).sum()

In [55]:
# Sum of squared errors: squared gap between prediction and actual value.
resid_train = y_train_pred - y_train
resid_test = y_test_pred - y_test
sse_train = (resid_train ** 2).sum()
sse_test = (resid_test ** 2).sum()

In [58]:
# R^2 = 1 - SSE / SST: the share of the target's variation the model accounts for.
unexplained_train = sse_train / SST_train
unexplained_test = sse_test / SST_test
r2_train = 1 - unexplained_train
r2_test = 1 - unexplained_test

In [59]:
r2_train, r2_test

(0.7545557492633161, 0.7405989316927211)

In [60]:
from sklearn import metrics

In [62]:
metrics.r2_score(y_train, y_train_pred)

0.7545557492633161

In [63]:
metrics.r2_score(y_test, y_test_pred)

0.7405989316927211

In [69]:
import numpy as np

In [81]:
# Experiment: regress log(charges) on degree-4 polynomial features.
#
# NOTE: the recorded run of this cell shows a train R^2 of about 0.88 next to an
# astronomically negative test R^2, i.e. the unregularized degree-4 expansion
# does not generalize. Treat this cell as a demonstration of overfitting.
target = "charges"
y = np.log(df[target])  # model the log of the target

# Start again from the raw frame and add two hand-made features before encoding.
X = df.drop(columns=target)
X["age_group"] = np.where(X.age>60, "g1", "g2")  # "g1" = older than 60
X["high_bmi"] = np.where(X.bmi>33, 1, 0)  # 1 when bmi is above 33
X = pd.get_dummies(X, drop_first=True)


# Column names BEFORE the polynomial expansion; they do not label the expanded matrix.
features = X.columns

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size = 0.3, random_state = 1)
poly = preprocessing.PolynomialFeatures(degree=4, include_bias=False)

# Fit every transformer on the training split only, then merely apply it to the
# test split. The original code called fit_transform on X_test, which re-fits
# the transformer on test data.
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

scaler = preprocessing.StandardScaler()
scaler.fit(X_train) # calculates the mean and std for each column

X_train_std = scaler.transform(X_train) # calculates the z score for each column
X_test_std = scaler.transform(X_test)

est = linear_model.LinearRegression()
est.fit(X_train_std, y_train) # calculates the coefficients

y_train_pred = est.predict(X_train_std) # calculates the estimated value
y_test_pred = est.predict(X_test_std)

# Errors are on the log scale because y is log(charges).
rmse_train = (((y_train_pred - y_train) ** 2).mean()) ** 0.5
rmse_test = (((y_test_pred - y_test) ** 2).mean()) ** 0.5

r2_train = metrics.r2_score(y_train, y_train_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

print("r2_train: ", r2_train)
print("r2_test", r2_test)

print("rmse_train: ", rmse_train)
print("rmse_test", rmse_test)



r2_train:  0.8827348365825956
r2_test -2.71957117262123e+23
rmse_train:  0.315055188328863
rmse_test 478295453038.6203


In [65]:
# Intercept of the fitted linear regression.
# NOTE(review): this cell's execution count (65) predates the refit cell placed
# above it (81), and the recorded value matches y_train.mean() of the raw-charges
# model. The output therefore appears to belong to the first model, not the
# log/polynomial one -- confirm the intended cell order.
est.intercept_

13276.698553898505

In [67]:
# Pair each fitted coefficient with its feature name.
# NOTE(review): the recorded table has the 8 one-hot features of the first model.
# Run in file order (after the polynomial refit above), est.coef_ would presumably
# no longer have the same length as `features` -- verify on a fresh kernel.
pd.DataFrame({"coefficient": est.coef_, "feature": features})

Unnamed: 0,coefficient,feature
0,3528.982731,age
1,1961.655208,bmi
2,421.550164,children
3,-141.35911,gender_male
4,9733.786883,smoker_yes
5,-129.545886,region_northwest
6,-414.541483,region_southeast
7,-379.095344,region_southwest


In [74]:
# Small 4x3 integer matrix used to show what PolynomialFeatures produces.
sample_rows = (
    (1, 2, 3),
    (-1, 0, 1),
    (1, 1, 1),
    (4, 5, 0),
)
a = np.array([list(row) for row in sample_rows])

In [75]:
from sklearn import preprocessing

In [78]:
# Degree-2 expansion: original columns plus their squares and pairwise products.
# include_bias=False leaves out the constant column of ones.
poly = preprocessing.PolynomialFeatures(degree=2, include_bias=False)

In [79]:
poly.fit_transform(a)

array([[ 1.,  2.,  3.,  1.,  2.,  3.,  4.,  6.,  9.],
       [-1.,  0.,  1.,  1., -0., -1.,  0.,  0.,  1.],
       [ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.],
       [ 4.,  5.,  0., 16., 20.,  0., 25.,  0.,  0.]])

In [82]:
from sklearn import pipeline

In [84]:
# Regress log(charges) on degree-2 polynomial features, with the whole
# transform chain (expand -> standardize -> fit) wrapped in one Pipeline so
# each step is fitted on the training split only.
target = "charges"
y = np.log(df[target])

# Raw features plus two hand-made ones, then one-hot encode the categoricals.
X = pd.get_dummies(
    df.drop(columns=target).assign(
        age_group=lambda frame: np.where(frame.age > 60, "g1", "g2"),
        high_bmi=lambda frame: np.where(frame.bmi > 33, 1, 0),
    ),
    drop_first=True,
)

features = X.columns

split_parts = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)
X_train, X_test, y_train, y_test = split_parts

steps = [
    ("poly", preprocessing.PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", preprocessing.StandardScaler()),
    ("est", linear_model.LinearRegression()),
]
pipe = pipeline.Pipeline(steps)
pipe.fit(X_train, y_train)

# Predictions for both splits (on the log scale, like y).
y_train_pred = pipe.predict(X_train)
y_test_pred = pipe.predict(X_test)

# Root mean squared error per split.
train_resid = y_train_pred - y_train
test_resid = y_test_pred - y_test
rmse_train = (train_resid ** 2).mean() ** 0.5
rmse_test = (test_resid ** 2).mean() ** 0.5

r2_train = metrics.r2_score(y_train, y_train_pred)
r2_test = metrics.r2_score(y_test, y_test_pred)

report = (
    ("r2_train: ", r2_train),
    ("r2_test", r2_test),
    ("rmse_train: ", rmse_train),
    ("rmse_test", rmse_test),
)
for label, value in report:
    print(label, value)



r2_train:  0.8323388378208272
r2_test 0.8673462403917107
rmse_train:  0.3767197294469794
rmse_test 0.3340456489077297
