In [55]:
import pandas as pd

In [56]:
# Load the dataset from 'insurance.csv' (path is relative to the working directory).
train = pd.read_csv('insurance.csv')

In [57]:
# Column dtypes, non-null counts and memory footprint.
train.info()

<class 'pandas.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   str    
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   str    
 5   region    1338 non-null   str    
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), str(3)
memory usage: 73.3 KB


In [58]:
# Preview the first rows of the raw frame.
train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [59]:
# Distinct values in 'sex', to decide how to encode it.
train['sex'].unique()

<StringArray>
['female', 'male']
Length: 2, dtype: str

In [60]:
# Distinct values in 'region', to decide how to encode it.
train['region'].unique()

<StringArray>
['southwest', 'southeast', 'northwest', 'northeast']
Length: 4, dtype: str

In [61]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [62]:
# Encode the two binary string columns as integer codes (classes are sorted,
# so 'female'/'no' -> 0 and 'male'/'yes' -> 1).
# Each column gets its own encoder so that `le_smoker.classes_` and
# `le_sex.classes_` map back to that column's original labels via
# inverse_transform.
le_smoker = LabelEncoder()
le_sex = LabelEncoder()
train['smoker'] = le_smoker.fit_transform(train['smoker'])
train['sex'] = le_sex.fit_transform(train['sex'])

In [63]:
# Check that 'sex' and 'smoker' now hold integer codes.
train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [64]:
# One-hot encode 'region'. drop="first" omits the first category's column so
# the dummy columns are not perfectly collinear with the intercept of a
# linear model; sparse_output=False returns a dense ndarray.
ohe = OneHotEncoder(drop="first", sparse_output=False)

deck_encoded = ohe.fit_transform(train[["region"]])

# Reuse train's index so the column-wise concat below aligns row-for-row even
# when `train` does not carry a default 0..n-1 index (e.g. after filtering);
# otherwise concat(axis=1) would align on mismatched labels and produce NaNs.
deck_df = pd.DataFrame(
    deck_encoded,
    columns=ohe.get_feature_names_out(["region"]),
    index=train.index,
)

# Swap the original string column for its encoded dummy columns.
train = pd.concat([train.drop(columns=["region"]), deck_df], axis=1)

In [65]:
# Check the frame after replacing 'region' with its one-hot columns.
train.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0.0,0.0,1.0
1,18,1,33.77,1,0,1725.5523,0.0,1.0,0.0
2,28,1,33.0,3,0,4449.462,0.0,1.0,0.0
3,33,1,22.705,0,0,21984.47061,1.0,0.0,0.0
4,32,1,28.88,0,0,3866.8552,1.0,0.0,0.0


In [66]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Target is the 'charges' column; every remaining column is a feature.
y = train['charges']
X = train.drop(columns=['charges'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Unfitted estimator with default settings; fitted in a later cell.
model = LinearRegression()

In [67]:
# Fit the linear regression on the training split (target: raw 'charges').
model.fit(X_train,y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [70]:
from sklearn.metrics import mean_squared_error
# Score the fitted model on the held-out rows.
pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=pred)
print("MSE:", mse)


MSE: 33596915.851361446


In [71]:
# Summary statistics of the test-set target, as a scale reference for the error metric.
y_test.describe()


count      268.000000
mean     12968.317063
std      12483.196202
min       1131.506600
25%       4288.744850
50%       8487.880300
75%      16367.829375
max      63770.428010
Name: charges, dtype: float64

In [72]:
from sklearn.metrics import r2_score
# Coefficient of determination on the test split; the bare last line displays it.
r2 = r2_score(y_true=y_test, y_pred=pred)
r2


0.7835929767120724

In [73]:
import numpy as np

# Model log1p(charges) instead of raw charges.
y_train_log, y_test_log = np.log1p(y_train), np.log1p(y_test)

# NOTE(review): this refits the same `model` object and rebinds `pred`, so the
# raw-target fit and its predictions are no longer available after this cell.
pred_log = model.fit(X_train, y_train_log).predict(X_test)

# Undo the log1p transform to get predictions back in the units of 'charges'.
pred = np.expm1(pred_log)


In [75]:
from sklearn.metrics import r2_score

# R² measured in log1p space, i.e. predictions vs. log1p(charges).
r2_log = r2_score(y_test_log, pred_log)
print("R² (log scale):", r2_log)

# The log-space score is not comparable with an R² computed on raw charges,
# so also score the back-transformed predictions against the raw target.
# expm1 is applied to pred_log directly rather than relying on `pred`, whose
# value depends on which cell assigned it last.
r2_log_orig = r2_score(y_test, np.expm1(pred_log))
print("R² (original scale, log-target model):", r2_log_orig)


R² (log scale): 0.8047410791393022
