In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Load the raw dataset from a CSV located relative to the working directory.
df = pd.read_csv('insurance.csv')

In [4]:
# Per-column count of missing values (isna is the canonical spelling of isnull).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# `smoker` is the target; every other column is a feature.
# Hold out 20% of the rows, with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns='smoker'),
    df['smoker'],
    test_size=0.2,
    random_state=0,
)

In [6]:
# (rows, columns) of the training and test feature frames.
(x_train.shape, x_test.shape)

((1070, 6), (268, 6))

In [7]:
# Lengths of the training and test target series.
(y_train.shape, y_test.shape)

((1070,), (268,))

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Frequency of each distinct value in the `region` column.
df['region'].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [10]:
# Frequency of each distinct value in the `children` column.
df['children'].value_counts()

children
0    574
1    324
2    240
3    157
4     25
5     18
Name: count, dtype: int64

In [11]:
from sklearn.compose import ColumnTransformer

In [12]:
# One-hot encode the `sex` and `region` columns, dropping the first level of
# each so no redundant dummy column is produced. Dense output is requested, and
# every column not listed here is passed through unchanged.
transformer2 = ColumnTransformer(
    transformers=[
        ('tnf2', OneHotEncoder(sparse_output=False, drop='first'), ['sex', 'region']),
    ],
    remainder='passthrough',
)

In [13]:
# Learn the encoding from the training split only, then apply that same fitted
# encoder to the test split. Refitting on the test data would let the test set
# define its own categories/column layout, which is not guaranteed to match
# what the downstream scaler and model were trained on.
x_train_transformed = transformer2.fit_transform(x_train)
x_test_transformed = transformer2.transform(x_test)

In [14]:
from sklearn.preprocessing import LabelEncoder

# Fit the class -> integer mapping on the training targets only and reuse it
# for the test targets, so each label maps to the same integer in both splits.
label_encoder = LabelEncoder()
y_train_transformed = label_encoder.fit_transform(y_train)
y_test_transformed = label_encoder.transform(y_test)

In [15]:
# Inspect the encoded training feature matrix produced by `transformer2`.
x_train_transformed

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.41000000e+01, 4.00000000e+00, 4.01822460e+04],
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        3.44300000e+01, 0.00000000e+00, 1.13746970e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.66700000e+01, 2.00000000e+00, 3.85116283e+04],
       ...,
       [1.00000000e+00, 0.00000000e+00, 1.00000000e+00, ...,
        2.50800000e+01, 0.00000000e+00, 5.41566120e+03],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        3.55300000e+01, 0.00000000e+00, 1.64642970e+03],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.85000000e+01, 1.00000000e+00, 4.76602200e+03]])

In [16]:
# Inspect the integer-encoded training target.
y_train_transformed

array([1, 0, 1, ..., 0, 0, 0])

In [17]:
from sklearn.preprocessing import StandardScaler

# Standardise every column using statistics computed from the training matrix
# only; the test matrix is transformed with those same training statistics.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train_transformed)
x_test_scaled = scaler.transform(x_test_transformed)

In [18]:
# Inspect the standardised training feature matrix.
x_train_scaled

array([[ 1.01506676, -0.55791981, -0.6155209 , ...,  0.54530479,
         2.41394802,  2.25338687],
       [ 1.01506676, -0.55791981,  1.6246402 , ...,  0.59867181,
        -0.89219519, -1.00752925],
       [-0.98515688, -0.55791981, -0.6155209 , ...,  0.96092064,
         0.76087642,  2.11386131],
       ...,
       [ 1.01506676, -0.55791981,  1.6246402 , ..., -0.91339361,
        -0.89219519, -0.65022604],
       [ 1.01506676,  1.79237229, -0.6155209 , ...,  0.77656186,
        -0.89219519, -0.96502226],
       [-0.98515688, -0.55791981, -0.6155209 , ..., -1.97749955,
        -0.06565939, -0.70448219]])

In [19]:
from sklearn.linear_model import LinearRegression


In [20]:
# NOTE(review): this model is later fitted against `y_train_transformed`, the
# label-encoded `smoker` column, i.e. a least-squares regression on a class
# label. Confirm a classifier (e.g. LogisticRegression) is not what was intended.
lr = LinearRegression()


In [21]:
# Fit on the standardised training features against the integer-encoded target.
lr.fit(x_train_scaled,y_train_transformed)

In [25]:
# Predictions for the standardised test features. These are raw regression
# outputs (continuous values), not thresholded class labels.
lr.predict(x_test_scaled)

array([ 4.75104326e-03,  4.03581092e-02,  9.80631120e-01, -5.59631797e-02,
        1.31432016e-01,  8.96632112e-02,  8.37575650e-02, -4.11051523e-02,
        1.25544791e-02,  8.11035750e-03,  1.41102659e-01,  5.75639433e-02,
       -3.27367231e-03,  8.59982138e-02,  4.80766774e-01,  7.41044801e-02,
        9.01023632e-02, -5.63563024e-03,  1.22114466e-02,  9.64808934e-01,
        4.78217735e-01, -7.73051324e-03,  3.85027094e-01,  4.99235432e-01,
        9.27649030e-03, -7.03555456e-02,  1.46565364e-01,  1.19177356e-02,
        7.38356747e-02,  1.43893073e-03,  2.84844642e-02,  1.03482446e+00,
       -4.54351871e-02,  2.39416641e-01,  4.66792116e-01,  2.85673327e-02,
       -4.99323232e-02,  1.36527552e+00,  9.46282590e-01, -9.24867821e-03,
        1.06697087e-01,  5.15407488e-02,  4.62106725e-01,  9.93108298e-01,
        1.02353414e+00,  6.82054615e-01,  3.47215923e-01,  3.60723383e-02,
        1.00297909e-01,  4.25037651e-01, -2.69058798e-02,  5.25224021e-01,
        4.72744255e-01,  

In [24]:
# `score` on a LinearRegression returns the coefficient of determination (R^2)
# on the given data; it is printed here scaled by 100. This is not a
# classification accuracy.
r2score = lr.score(x_test_scaled,y_test_transformed)
print(r2score*100,'%')

75.820150198842 %
