In [168]:
import pandas as pd
import numpy as np 
import seaborn as sns
import sklearn 

In [169]:
df = pd.read_csv('insurance.csv')

In [170]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [171]:
df['region'].unique

<bound method Series.unique of 0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object>

In [172]:
df['sex'] = df['sex'].map({ 'female' : 1 , 'male':0})

In [173]:
df['smoker'] = df['smoker'].map({ 'yes':1 , 'no':0})

In [174]:
dummies = pd.get_dummies(df['region'] , dtype=int , drop_first=True)

In [175]:
dummies

Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
1333,1,0,0
1334,0,0,0
1335,0,1,0
1336,0,0,1


In [176]:
new_df = pd.concat([df , dummies]  , axis=1 )

In [177]:
new_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northwest,southeast,southwest
0,19,1,27.900,0,1,southwest,16884.92400,0,0,1
1,18,0,33.770,1,0,southeast,1725.55230,0,1,0
2,28,0,33.000,3,0,southeast,4449.46200,0,1,0
3,33,0,22.705,0,0,northwest,21984.47061,1,0,0
4,32,0,28.880,0,0,northwest,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,northwest,10600.54830,1,0,0
1334,18,1,31.920,0,0,northeast,2205.98080,0,0,0
1335,18,1,36.850,0,0,southeast,1629.83350,0,1,0
1336,21,1,25.800,0,0,southwest,2007.94500,0,0,1


In [178]:
new_df.isna().sum()

age          0
sex          0
bmi          0
children     0
smoker       0
region       0
charges      0
northwest    0
southeast    0
southwest    0
dtype: int64

In [179]:
new_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        1338 non-null   int64  
 1   sex        1338 non-null   int64  
 2   bmi        1338 non-null   float64
 3   children   1338 non-null   int64  
 4   smoker     1338 non-null   int64  
 5   region     1338 non-null   object 
 6   charges    1338 non-null   float64
 7   northwest  1338 non-null   int32  
 8   southeast  1338 non-null   int32  
 9   southwest  1338 non-null   int32  
dtypes: float64(2), int32(3), int64(4), object(1)
memory usage: 89.0+ KB


In [180]:
std = new_df['charges'].std()
mean = new_df['charges'].mean()

# check outlier of target feature
anomaly_cut_off = std * 3

lower_limit = mean - anomaly_cut_off
upper_limit = mean + anomaly_cut_off

In [181]:
outlier_index = new_df[(new_df['charges'] > upper_limit) | ( new_df['charges'] < lower_limit)].index

In [182]:
new_df.drop( outlier_index , axis=0 , inplace=True)

In [195]:
new_df

Unnamed: 0,age,sex,bmi,children,smoker,northwest,southeast,southwest
0,19,1,27.900,0,1,0,0,1
1,18,0,33.770,1,0,0,1,0
2,28,0,33.000,3,0,0,1,0
3,33,0,22.705,0,0,1,0,0
4,32,0,28.880,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,1,0,0
1334,18,1,31.920,0,0,0,0,0
1335,18,1,36.850,0,0,0,1,0
1336,21,1,25.800,0,0,0,0,1


In [194]:
input = new_df.drop(columns=['charges','region']  , axis=1, inplace=True)

KeyError: "['charges', 'region'] not found in axis"

None


In [189]:
target = new_df['charges']

KeyError: 'charges'

In [130]:
target

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [132]:
target

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [167]:
from sklearn.model_selection import train_test_split
x_test , x_train , y_test , y_train = train_test_split( new_df , target , test_size=0.2 , random_state=2)

ValueError: Found input variables with inconsistent numbers of samples: [1331, 1338]

In [134]:
from sklearn.linear_model import LogisticRegression , LinearRegression ,Lasso , Ridge

In [135]:
linear = LinearRegression()
lasso = Lasso()
ridge = Ridge()



In [136]:
ridge.fit(x_train ,y_train)

In [137]:
linear.fit(x_train , y_train)


In [138]:
linear.score(x_test , y_test)

0.7456723273816117

In [139]:
lasso.fit(x_train ,y_train)

In [140]:
lasso.score(x_test , y_test)

0.7456764720803533

In [141]:
ridge.score(x_test , y_test)

0.7452144907750142

In [142]:
from sklearn.tree import DecisionTreeRegressor

In [143]:
tree = DecisionTreeRegressor( )

In [144]:
tree.fit(x_train , y_train)

In [145]:
tree.score(x_test , y_test)

0.7360177069418476

In [146]:
from sklearn.model_selection import cross_val_score

In [147]:
cross_val_score(LinearRegression() , input , target , cv=10)

array([0.78599896, 0.73249853, 0.73491249, 0.66880787, 0.77174242,
       0.78401994, 0.79314562, 0.66869189, 0.74242455, 0.76276472])