In [89]:
import pandas as pd
import numpy as np 
import seaborn as sns
import sklearn 

In [90]:
df = pd.read_csv('insurance.csv')

In [91]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [92]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [93]:
df['region'].unique

<bound method Series.unique of 0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object>

In [94]:
df['sex'] = df['sex'].map({ 'female' : 1 , 'male':0})

In [95]:
df['smoker'] = df['smoker'].map({ 'yes':1 , 'no':0})

In [96]:
dummies = pd.get_dummies(df['region'] , dtype=int , drop_first=True)

In [97]:
dummies

Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
1333,1,0,0
1334,0,0,0
1335,0,1,0
1336,0,0,1


In [98]:
new_df = pd.concat([df , dummies]  , axis=1 )

In [99]:
new_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northwest,southeast,southwest
0,19,1,27.900,0,1,southwest,16884.92400,0,0,1
1,18,0,33.770,1,0,southeast,1725.55230,0,1,0
2,28,0,33.000,3,0,southeast,4449.46200,0,1,0
3,33,0,22.705,0,0,northwest,21984.47061,1,0,0
4,32,0,28.880,0,0,northwest,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,northwest,10600.54830,1,0,0
1334,18,1,31.920,0,0,northeast,2205.98080,0,0,0
1335,18,1,36.850,0,0,southeast,1629.83350,0,1,0
1336,21,1,25.800,0,0,southwest,2007.94500,0,0,1


In [100]:
new_df.isna().sum()

age          0
sex          0
bmi          0
children     0
smoker       0
region       0
charges      0
northwest    0
southeast    0
southwest    0
dtype: int64

In [101]:
std = new_df['charges'].std()
mean = new_df['charges'].mean()

# check outlier of target feature
anomaly_cut_off = std * 3

lower_limit = mean - anomaly_cut_off
upper_limit = mean + anomaly_cut_off

In [102]:
outlier_index = new_df[(new_df['charges'] > upper_limit) | ( new_df['charges'] < lower_limit)].index

In [103]:
outlier_index

Index([34, 543, 577, 819, 1146, 1230, 1300], dtype='int64')

In [104]:
new_df.drop( outlier_index , axis=0 , inplace=True)

In [105]:
new_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northwest,southeast,southwest
0,19,1,27.900,0,1,southwest,16884.92400,0,0,1
1,18,0,33.770,1,0,southeast,1725.55230,0,1,0
2,28,0,33.000,3,0,southeast,4449.46200,0,1,0
3,33,0,22.705,0,0,northwest,21984.47061,1,0,0
4,32,0,28.880,0,0,northwest,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,northwest,10600.54830,1,0,0
1334,18,1,31.920,0,0,northeast,2205.98080,0,0,0
1335,18,1,36.850,0,0,southeast,1629.83350,0,1,0
1336,21,1,25.800,0,0,southwest,2007.94500,0,0,1


In [106]:
new_df['age'].unique()

array([19, 18, 28, 33, 32, 31, 46, 37, 60, 25, 62, 23, 56, 27, 52, 30, 34,
       59, 63, 55, 22, 26, 35, 24, 41, 38, 36, 21, 48, 40, 58, 53, 43, 64,
       20, 61, 44, 57, 29, 45, 54, 49, 47, 51, 42, 50, 39], dtype=int64)

In [107]:
new_df[(new_df['age'] < 20) & new_df['children']>0].index

Index([   1,   15,   32,  106,  149,  168,  248,  270,  369,  469,  571,  636,
        710,  897,  960, 1016, 1023, 1026, 1170, 1195, 1204, 1283, 1299, 1315],
      dtype='int64')

In [108]:
new_df.drop(new_df[(new_df['age'] < 20) & new_df['children']>0].index , axis=0 , inplace=True)

In [109]:
new_df[new_df['bmi'] > 50].index

Index([847, 1047, 1317], dtype='int64')

In [110]:
new_df.drop(new_df[new_df['bmi'] > 50].index , axis=0 , inplace=True)
new_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northwest,southeast,southwest
0,19,1,27.900,0,1,southwest,16884.92400,0,0,1
2,28,0,33.000,3,0,southeast,4449.46200,0,1,0
3,33,0,22.705,0,0,northwest,21984.47061,1,0,0
4,32,0,28.880,0,0,northwest,3866.85520,1,0,0
5,31,1,25.740,0,0,southeast,3756.62160,0,1,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,northwest,10600.54830,1,0,0
1334,18,1,31.920,0,0,northeast,2205.98080,0,0,0
1335,18,1,36.850,0,0,southeast,1629.83350,0,1,0
1336,21,1,25.800,0,0,southwest,2007.94500,0,0,1


In [111]:
new_df.drop(columns=['region'] ,axis=1 , inplace=True)

In [112]:
new_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,1,27.900,0,1,16884.92400,0,0,1
2,28,0,33.000,3,0,4449.46200,0,1,0
3,33,0,22.705,0,0,21984.47061,1,0,0
4,32,0,28.880,0,0,3866.85520,1,0,0
5,31,1,25.740,0,0,3756.62160,0,1,0
...,...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,10600.54830,1,0,0
1334,18,1,31.920,0,0,2205.98080,0,0,0
1335,18,1,36.850,0,0,1629.83350,0,1,0
1336,21,1,25.800,0,0,2007.94500,0,0,1


In [113]:
input = new_df.drop(columns=['charges'] ,axis=1)

In [114]:
target = new_df['charges']

In [115]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression ,Lasso , Ridge
from sklearn.tree import DecisionTreeRegressor


In [116]:
cross_val_score(DecisionTreeRegressor() , input , target, cv=3).mean()

0.6796060214280767

In [117]:
cross_val_score(Ridge() , input ,target , cv=3).mean()

0.7469288467688608

In [118]:
cross_val_score(Lasso() , input ,target , cv=3).mean()

0.7469811833765618

In [119]:
cross_val_score(LinearRegression() , input ,target , cv=3).mean()

0.7469631280186779

In [120]:
from sklearn.ensemble import RandomForestRegressor

In [124]:
cross_val_score(RandomForestRegressor(n_estimators=300), input , target , cv=3).mean()
random_forest = RandomForestRegressor(n_estimators=300)

In [125]:
input

Unnamed: 0,age,sex,bmi,children,smoker,northwest,southeast,southwest
0,19,1,27.900,0,1,0,0,1
2,28,0,33.000,3,0,0,1,0
3,33,0,22.705,0,0,1,0,0
4,32,0,28.880,0,0,1,0,0
5,31,1,25.740,0,0,0,1,0
...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,1,0,0
1334,18,1,31.920,0,0,0,0,0
1335,18,1,36.850,0,0,0,1,0
1336,21,1,25.800,0,0,0,0,1


In [129]:
from sklearn.model_selection import train_test_split


x_train , x_test , y_train , y_test = train_test_split(input , target ,test_size=0.2 ,random_state=2)

In [130]:
random_forest.fit(x_train , y_train)

In [135]:
random_forest.predict([[19	,1	,27.900,	0	,1	,0,	0,	1]])



array([17150.40578343])

In [133]:
input

Unnamed: 0,age,sex,bmi,children,smoker,northwest,southeast,southwest
0,19,1,27.900,0,1,0,0,1
2,28,0,33.000,3,0,0,1,0
3,33,0,22.705,0,0,1,0,0
4,32,0,28.880,0,0,1,0,0
5,31,1,25.740,0,0,0,1,0
...,...,...,...,...,...,...,...,...
1333,50,0,30.970,3,0,1,0,0
1334,18,1,31.920,0,0,0,0,0
1335,18,1,36.850,0,0,0,1,0
1336,21,1,25.800,0,0,0,0,1
