In [83]:
import numpy as np
import pandas as pd
from scipy.stats import zscore

In [84]:
df = pd.read_csv('medical-costs.csv')

df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [85]:
mean_charges=df['charges'].mean()
mean_charges


13270.422265141257

In [86]:


from pandas_profiling import ProfileReport

eda_report = ProfileReport(df, minimal=True)

eda_report.to_widgets()



HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=16.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render widgets', max=1.0, style=ProgressStyle(description…

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [87]:

duplicateRowsDF = df[df.duplicated(keep = False)]
 
print("Duplicate Rows except first occurrence based on all columns are :")
print(duplicateRowsDF)

Duplicate Rows except first occurrence based on all columns are :
     age   sex    bmi  children smoker     region    charges
195   19  male  30.59         0     no  northwest  1639.5631
581   19  male  30.59         0     no  northwest  1639.5631


In [89]:
df.drop_duplicates(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


In [90]:
df_num = df.select_dtypes(exclude ='object')
num_features = df_num.columns.tolist()
num_features.remove('charges')


df_cat = df.select_dtypes(include ='object') 

cat_features = df_cat.columns.tolist()




In [91]:
cat_features

['sex', 'smoker', 'region']

In [92]:
num_features

['age', 'bmi', 'children']

In [93]:
X = df.drop(['charges'], axis=1)
y = df['charges']
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
dtypes: float64(1), int64(2), object(3)
memory usage: 73.1+ KB


In [94]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=31)

print(X_train.shape)
print(X_test.shape)


(1203, 6)
(134, 6)


In [95]:


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
num_pipeline = Pipeline(
    steps=[
        ('num_imputer', SimpleImputer()),
        ('scaler', StandardScaler()),
        ]
)
cat_pipeline = Pipeline(
    steps=[
        ('cat_imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown="ignore")),
    ]
)
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_features),
        ('cat_pipeline', cat_pipeline, cat_features),
    ]
)



In [96]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
r1 = LinearRegression()
r2 =SVR()
r3=SGDRegressor(max_iter= 100000)

voting_reg_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('voting_reg', VotingRegressor([('lr', r1),('svm', r2),('sgd', r3)])),
    ]
)


In [97]:
voting_reg_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num_pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('num_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                  

In [98]:
voting_pred=voting_reg_pipe.predict(X_test)

In [102]:
from sklearn.metrics import mean_squared_error
lin_mse = mean_squared_error(y_test, voting_pred) 
lin_rmse = np.sqrt(lin_mse)
lin_rmse

6046.689168352445

In [103]:
from sklearn.ensemble import RandomForestRegressor

random_forest_reg_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
       ('random_forest_reg', RandomForestRegressor()),
    ]
)

In [106]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        
        'random_forest_reg__max_depth': [2,5,10, 20, 30],
        'random_forest_reg__n_estimators': [200, 400,600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
        
    }
]
# set up the grid search 
grid_search = GridSearchCV(random_forest_reg_pipeline, param_grid, cv=5,
                          scoring='neg_root_mean_squared_error',
                          return_train_score=True)

# train the model using the full pipeline
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('num_imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                       

In [108]:
best_random_forest_reg = grid_search.best_estimator_

In [109]:
grid_search.best_params_

{'random_forest_reg__max_depth': 5, 'random_forest_reg__n_estimators': 2000}

In [110]:
pred_random_forest = best_random_forest_reg.predict(X_test)

In [111]:

lin_mse = mean_squared_error(y_test, pred_random_forest) 
lin_rmse = np.sqrt(lin_mse)
lin_rmse

3449.792327079637

In [112]:

from sklearn.ensemble import GradientBoostingRegressor

gbrt_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
       ('gbrt', GradientBoostingRegressor()),
    ]
)

In [113]:
from sklearn.model_selection import GridSearchCV

param_grid2 = [
    {
        
        'gbrt__max_depth': [2,5,10, 20, 30],
        'gbrt__n_estimators': [200, 400,600, 800, 1000, 1200, 1400, 1600, 1800, 2000]
        
    }
]
# set up the grid search 
grid_search2 = GridSearchCV(gbrt_pipeline, param_grid2, cv=5,
                          scoring='neg_root_mean_squared_error',
                          return_train_score=True)

# train the model using the full pipeline
grid_search2.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num_pipeline',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('num_imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                       

In [114]:
grid_search2.best_params_

{'gbrt__max_depth': 2, 'gbrt__n_estimators': 200}

In [115]:
best_gbrt = grid_search2.best_estimator_

In [116]:
pred_gbrt = best_gbrt.predict(X_test)

In [117]:
lin_mse = mean_squared_error(y_test, pred_gbrt) 
lin_rmse = np.sqrt(lin_mse)
lin_rmse

3233.327878144481

In [120]:
naive_pred=np.full((134), mean_charges)
lin_mse = mean_squared_error(y_test, naive_pred) 
lin_rmse = np.sqrt(lin_mse)
lin_rmse

11749.736729889188

In [124]:
from sklearn.model_selection import cross_val_score

voting_reg_rmse = cross_val_score(voting_reg_pipe, X_train, y_train, scoring="neg_root_mean_squared_error" ,cv=10)
best_random_forest_reg_rmse = cross_val_score(best_random_forest_reg, X_train, y_train, scoring="neg_root_mean_squared_error", cv=10)
best_gbrt_rmse = cross_val_score(best_gbrt, X_train, y_train, scoring="neg_root_mean_squared_error", cv=10)

 
print(f'voting_reg cv average rmse: {voting_reg_rmse.mean()}')
print(f'best_random_forest_reg cv average rmse: {best_random_forest_reg_rmse.mean()}')
print(f'best_gbrt cv average rmse: {best_gbrt_rmse.mean()}')


voting_reg cv average rmse: -7178.72640150221
best_random_forest_reg cv average rmse: -4607.498454232431
best_gbrt cv average rmse: -4659.678526594259


In this context the rmse is the  root of the average squared error, the error is the actual medical cost minus the medical cost I predicted. 3233(Gbrt) is quite good for rmse when you have an average cost of 13000 so my prediction has an avergae rmse which is less than 30 percent of the mean.
For my y_test the GradientBoostingRegressor works best, but in the cross vall score the random forest reg works best (in cross val the rsme is: 4607.498454232431, thats 40 % of my mean), 
I would say it predicted way better(about 8000 less of rmse) than my naive prediction , and thats quite good for me I think. Also my rmse is way below my standard  deviation, its only about one 4th(one third from the score of crossval)

In [126]:

df.loc[:,"charges"].std()

12110.359656344175