In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

## Reading the Dataset

In [2]:
train_df = pd.read_csv('insurance.csv')
train_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


## Dividing the Dataset
### the Xval is further divided into validation and test

In [3]:
X_train,X_val_test = train_test_split(train_df,test_size=0.3,random_state=0)

In [4]:
X_val,X_test = train_test_split(X_val_test,test_size=0.1,random_state=22)

In [5]:
# X_train = X_train.reset_index(drop=True)
# X_val = X_val.reset_index(drop=True)
# X_test = X_test.reset_index(drop=True)

In [6]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
1163,18,female,28.2,0,no,northeast,2200.83
196,39,female,32.8,0,no,southwest,5649.72
438,52,female,46.8,5,no,southeast,12592.53
183,44,female,26.4,0,no,northwest,7419.48
1298,33,male,27.5,2,no,northwest,5261.47


### Classification of numeric and categorical data

In [7]:
numeric_feat = ['age','bmi','children']
categorical_feat = ['sex','smoker','region']

## Cleaning the Data

In [8]:
numeric_transform = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy="mean")),
    ('scaler',StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num',numeric_transform,numeric_feat),
        ('cat',categorical_transformer,categorical_feat)
    ]
)

## Model Pipeline

In [9]:
model = Pipeline(steps=[
    ('preprocess',preprocessor),
    ('model',RandomForestRegressor(n_estimators=150,random_state=22))
])

## features for train and validation

In [10]:
X_feat =  X_train.iloc[:,:-1]
Y_train_label = X_train['expenses']

X_val_feat = X_val.iloc[:,:-1]
Y_val_labels = X_val['expenses']

X_feat.head(),Y_train_label.head()

(      age     sex   bmi  children smoker     region
 1163   18  female  28.2         0     no  northeast
 196    39  female  32.8         0     no  southwest
 438    52  female  46.8         5     no  southeast
 183    44  female  26.4         0     no  northwest
 1298   33    male  27.5         2     no  northwest,
 1163     2200.83
 196      5649.72
 438     12592.53
 183      7419.48
 1298     5261.47
 Name: expenses, dtype: float64)

## fitting the Model

In [11]:
model.fit(X_feat,Y_train_label)

In [12]:
y_pred = model.predict(X_val_feat)

In [13]:
Result = r2_score(y_pred,Y_val_labels)

In [14]:
print("The r2 score of the model is",Result)

The r2 score of the model is 0.8382163625345749


## Evaluating the model on test data and writing a new csv file

In [20]:
X_test_feat = X_test.iloc[:,:-1]
print(X_test_feat.head())
final_prediction = model.predict(X_test_feat)

      age     sex   bmi  children smoker     region
49     36    male  35.2         1    yes  southeast
1157   23  female  23.2         2     no  northwest
1059   32    male  33.8         1     no  northwest
1293   46    male  25.7         3     no  northwest
312    43    male  36.0         3    yes  southeast


In [23]:
X_test_feat = X_test_feat.reset_index() # shifts the index value and creates a new column called 'index'

In [25]:
out = pd.DataFrame({'Unique_ID':X_test_feat['index'],'expenses':final_prediction})
out.head()
## if you want to you can write into csv file.

Unnamed: 0,Unique_ID,expenses
0,49,39656.544667
1,1157,8115.204133
2,1059,9755.826133
3,1293,11298.548933
4,312,41799.2546
