In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler,
    OrdinalEncoder,
    OneHotEncoder,
)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error, 
    r2_score,
    
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv("C:/Users/Sande/Desktop/Datasets/medical_insurance.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
df.shape

(2772, 7)

In [4]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [5]:

df.duplicated().sum()

np.int64(1435)

In [6]:
df=df.drop_duplicates()

In [7]:
df.shape

(1337, 7)

In [8]:
df.columns 

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [9]:
df.dtypes  


age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [10]:
df['sex'].value_counts() 

sex
male      675
female    662
Name: count, dtype: int64

In [11]:
df['smoker'].value_counts() 


smoker
no     1063
yes     274
Name: count, dtype: int64

In [12]:
df['region'].value_counts() 


region
southeast    364
southwest    325
northwest    324
northeast    324
Name: count, dtype: int64

In [13]:
train_data, test_data = train_test_split(df, random_state=42, test_size=0.2) 


In [14]:
X_train = train_data.iloc[:, 0:-1].values
y_train = train_data.iloc[:,-1].values

X_test = test_data.iloc[:, 0:-1].values
y_test = test_data.iloc[:,-1].values  


In [15]:
df.sample(1) 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
708,31,female,30.495,3,no,northeast,6113.23105


In [16]:
for i, col in enumerate(df.columns):
    print(f"{i}, {col}") 


0, age
1, sex
2, bmi
3, children
4, smoker
5, region
6, charges


In [17]:
Pipeline  = ColumnTransformer(transformers=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'),[1,5,4]),
    ('Scaler', StandardScaler(), [0, 2])
])


In [18]:
pipe = make_pipeline(Pipeline, RandomForestRegressor())
pipe


In [19]:
pipe.fit(X_train, y_train) 

In [20]:
y_pred = pipe.predict(X_test)

In [21]:
acc = r2_score(y_test, y_pred)
print(acc*100)

88.90191944964255
