## INTRODUCTION 

In [2]:
import pandas as pd    
import numpy as np          
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error,root_mean_squared_error,mean_absolute_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import pickle

In [3]:
# Load the insurance dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("insurance.csv")

In [4]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


## PREPROCESSING 

In [28]:
def preprocess_inputs(df, test_size=0.2, random_state=100, cat_cols=None, num_cols=None):
    """Split the data into train/test sets, then encode and scale the features.

    The one-hot encoder and the scaler are fitted on the training split only
    and merely applied to the test split, so no test-set statistics leak into
    the fitted transformers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column 'expenses' plus every column named in
        ``cat_cols`` and ``num_cols``. It is not modified.
    test_size : float or int, default=0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default=100
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    cat_cols : list of str, optional
        Categorical columns to one-hot encode.
        Defaults to ['sex', 'smoker', 'region'].
    num_cols : list of str, optional
        Numerical columns to standardise.
        Defaults to ['age', 'bmi', 'children'].

    Returns
    -------
    tuple
        ``(x_train_processed, x_test_processed, y_train, y_test, encoder, scaler)``.
        The processed frames keep the row index of their split and hold the
        scaled numerical columns followed by the one-hot encoded columns.
        ``encoder`` and ``scaler`` are the fitted transformers.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    cat_cols = ['sex', 'smoker', 'region'] if cat_cols is None else list(cat_cols)
    num_cols = ['age', 'bmi', 'children'] if num_cols is None else list(num_cols)

    # Fail early with one clear message instead of a KeyError from deep inside pandas.
    missing = [col for col in ['expenses', *cat_cols, *num_cols] if col not in df.columns]
    if missing:
        raise KeyError(f"preprocess_inputs: missing required column(s): {missing}")

    # splitting the dataset into X and y (drop() returns a new frame, df is untouched)
    X = df.drop('expenses', axis=1)
    y = df['expenses']

    # train test split
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # one hot encode categorical columns; drop='first' removes one level per
    # feature, sparse_output=False returns a dense array
    encoder = OneHotEncoder(drop='first', sparse_output=False)
    x_train_encode = encoder.fit_transform(x_train[cat_cols])
    x_test_encode = encoder.transform(x_test[cat_cols])

    # the generated column names are the same for both splits, so compute them once
    encoded_cols = encoder.get_feature_names_out(cat_cols)
    x_train_encoded = pd.DataFrame(x_train_encode, index=x_train.index, columns=encoded_cols)
    x_test_encoded = pd.DataFrame(x_test_encode, index=x_test.index, columns=encoded_cols)

    # scale numerical columns
    scaler = StandardScaler()
    x_train_scale = scaler.fit_transform(x_train[num_cols])
    x_test_scale = scaler.transform(x_test[num_cols])

    x_train_scaled = pd.DataFrame(x_train_scale, index=x_train.index, columns=num_cols)
    x_test_scaled = pd.DataFrame(x_test_scale, index=x_test.index, columns=num_cols)

    # concatenate numerical and categorical features (numerical first)
    x_train_processed = pd.concat([x_train_scaled, x_train_encoded], axis=1)
    x_test_processed = pd.concat([x_test_scaled, x_test_encoded], axis=1)

    return x_train_processed, x_test_processed, y_train, y_test, encoder, scaler

In [29]:
x_train_processed, x_test_processed, y_train, y_test, encoder, scaler = preprocess_inputs(df)

# LINEAR REGRESSION

In [18]:
# Fit the linear regression baseline on the processed training features.
model_lr = LinearRegression().fit(x_train_processed, y_train)

# Predict on the held-out test split.
y_pred = model_lr.predict(x_test_processed)

# Evaluation metrics; RMSE is derived from MSE.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("r2_score:", r2)
print("Root Mean Square Error:", rmse)
print("Mean Absolute Error:", mae)

Mean Squared Error (MSE): 32193435.27377557
r2_score: 0.7946953084832675
Root Mean Square Error: 5673.925913666442
Mean Absolute Error: 3916.3077180168257


##### The linear regression model explains 79.47% of the variance in the target variable (R² = 0.7947), reflecting reasonably strong predictive performance. However, the sizeable error metrics (MSE = 32193435.27, RMSE = 5673.93, MAE = 3916.31) indicate that the model could be improved, for example by addressing outliers.

# DECISION TREE

In [19]:
# Seed the tree: without random_state the fitted tree, and therefore every
# metric below, can change from one run to the next.
# The seed value matches the one used for the train/test split.
tree_model = DecisionTreeRegressor(random_state=100)
tree_model.fit(x_train_processed, y_train)

# Predict on the held-out test split.
y_pred = tree_model.predict(x_test_processed)

# Evaluation metrics; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)


print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 43664494.76376082
r2_score: 0.7215418127493122
Root Mean Square Error: 6607.911528142672
Mean Absolute Error: 3360.137052238806


##### The DecisionTreeRegressor explains 72.15% of the variance in the target variable (R² = 0.7215), reflecting moderate efficacy. However, the elevated error metrics (MSE = 43664494.76, RMSE = 6607.91, MAE = 3360.14) underscore the potential for optimization, which could be achieved through hyperparameter tuning or the use of more sophisticated modeling approaches.

# KNEIGHBORS REGRESSOR

In [20]:
# Fit a k-nearest-neighbours regressor with its default hyperparameters.
knn_model = KNeighborsRegressor().fit(x_train_processed, y_train)

# Predict on the held-out test split.
y_pred = knn_model.predict(x_test_processed)

# Evaluation metrics; RMSE is derived from MSE.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("Mean Squared Error (MSE):", mse)
print("r2_score:", r2)
print("Root Mean Square Error:", rmse)
print("Mean Absolute Error:", mae)

Mean Squared Error (MSE): 44047140.7407018
r2_score: 0.7191015943138481
Root Mean Square Error: 6636.801996496642
Mean Absolute Error: 4122.186604477612


##### The KNeighborsRegressor explains 71.91% of the variance in the target variable (R² = 0.7191), indicating solid performance. However, the high error metrics (MSE = 44047140.74, RMSE = 6636.80, MAE = 4122.19) suggest that further refinement is needed, possibly through hyperparameter tuning or by considering more complex models.

# RANDOM FOREST REGRESSOR 

In [21]:
# Seed the forest: bootstrapping and feature sampling are random, so without
# random_state the metrics below differ on every run.
# The seed value matches the one used for the train/test split.
rf_model = RandomForestRegressor(random_state=100)
rf_model.fit(x_train_processed, y_train)

# Predict on the held-out test split.
y_pred = rf_model.predict(x_test_processed)

# Evaluation metrics; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)


print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 20663269.492154222
r2_score: 0.8682257381749646
Root Mean Square Error: 4545.686910924929
Mean Absolute Error: 2798.117290783582


##### The RandomForestRegressor explains 86.82% of the variance in the target variable (R² = 0.8682), demonstrating strong predictive accuracy. The relatively low error metrics (MSE = 20663269.49, RMSE = 4545.69, MAE = 2798.12) indicate that the model performs well, with smaller prediction errors than the previous models.

# GRADIENT BOOSTING REGRESSOR

In [22]:
# Seed the booster so the fitted model is reproducible; this is the estimator
# that gets pickled as model.pkl, so an unseeded fit would make the saved
# artifact differ between runs.
# The seed value matches the one used for the train/test split.
gbr_model = GradientBoostingRegressor(random_state=100)
gbr_model.fit(x_train_processed, y_train)

# Predict on the held-out test split.
y_pred = gbr_model.predict(x_test_processed)

# Evaluation metrics; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)


print("Mean Squared Error (MSE):", mse)
print("r2_score:",r2)
print("Root Mean Square Error:",rmse)
print("Mean Absolute Error:",mae)

Mean Squared Error (MSE): 15928005.850588573
r2_score: 0.898423566797931
Root Mean Square Error: 3990.990585129032
Mean Absolute Error: 2385.4712512154365


##### The GradientBoostingRegressor explains 89.84% of the variance in the target variable (R² = 0.8984), reflecting excellent performance. With the lowest error metrics of all the models (MSE = 15928005.85, RMSE = 3990.99, MAE = 2385.47), it demonstrates high accuracy and robustness in its predictions.

# OVERALL PREDICTIONS 

##### The GradientBoostingRegressor achieves the highest R² of all the regressors evaluated, indicating that it captures the complex patterns in our data most effectively. This makes it the most accurate and robust choice for prediction.








In [23]:
# Persist the fitted one-hot encoder so it can be reloaded later.
with open("encoder.pkl", "wb") as encoder_file:
    pickle.dump(encoder, encoder_file)

In [24]:
# Persist the fitted scaler so it can be reloaded later.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [25]:
# Persist the gradient boosting model, the best-scoring regressor above.
with open("model.pkl", "wb") as model_file:
    pickle.dump(gbr_model, model_file)