In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [53]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("insurance.csv")

In [54]:
# Preview the first rows of the raw data (before any encoding).
print(df.head())

   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [40]:
# Inspect the only multi-valued categorical column; it is one-hot encoded further down.
print(df["region"])

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object


In [55]:
# Integer-encode the two binary categorical columns in place.
#
# Series.map() turns every value that is missing from the mapping into NaN, so
# running the original cell a second time (after the strings were already
# replaced by integers) wiped both columns. The dtype guard makes the cell safe
# to re-run: a column that is already numeric is left untouched.
#
# NOTE: smoker is coded yes=1 / no=2 (not 0/1). That coding is kept as-is
# because the scaler fitted later in this notebook learns from these values;
# any code that prepares new rows for prediction must use the same coding.
binary_codes = {
    'sex': {'female': 0, 'male': 1},
    'smoker': {'yes': 1, 'no': 2},
}
for column, codes in binary_codes.items():
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map(codes)


In [56]:
# Confirm that sex and smoker now hold integer codes.
print(df.head())

   age  sex     bmi  children  smoker     region      charges
0   19    0  27.900         0       1  southwest  16884.92400
1   18    1  33.770         1       2  southeast   1725.55230
2   28    1  33.000         3       2  southeast   4449.46200
3   33    1  22.705         0       2  northwest  21984.47061
4   32    1  28.880         0       2  northwest   3866.85520


In [44]:
# List the columns currently present in df.
# NOTE(review): the recorded output for this cell appears to come from an earlier
# run (it shows region_* dummy columns and no 'region' column, which the current
# cells never write into df) -- re-run the notebook top to bottom to refresh it.
print(df.columns)

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')


In [64]:
# Columns that are passed through StandardScaler. 'sex' and 'smoker' are the
# integer codes produced by the mapping cell above, not raw strings.
numerical_cols = ['age', 'bmi', 'children', 'sex', 'smoker'] 
# Columns that are one-hot encoded.
non_numerical_cols=['region']

In [65]:
# Standardize the numeric columns; the result is a plain NumPy array whose
# columns follow the order of numerical_cols.
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split performed later, so test-set statistics influence the
# scaling. Fitting on the training rows only would avoid that leakage.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[numerical_cols])
print(scaled_data)

[[-1.43876426 -0.45332    -0.90861367 -1.0105187  -1.97058663]
 [-1.50996545  0.5096211  -0.07876719  0.98959079  0.5074631 ]
 [-0.79795355  0.38330685  1.58092576  0.98959079  0.5074631 ]
 ...
 [-1.50996545  1.0148781  -0.90861367 -1.0105187   0.5074631 ]
 [-1.29636188 -0.79781341 -0.90861367 -1.0105187   0.5074631 ]
 [ 1.55168573 -0.26138796 -0.90861367 -1.0105187  -1.97058663]]


In [68]:
# One-hot encode 'region'. drop='first' removes the first category's column so
# the remaining dummies are not perfectly collinear; sparse_output=False returns
# a dense array instead of a sparse matrix.
# NOTE(review): like the scaler, the encoder is fitted on all rows before the
# train/test split.
encoder = OneHotEncoder(drop='first',sparse_output=False)
region_encoded = encoder.fit_transform(df[non_numerical_cols])

In [75]:
# Give the encoded region array column labels and the same row index as df so
# it can be concatenated column-wise with the other per-row frames.
region_columns = encoder.get_feature_names_out(['region'])
region_df = pd.DataFrame(data=region_encoded, index=df.index, columns=region_columns)
print(region_df)


      region_northwest  region_southeast  region_southwest
0                  0.0               0.0               1.0
1                  0.0               1.0               0.0
2                  0.0               1.0               0.0
3                  1.0               0.0               0.0
4                  1.0               0.0               0.0
...                ...               ...               ...
1333               1.0               0.0               0.0
1334               0.0               0.0               0.0
1335               0.0               1.0               0.0
1336               0.0               0.0               1.0
1337               1.0               0.0               0.0

[1338 rows x 3 columns]


In [73]:
# Side-by-side view of the (unscaled) frame and the region dummies, for inspection.
# NOTE(review): df_processed is not consumed by the modeling cells visible in
# this notebook; the model input X is built from scaled_numerical_df and region_df.
df_processed = pd.concat([df,region_df],axis=1)
print(df_processed)

      age  sex     bmi  children  smoker     region      charges  \
0      19    0  27.900         0       1  southwest  16884.92400   
1      18    1  33.770         1       2  southeast   1725.55230   
2      28    1  33.000         3       2  southeast   4449.46200   
3      33    1  22.705         0       2  northwest  21984.47061   
4      32    1  28.880         0       2  northwest   3866.85520   
...   ...  ...     ...       ...     ...        ...          ...   
1333   50    1  30.970         3       2  northwest  10600.54830   
1334   18    0  31.920         0       2  northeast   2205.98080   
1335   18    0  36.850         0       2  southeast   1629.83350   
1336   21    0  25.800         0       2  southwest   2007.94500   
1337   61    0  29.070         0       1  northwest  29141.36030   

      region_northwest  region_southeast  region_southwest  
0                  0.0               0.0               1.0  
1                  0.0               1.0               0.0  


In [77]:
# Wrap the scaled array back into a labelled frame. Reusing df.index keeps its
# rows aligned with the other frames derived from df.
scaled_numerical_df = pd.DataFrame(data=scaled_data, index=df.index, columns=numerical_cols)
print(scaled_numerical_df)

           age       bmi  children       sex    smoker
0    -1.438764 -0.453320 -0.908614 -1.010519 -1.970587
1    -1.509965  0.509621 -0.078767  0.989591  0.507463
2    -0.797954  0.383307  1.580926  0.989591  0.507463
3    -0.441948 -1.305531 -0.908614  0.989591  0.507463
4    -0.513149 -0.292556 -0.908614  0.989591  0.507463
...        ...       ...       ...       ...       ...
1333  0.768473  0.050297  1.580926  0.989591  0.507463
1334 -1.509965  0.206139 -0.908614 -1.010519  0.507463
1335 -1.509965  1.014878 -0.908614 -1.010519  0.507463
1336 -1.296362 -0.797813 -0.908614 -1.010519  0.507463
1337  1.551686 -0.261388 -0.908614 -1.010519 -1.970587

[1338 rows x 5 columns]


In [78]:
# Model input: scaled numeric columns followed by the region dummies.
feature_frames = [scaled_numerical_df, region_df]
X = pd.concat(feature_frames, axis="columns")

In [82]:
# Regression target: the billed insurance charges.
y = df['charges']

In [83]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [85]:
# Baseline: ordinary least squares, scored on the held-out rows.
# fit() returns the estimator itself, so construction and fitting can be chained.
lr_model = LinearRegression().fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_mse, lr_r2 = mean_squared_error(y_test, lr_pred), r2_score(y_test, lr_pred)

In [86]:
# Report the linear-regression scores on the test split.
print("Linear Regression Metrics:")
for metric_label, metric_value in (("  MSE:", lr_mse), ("  R² :", lr_r2)):
    print(metric_label, metric_value)

Linear Regression Metrics:
  MSE: 33596915.85136149
  R² : 0.7835929767120722


In [87]:
# Random forest with default hyper-parameters; random_state pins the bootstrap
# and feature sampling so the scores are repeatable.
rf_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse, rf_r2 = mean_squared_error(y_test, rf_pred), r2_score(y_test, rf_pred)

In [88]:
# Report the random-forest scores on the test split.
print("\nRandom Forest Metrics:")
for metric_label, metric_value in (("  MSE:", rf_mse), ("  R² :", rf_r2)):
    print(metric_label, metric_value)


Random Forest Metrics:
  MSE: 20750380.973297
  R² : 0.8663410594475837


In [89]:
# Keep whichever model has the higher test-set R²; a tie goes to linear regression.
if rf_r2 > lr_r2:
    best_model = rf_model
else:
    best_model = lr_model

selected_name = "Random Forest" if best_model is rf_model else "Linear Regression"
print("\nSelected model:", selected_name)


Selected model: Random Forest


In [94]:
# Persist the selected model together with the fitted preprocessing objects.
# The model was trained on the output of `scaler` (applied to numerical_cols,
# in that order) and `encoder` (applied to 'region'), so all three files are
# needed to score new rows.
artifacts = {
    'insurance_model.pkl': best_model,
    'scaler.pkl': scaler,
    'encoder.pkl': encoder,
}
for filename, artifact in artifacts.items():
    with open(filename, 'wb') as file:
        pickle.dump(artifact, file)

# Fixed: the original message ended with a stray, unbalanced apostrophe.
print("Best model saved as 'insurance_model.pkl'")

Best model saved as insurance_model.pkl'


Linear Regression Metrics:
  MSE: 33596915.85136148
  R² : 0.7835929767120722

Random Forest Metrics:
  MSE: 20867794.91145395
  R² : 0.865584763811357

Selected model: Random Forest
Model and preprocessor saved as insurance_model.pkl.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# End-to-end training run: load the data, encode it, split it, fit the
# preprocessing on the training rows only, then train and compare two models.

df = pd.read_csv("insurance.csv")
print(df.head())

# Integer-encode the binary categoricals. df was re-read from disk just above,
# so the columns still hold their original string labels when map() runs.
# NOTE: smoker is coded yes=1 / no=2 (not 0/1); new data must use the same coding.
df['sex'] = df['sex'].map({'female': 0, 'male': 1})
df['smoker'] = df['smoker'].map({'yes':1, 'no':2})
print(df.head())

numerical_cols = ['age', 'bmi', 'children', 'sex', 'smoker']
non_numerical_cols = ['region']

features_raw = df[numerical_cols + non_numerical_cols]
y = df['charges']

# Split BEFORE fitting any preprocessing. Previously the scaler and encoder were
# fitted on every row, so statistics of the test rows leaked into the features
# the models were trained on. The split itself is unchanged (same size, same seed).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features_raw, y, test_size=0.2, random_state=42
)

# Learn scaling statistics and region categories from the training rows only.
scaler = StandardScaler().fit(X_train_raw[numerical_cols])
encoder = OneHotEncoder(drop='first', sparse_output=False).fit(X_train_raw[non_numerical_cols])


def build_features(frame):
    """Apply the fitted scaler and encoder to `frame` and return the model input.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows containing the columns in `numerical_cols` and `non_numerical_cols`.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by the region dummy columns, indexed
        like `frame`.
    """
    scaled = pd.DataFrame(
        scaler.transform(frame[numerical_cols]),
        columns=numerical_cols,
        index=frame.index,
    )
    encoded = pd.DataFrame(
        encoder.transform(frame[non_numerical_cols]),
        columns=encoder.get_feature_names_out(non_numerical_cols),
        index=frame.index,
    )
    return pd.concat([scaled, encoded], axis=1)


X_train = build_features(X_train_raw)
X_test = build_features(X_test_raw)
# Full processed matrix, kept under its original name for any later inspection.
X = build_features(features_raw)

# Show a small preview instead of dumping whole frames and arrays.
print(X_train.head())

# Baseline: ordinary least squares.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, lr_pred)
lr_r2 = r2_score(y_test, lr_pred)
print("Linear Regression Metrics:")
print("  MSE:", lr_mse)
print("  R² :", lr_r2)

# Random forest with default hyper-parameters and a fixed seed.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)
print("\nRandom Forest Metrics:")
print("  MSE:", rf_mse)
print("  R² :", rf_r2)

# Keep whichever model has the higher test-set R²; a tie goes to linear regression.
best_model = rf_model if rf_r2 > lr_r2 else lr_model
print("\nSelected model:", "Random Forest" if best_model is rf_model else "Linear Regression")

