In [49]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib
from sklearn.feature_selection import f_regression
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings("ignore")

In [50]:
df = pd.read_csv("insurance_data.csv")

In [51]:
df.head()

Unnamed: 0,years_of_insurance_with_us,regular_checkup_lasy_year,adventure_sports,Occupation,visited_doctor_last_1_year,cholesterol_level,daily_avg_steps,age,heart_decs_history,other_major_decs_history,Gender,avg_glucose_level,bmi,smoking_status,Year_last_admitted,weight,covered_by_any_other_company,Alcohol,exercise,weight_change_in_last_one_year,fat_percentage,insurance_cost
0,3,1,0,2,2,162.5,4938,44,0,1,1,118,26.5,2,2004.0,74,0,1,0,0,34,29616
1,8,0,0,1,2,237.5,5306,39,0,0,1,155,38.0,0,2003.0,78,1,2,2,3,13,39488
2,8,0,0,2,4,137.5,4676,40,0,0,1,80,28.7,2,2004.0,81,0,1,1,3,16,37020
3,8,1,0,1,4,137.5,5632,45,0,0,1,126,24.4,3,2007.0,67,1,2,2,1,12,22212
4,7,1,0,2,3,137.5,4829,35,0,0,1,156,39.2,1,1994.0,74,0,2,0,2,16,25914


In [52]:
X = df.drop("insurance_cost", axis=1)
y = df["insurance_cost"]


feature_order = X.columns.tolist()


joblib.dump(feature_order, "feature_order.pkl")

['feature_order.pkl']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [54]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [55]:
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [56]:
models = {}

lr = LinearRegression()
lr.fit(X_train, y_train)
models["Linear Regression"] = lr

knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
models["KNN"] = knn

print("Models Trained Successfully")

results = {}

for name, model in models.items():
    pred = model.predict(X_test)
    results[name] = {
        "R2 Score": r2_score(y_test, pred),
        "MAE": mean_absolute_error(y_test, pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, pred))
    }

results_df = pd.DataFrame(results).T
print("\nModel Performance:")
print(results_df)

best_model_name = results_df["R2 Score"].idxmax()
best_model = models[best_model_name]

print(f"\nBEST MODEL SELECTED: {best_model_name}")

joblib.dump(best_model, "best_insurance_model.pkl")

Models Trained Successfully

Model Performance:
                   R2 Score          MAE          RMSE
Linear Regression  0.941142  2844.378985   3562.611533
KNN                0.455547  8647.020898  10835.403817

BEST MODEL SELECTED: Linear Regression


['best_insurance_model.pkl']

In [57]:
# Loading the saved model and related objects

model = joblib.load("best_insurance_model.pkl") 
scaler = joblib.load("scaler.pkl")
feature_order = joblib.load("feature_order.pkl")    
label_encoders = joblib.load("label_encoders.pkl")

In [71]:
sample_data = {
    "bmi": 25.0,
    "smoking_status": 1,
    "Year_last_admitted": 2008,
    "weight": 70.0,
    "covered_by_any_other_company": 1,
    "Alcohol":	1,
    "exercise":	0,
    "weight_change_in_last_one_year": 3,
    "fat_percentage": 15,
    "years_of_insurance_with_us": 2,
    "regular_checkup_lasy_year": 1,
    "adventure_sports": 1,
    "Occupation": 1,
    "visited_doctor_last_1_year": 1,
    "cholesterol_level": 120,
    "daily_avg_steps": 7000,
    "age": 30,
    "heart_decs_history": 1,
    "other_major_decs_history": 1,
    "Gender": 0,
    "avg_glucose_level": 90,
}

In [72]:
sample_df = pd.DataFrame([sample_data])
sample_df

Unnamed: 0,bmi,smoking_status,Year_last_admitted,weight,covered_by_any_other_company,Alcohol,exercise,weight_change_in_last_one_year,fat_percentage,years_of_insurance_with_us,regular_checkup_lasy_year,adventure_sports,Occupation,visited_doctor_last_1_year,cholesterol_level,daily_avg_steps,age,heart_decs_history,other_major_decs_history,Gender,avg_glucose_level
0,25.0,1,2008,70.0,1,1,0,3,15,2,1,1,1,1,120,7000,30,1,1,0,90


In [73]:
sample_df = sample_df[feature_order]
sample_df

Unnamed: 0,years_of_insurance_with_us,regular_checkup_lasy_year,adventure_sports,Occupation,visited_doctor_last_1_year,cholesterol_level,daily_avg_steps,age,heart_decs_history,other_major_decs_history,Gender,avg_glucose_level,bmi,smoking_status,Year_last_admitted,weight,covered_by_any_other_company,Alcohol,exercise,weight_change_in_last_one_year,fat_percentage
0,2,1,1,1,1,120,7000,30,1,1,0,90,25.0,1,2008,70.0,1,1,0,3,15


In [74]:
for col, encoder in label_encoders.items():
    valid = encoder.classes_
    sample_df[col] = sample_df[col].apply(
        lambda x: x if x in valid else valid[0]
    )
    sample_df[col] = encoder.transform(sample_df[col])

In [79]:
sample_df

Unnamed: 0,years_of_insurance_with_us,regular_checkup_lasy_year,adventure_sports,Occupation,visited_doctor_last_1_year,cholesterol_level,daily_avg_steps,age,heart_decs_history,other_major_decs_history,Gender,avg_glucose_level,bmi,smoking_status,Year_last_admitted,weight,covered_by_any_other_company,Alcohol,exercise,weight_change_in_last_one_year,fat_percentage
0,2,1,1,0,1,120,7000,30,1,1,0,90,25.0,0,2008,70.0,0,0,0,3,15


In [75]:
sample_scaled = scaler.transform(sample_df)

In [78]:
prediction = model.predict(sample_scaled)
print("Predicted Insurance Cost:", prediction[0])

Predicted Insurance Cost: 123979.20279234345
