In [36]:
# Feature Enggineering 

In [37]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_selection import f_regression


In [38]:
df = pd.read_csv("Data.csv")

In [39]:
df.head()

Unnamed: 0,applicant_id,years_of_insurance_with_us,regular_checkup_lasy_year,adventure_sports,Occupation,visited_doctor_last_1_year,cholesterol_level,daily_avg_steps,age,heart_decs_history,...,smoking_status,Year_last_admitted,Location,weight,covered_by_any_other_company,Alcohol,exercise,weight_change_in_last_one_year,fat_percentage,insurance_cost
0,5000,3,1,1,Salried,2,125 to 150,4866,28,1,...,Unknown,,Chennai,67,N,Rare,Moderate,1,25,20978
1,5001,0,0,0,Student,4,150 to 175,6411,50,0,...,formerly smoked,,Jaipur,58,N,Rare,Moderate,3,27,6170
2,5002,1,0,0,Business,4,200 to 225,4509,68,0,...,formerly smoked,,Jaipur,73,N,Daily,Extreme,0,32,28382
3,5003,7,4,0,Business,2,175 to 200,6214,51,0,...,Unknown,,Chennai,71,Y,Rare,No,3,37,27148
4,5004,3,1,0,Student,2,150 to 175,4938,44,0,...,never smoked,2004.0,Bangalore,74,N,No,Extreme,0,34,29616


In [40]:
print(df.columns)

Index(['applicant_id', 'years_of_insurance_with_us',
       'regular_checkup_lasy_year', 'adventure_sports', 'Occupation',
       'visited_doctor_last_1_year', 'cholesterol_level', 'daily_avg_steps',
       'age', 'heart_decs_history', 'other_major_decs_history', 'Gender',
       'avg_glucose_level', 'bmi', 'smoking_status', 'Year_last_admitted',
       'Location', 'weight', 'covered_by_any_other_company', 'Alcohol',
       'exercise', 'weight_change_in_last_one_year', 'fat_percentage',
       'insurance_cost'],
      dtype='object')


In [41]:
df = df.drop(columns=["applicant_id","Location"], errors="ignore")

In [42]:
df = df.dropna()

In [43]:
print(df.columns)

Index(['years_of_insurance_with_us', 'regular_checkup_lasy_year',
       'adventure_sports', 'Occupation', 'visited_doctor_last_1_year',
       'cholesterol_level', 'daily_avg_steps', 'age', 'heart_decs_history',
       'other_major_decs_history', 'Gender', 'avg_glucose_level', 'bmi',
       'smoking_status', 'Year_last_admitted', 'weight',
       'covered_by_any_other_company', 'Alcohol', 'exercise',
       'weight_change_in_last_one_year', 'fat_percentage', 'insurance_cost'],
      dtype='object')


In [44]:
cat_cols = df.select_dtypes(include='object').columns

for col in cat_cols:
    print(f"{col} → {df[col].nunique()} unique values")


Occupation → 3 unique values
cholesterol_level → 5 unique values
Gender → 2 unique values
smoking_status → 4 unique values
covered_by_any_other_company → 2 unique values
Alcohol → 3 unique values
exercise → 3 unique values


In [45]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()
le = LabelEncoder()

for col in df_encoded.columns:
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))

df_encoded.head()


Unnamed: 0,years_of_insurance_with_us,regular_checkup_lasy_year,adventure_sports,Occupation,visited_doctor_last_1_year,cholesterol_level,daily_avg_steps,age,heart_decs_history,other_major_decs_history,...,bmi,smoking_status,Year_last_admitted,weight,covered_by_any_other_company,Alcohol,exercise,weight_change_in_last_one_year,fat_percentage,insurance_cost
4,0,1,0,2,4,1,1591,28,0,1,...,117,2,14,22,0,1,0,0,23,15
5,5,0,0,1,4,4,1956,23,0,0,...,232,0,13,26,1,2,2,3,2,24
6,5,0,0,2,6,0,1330,24,0,0,...,139,2,14,29,0,1,1,3,5,22
8,5,1,0,1,6,0,2279,29,0,0,...,96,3,17,15,1,2,2,1,1,9
10,4,1,0,2,5,0,1482,19,0,0,...,244,1,4,22,0,2,0,2,5,12


In [46]:

# Get all numeric columns AFTER encoding
numeric_cols = df_encoded.select_dtypes(include=['int64','float64']).columns

# Remove the target column from feature list
numeric_cols = numeric_cols.drop("insurance_cost")

# Define X and y
X = df_encoded[numeric_cols]
y = df_encoded["insurance_cost"]

# Run ANOVA F-test
f_scores, p_values = f_regression(X, y)

# Results
anova_results = pd.DataFrame({
    "Feature": numeric_cols,
    "F_score": f_scores,
    "p_value": p_values
}).sort_values("F_score", ascending=False)

anova_results


Unnamed: 0,Feature,F_score,p_value
19,weight_change_in_last_one_year,463.651149,5.0854070000000004e-101
15,weight,49.698033,1.88692e-12
2,adventure_sports,18.634346,1.595533e-05
1,regular_checkup_lasy_year,7.315523,0.006845352
0,years_of_insurance_with_us,7.193805,0.007325124
20,fat_percentage,6.284724,0.01219081
16,covered_by_any_other_company,4.467678,0.03456153
10,Gender,4.280314,0.03857661
11,avg_glucose_level,2.031349,0.1541086
9,other_major_decs_history,1.815447,0.1778804


In [47]:
selected_features = anova_results[
    (anova_results["p_value"] < 0.05)
].sort_values("F_score", ascending=False)

removed_features = anova_results[
    (anova_results["p_value"] >= 0.05)
].sort_values("F_score")

In [48]:
print("Features to KEEP:")
print(selected_features)

print("\nFeatures to REMOVE:")
print(removed_features)

Features to KEEP:
                           Feature     F_score        p_value
19  weight_change_in_last_one_year  463.651149  5.085407e-101
15                          weight   49.698033   1.886920e-12
2                 adventure_sports   18.634346   1.595533e-05
1        regular_checkup_lasy_year    7.315523   6.845352e-03
0       years_of_insurance_with_us    7.193805   7.325124e-03
20                  fat_percentage    6.284724   1.219081e-02
16    covered_by_any_other_company    4.467678   3.456153e-02
10                          Gender    4.280314   3.857661e-02

Features to REMOVE:
                       Feature   F_score   p_value
7                          age  0.021783  0.882669
12                         bmi  0.089193  0.765211
13              smoking_status  0.130077  0.718358
8           heart_decs_history  0.202661  0.652589
6              daily_avg_steps  0.212462  0.644852
4   visited_doctor_last_1_year  0.228016  0.633007
17                     Alcohol  0.228477  0.63

In [49]:
## After ANOVA will keep all features because it should be based on domain expertise

In [50]:
## ML Model

In [60]:
df.head()

Unnamed: 0,years_of_insurance_with_us,regular_checkup_lasy_year,adventure_sports,Occupation,visited_doctor_last_1_year,cholesterol_level,daily_avg_steps,age,heart_decs_history,other_major_decs_history,...,bmi,smoking_status,Year_last_admitted,weight,covered_by_any_other_company,Alcohol,exercise,weight_change_in_last_one_year,fat_percentage,insurance_cost
4,3,1,0,Student,2,150 to 175,4938,44,0,1,...,26.5,never smoked,2004.0,74,N,No,Extreme,0,34,29616
5,8,0,0,Salried,2,225 to 250,5306,39,0,0,...,38.0,Unknown,2003.0,78,Y,Rare,No,3,13,39488
6,8,0,0,Student,4,125 to 150,4676,40,0,0,...,28.7,never smoked,2004.0,81,N,No,Moderate,3,16,37020
8,8,1,0,Salried,4,125 to 150,5632,45,0,0,...,24.4,smokes,2007.0,67,Y,Rare,No,1,12,22212
10,7,1,0,Student,3,125 to 150,4829,35,0,0,...,39.2,formerly smoked,1994.0,74,N,Rare,Extreme,2,16,25914


In [57]:
df.columns

Index(['years_of_insurance_with_us', 'regular_checkup_lasy_year',
       'adventure_sports', 'Occupation', 'visited_doctor_last_1_year',
       'cholesterol_level', 'daily_avg_steps', 'age', 'heart_decs_history',
       'other_major_decs_history', 'Gender', 'avg_glucose_level', 'bmi',
       'smoking_status', 'Year_last_admitted', 'weight',
       'covered_by_any_other_company', 'Alcohol', 'exercise',
       'weight_change_in_last_one_year', 'fat_percentage', 'insurance_cost'],
      dtype='object')

In [None]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib



df_model = df.copy()

cat_cols = df_model.select_dtypes(include='object').columns
le_dict = {}       

for col in cat_cols:
    le = LabelEncoder()
    df_model[col] = le.fit_transform(df_model[col].astype(str))
    le_dict[col] = le  
print("Label Encoding Completed")




num_cols = df_model.select_dtypes(include=['int64','float64']).columns
num_cols = num_cols.drop("insurance_cost", errors='ignore')

scaler = StandardScaler()
df_model[num_cols] = scaler.fit_transform(df_model[num_cols])

print("Standard Scaling Completed")




X = df_model.drop("insurance_cost", axis=1)
y = df_model["insurance_cost"]


feature_order = X.columns.tolist()


joblib.dump(feature_order, "feature_order.pkl")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)




models = {}


lr = LinearRegression()
lr.fit(X_train, y_train)
models["Linear Regression"] = lr


svr = SVR()
svr.fit(X_train, y_train)
models["SVR"] = svr

print("Models Trained Successfully")




results = {}

for name, model in models.items():
    pred = model.predict(X_test)
    results[name] = {
        "R2 Score": r2_score(y_test, pred),
        "MAE": mean_absolute_error(y_test, pred),
        "RMSE": np.sqrt(mean_squared_error(y_test, pred))
    }

results_df = pd.DataFrame(results).T
print("\nModel Performance:")
print(results_df)




best_model_name = results_df["R2 Score"].idxmax()
best_model = models[best_model_name]

print(f"\nBEST MODEL SELECTED: {best_model_name}")



joblib.dump(best_model, "best_insurance_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(le_dict, "label_encoders.pkl")
joblib.dump(feature_order, "feature_order.pkl")

print("\nModel, Scaler, Encoders & Feature Order Saved Successfully")


Label Encoding Completed
Standard Scaling Completed
Models Trained Successfully

Model Performance:
                   R2 Score           MAE          RMSE
Linear Regression  0.941142   2844.378985   3562.611533
SVR                0.027841  11892.558165  14478.827279

BEST MODEL SELECTED: Linear Regression

Model, Scaler, Encoders & Feature Order Saved Successfully!
