In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Read the raw insurance records from disk.
df = pd.read_csv("insurance-2.csv")  # adjust the path if the file lives elsewhere

# Target is the 'charges' column; the features are every other column, with
# categorical columns expanded into indicator columns (first level dropped).
y = df["charges"]
X = pd.get_dummies(df.drop(columns=["charges"]), drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate regressors, keyed by the display name used in the result tables.
# The tree ensembles are seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the reported scores.
# NOTE(review): SVR is fit on the raw, unscaled feature columns and its recorded
# baseline R2 is negative; standardizing the features first is worth trying.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regression": SVR()
}

# Table 1 — baseline scores: each model is fit with the settings it was
# constructed with and scored once on the held-out test split.
results = []
for name, model in models.items():
    # Fit on the training split, then predict the held-out rows.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # MSE is kept in a variable because RMSE is derived from it.
    mse = mean_squared_error(y_test, y_pred)
    results.append(
        [
            name,
            r2_score(y_test, y_pred),
            mean_absolute_error(y_test, y_pred),
            mse,
            np.sqrt(mse),
        ]
    )

table1 = pd.DataFrame(
    results,
    columns=["Model", "R2 Score", "MAE", "MSE", "RMSE"],
)

# Plain-text rendering without the index column.
print("\n📌 Table 1: Model Performance (Before Tuning)\n", table1.to_string(index=False), sep="\n")

# Table 2 — scores after hyperparameter search. Every model with an entry in
# `param_grid` is tuned by GridSearchCV (5 CV folds, selecting on R2); models
# without an entry are simply refit with their current settings.
param_grid = {
    "Ridge Regression": {"alpha": [0.1, 1, 10]},
    "Lasso Regression": {"alpha": [0.1, 1, 10]},
    "Random Forest": {
        "n_estimators": [100, 200],
        "max_depth": [10, 20],
    },
    "Gradient Boosting": {
        "n_estimators": [50, 100],
        "learning_rate": [0.01, 0.1],
    },
    "Support Vector Regression": {
        "C": [1, 10],
        "epsilon": [0.1, 0.5],
    },
}

results_tuned = []
for name, model in models.items():
    if name not in param_grid:
        # Nothing to search over for this model.
        best_params = "N/A"
        best_model = model.fit(X_train, y_train)
    else:
        grid = GridSearchCV(
            model,
            param_grid[name],
            cv=5,
            scoring="r2",
            verbose=0,
        ).fit(X_train, y_train)
        best_params = grid.best_params_
        best_model = grid.best_estimator_

    # Score the selected estimator on the held-out test split.
    y_pred = best_model.predict(X_test)
    r2, mae, mse = (
        r2_score(y_test, y_pred),
        mean_absolute_error(y_test, y_pred),
        mean_squared_error(y_test, y_pred),
    )
    rmse = np.sqrt(mse)

    results_tuned.append([name, r2, mae, mse, rmse, best_params])

# Build the table and cast the parameter column to text so it prints cleanly.
table2 = pd.DataFrame(
    results_tuned,
    columns=["Model", "R2 Score", "MAE", "MSE", "RMSE", "Best Params"],
).astype({"Best Params": str})

# Plain-text rendering without the index column.
print("\n📌 Table 2: Model Performance (After GridSearchCV Tuning)\n", table2.to_string(index=False), sep="\n")



📌 Table 1: Model Performance (Before Tuning)

                    Model  R2 Score         MAE          MSE         RMSE
        Linear Regression  0.783593 4181.194474 3.359692e+07  5796.284659
         Ridge Regression  0.783283 4193.585298 3.364504e+07  5800.434216
         Lasso Regression  0.783538 4182.426034 3.360551e+07  5797.025751
            Random Forest  0.863743 2584.099449 2.115370e+07  4599.315301
        Gradient Boosting  0.879257 2443.483262 1.874518e+07  4329.570011
Support Vector Regression -0.072423 8596.648704 1.664923e+08 12903.187975

📌 Table 2: Model Performance (After GridSearchCV Tuning)

                    Model  R2 Score         MAE          MSE         RMSE                                Best Params
        Linear Regression  0.783593 4181.194474 3.359692e+07  5796.284659                                        N/A
         Ridge Regression  0.783283 4193.585298 3.364504e+07  5800.434216                               {'alpha': 1}
         Lasso Regression

In [7]:
import pickle  
from sklearn.ensemble import GradientBoostingRegressor  

# Retrain the model chosen for export on the training split and persist it.
# Gradient Boosting has the highest R2 in the recorded Table 1 output.
# NOTE(review): n_estimators / learning_rate are hard-coded here; presumably they
# are the GridSearchCV winners, but the recorded Table 2 output is cut off before
# the Gradient Boosting row, so confirm them against a fresh run.
# random_state is fixed so that re-running this cell yields the same fitted model.
best_model = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.1,
    random_state=42,
)
best_model.fit(X_train, y_train)

# Serialize the fitted estimator; the file is closed even if dumping fails.
# Only load this file back from a trusted location: unpickling executes code.
with open("model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("✅ Model saved as model.pkl")


✅ Model saved as model.pkl
