In [18]:
# %reset

In [19]:
# from google.colab import drive
# drive.mount('/content/drive')

In [20]:
import numpy as np
import pandas as pd

In [21]:
df = pd.read_csv('insurance.csv')
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

# Load the trained XGBoost model
model_path = "best_xgboost_model.json"

try:
    Insurance_XGB = xgb.Booster()
    Insurance_XGB.load_model(model_path)
    print("XGBoost model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")

# Define categorical features
categorical_features = ['sex', 'smoker', 'region']

# Simulated training data to fit the encoder
df = pd.DataFrame([
    {'sex': 'male', 'smoker': 'yes', 'region': 'northeast'},
    {'sex': 'female', 'smoker': 'no', 'region': 'southwest'},
    {'sex': 'male', 'smoker': 'no', 'region': 'southeast'},
    {'sex': 'female', 'smoker': 'yes', 'region': 'northwest'}
])  # Use actual training data

# Fit OneHotEncoder (as used during training)
encode = OneHotEncoder(drop='first', sparse_output=False)
df_encoded_columns = encode.fit_transform(df[categorical_features])
encoded_df = pd.DataFrame(df_encoded_columns, columns=encode.get_feature_names_out(categorical_features))

# Combine encoded categorical and numerical features
enhanced_df = pd.concat([df.drop(categorical_features, axis=1), encoded_df], axis=1)

# Replace infinite values with NaN
enhanced_df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Log transformation of target variable (assumed as 'charges')
# Ensure 'charges' column exists in df before applying log1p
if 'charges' in enhanced_df.columns:
    New_target = np.log1p(enhanced_df['charges'])

# ---- User Input Processing ----
user_input = {
    'age': 30,
    'sex': 'female',
    'bmi': 25,
    'children': 2,
    'smoker': 'no',
    'region': 'southwest'
}

# Convert user input to DataFrame
user_df = pd.DataFrame([user_input])

# Apply OneHotEncoding on user input
user_encoded_columns = encode.transform(user_df[categorical_features])
user_encoded_df = pd.DataFrame(user_encoded_columns, columns=encode.get_feature_names_out(categorical_features))

# Merge with numerical features
X_user = np.concatenate([user_df.drop(categorical_features, axis=1).values, user_encoded_df.values], axis=1)

# Convert to DMatrix for XGBoost
dtest = xgb.DMatrix(X_user)

# Predict using XGBoost model
log_predicted_charge = Insurance_XGB.predict(dtest)

# Reverse log1p transformation
predicted_charge = np.expm1(log_predicted_charge)

print(f"Predicted Insurance Charges: {predicted_charge[0]:.2f}")


XGBoost model loaded successfully!
Predicted Insurance Charges: 5100.25


In [23]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

def compare_predictions_xgb(model_path, csv_path):
    """
    Loads a trained XGBoost model and dataset, applies the same encoding,
    makes predictions, and compares actual vs. predicted charges.

    Args:
    - model_path (str): Path to the trained XGBoost model (.json).
    - csv_path (str): Path to the CSV file containing insurance data.

    Returns:
    - DataFrame with actual vs. predicted values and error metrics.
    """

    try:
        # Load trained XGBoost model
        Insurance_XGB = xgb.Booster()
        Insurance_XGB.load_model(model_path)
        print("XGBoost model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    # Load dataset
    df = pd.read_csv(csv_path)

    # Ensure required columns exist
    required_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
    for col in required_columns:
        if col not in df.columns:
            print(f"Error: Column '{col}' is missing from dataset.")
            return

    # Define categorical features
    categorical_features = ['sex', 'smoker', 'region']

    # Fit OneHotEncoder on categorical columns
    encode = OneHotEncoder(drop='first', sparse_output=False)
    encode.fit(df[categorical_features])

    # Apply One-Hot Encoding
    encoded_columns = encode.transform(df[categorical_features])
    encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

    # Prepare final input for model
    X = np.concatenate([df.drop(categorical_features + ['charges'], axis=1).values, encoded_df.values], axis=1)

    # Convert to XGBoost DMatrix
    dtest = xgb.DMatrix(X)

    # Predict log-transformed charges
    log_predicted_charges = Insurance_XGB.predict(dtest)

    # Reverse log1p transformation
    predicted_charges = np.expm1(log_predicted_charges)

    # Calculate error metrics
    df_results = df[['charges']].copy()
    df_results['Predicted_Charges'] = predicted_charges
    df_results['Difference'] = df_results['charges'] - df_results['Predicted_Charges']
    df_results['Percentage_Error'] = (abs(df_results['Difference']) / df_results['charges']) * 100

    # Display first 10 rows
    print("\nFirst 10 Predictions vs Actual Values:")
    print(df_results.head(10))

    # Calculate Mean Percentage Error
    mean_percentage_error = df_results['Percentage_Error'].mean()
    print(f"\nError Metrics:\nMean Percentage Error for entire dataset: {mean_percentage_error:.4f}%")



# Example usage:
model_path = "best_xgboost_model.json"
csv_path = "insurance.csv"

compare_predictions_xgb(model_path, csv_path)


XGBoost model loaded successfully!

First 10 Predictions vs Actual Values:
       charges  Predicted_Charges    Difference  Percentage_Error
0  16884.92400       17104.498047   -219.574047          1.300415
1   1725.55230        2063.000732   -337.448432         19.555967
2   4449.46200        4492.593750    -43.131750          0.969370
3  21984.47061        3892.601562  18091.869048         82.293858
4   3866.85520        3683.486328    183.368872          4.742067
5   3756.62160        3908.869629   -152.248029          4.052791
6   8240.58960        8278.104492    -37.514892          0.455245
7   7281.50560        6869.600098    411.905502          5.656873
8   6406.41070        7394.270996   -987.860296         15.419871
9  28923.13692       15719.511719  13203.625201         45.650737

Error Metrics:
Mean Percentage Error for entire dataset: 13.7164%


In [24]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder

# Load the trained model
model_path = "DecisionTree_model.pkl"

try:
    Insurance_DecisionTree = joblib.load(model_path)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")

# Define categorical features
categorical_features = ['sex', 'smoker', 'region']

# Fit the exact same encoder used during training
encode = OneHotEncoder(drop='first', sparse_output=False)

# Simulating training data to refit the encoder
# Ensure `df` contains the same categorical values as in training
df = pd.DataFrame([
    {'sex': 'male', 'smoker': 'yes', 'region': 'northeast'},
    {'sex': 'female', 'smoker': 'no', 'region': 'southwest'},
    {'sex': 'male', 'smoker': 'no', 'region': 'southeast'},
    {'sex': 'female', 'smoker': 'yes', 'region': 'northwest'}
])  # Add real training data here

encode.fit(df[categorical_features])  # Fit on the same categories as training

# Define user input
user_input = {
    'age': 30,
    'sex': 'female',
    'bmi': 25,
    'children': 2,
    'smoker': 'no',
    'region': 'southwest'
}

# Convert input to DataFrame
user_df = pd.DataFrame([user_input])

# Apply one-hot encoding using the exact same encoder
encoded_columns = encode.transform(user_df[categorical_features])
encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

# Prepare final input for model (ensure same feature count)
X_user = np.concatenate([user_df.drop(categorical_features, axis=1).values, encoded_df.values], axis=1)

# Predict using the model
log_predicted_charge = Insurance_DecisionTree.predict(X_user)

# Reverse log1p transformation
predicted_charge = np.expm1(log_predicted_charge)

print(f"Predicted Insurance Charges: {predicted_charge[0]:.2f}")


Model loaded successfully!
Predicted Insurance Charges: 4794.62


In [25]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder

def compare_predictions(model_path, csv_path):
    """
    Loads a trained model and dataset, applies the same encoding,
    makes predictions, and compares actual vs. predicted charges.

    Args:
    - model_path (str): Path to the trained model (DecisionTreeRegressor).
    - csv_path (str): Path to the CSV file containing insurance data.

    Returns:
    - DataFrame with actual vs. predicted values and error metrics.
    """

    try:
        # Load trained model
        Insurance_DecisionTree = joblib.load(model_path)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    # Load dataset
    df = pd.read_csv(csv_path)

    # Ensure required columns exist
    required_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
    for col in required_columns:
        if col not in df.columns:
            print(f"Error: Column '{col}' is missing from dataset.")
            return

    # Define categorical features
    categorical_features = ['sex', 'smoker', 'region']

    # Fit the same OneHotEncoder (only on categorical columns from dataset)
    encode = OneHotEncoder(drop='first', sparse_output=False)
    encode.fit(df[categorical_features])

    # Apply One-Hot Encoding to dataset
    encoded_columns = encode.transform(df[categorical_features])
    encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

    # Prepare final input for model
    X = np.concatenate([df.drop(categorical_features + ['charges'], axis=1).values, encoded_df.values], axis=1)

    # Predict log-transformed charges
    log_predicted_charges = Insurance_DecisionTree.predict(X)

    # Reverse log1p transformation to get actual charge predictions
    predicted_charges = np.expm1(log_predicted_charges)

    # Calculate error metrics
    df_results = df[['charges']].copy()
    df_results['Predicted_Charges'] = predicted_charges
    df_results['Difference'] = df_results['charges'] - df_results['Predicted_Charges']
    df_results['Percentage_Error'] = (abs(df_results['Difference']) / df_results['charges']) * 100

    # Display first 10 rows
    print("\nFirst 10 Predictions vs Actual Values:")
    print(df_results.head(10))

    # Calculate Mean Percentage Error
    mean_percentage_error = df_results['Percentage_Error'].mean()
    print(f"\nError Metrics:\nMean Percentage Error for entire dataset: {mean_percentage_error:.4f}%")



# Example usage:
model_path = "DecisionTree_model.pkl"
csv_path = "insurance.csv"

compare_predictions(model_path, csv_path)


Model loaded successfully!

First 10 Predictions vs Actual Values:
       charges  Predicted_Charges    Difference  Percentage_Error
0  16884.92400       17019.576506   -134.652506          0.797472
1   1725.55230        2851.013038  -1125.460738         65.223218
2   4449.46200        6215.676092  -1766.214092         39.695003
3  21984.47061        3171.691610  18812.779000         85.573036
4   3866.85520        3171.691610    695.163590         17.977492
5   3756.62160        4021.102836   -264.481236          7.040401
6   8240.58960        7688.901525    551.688075          6.694765
7   7281.50560        6578.164564    703.341036          9.659280
8   6406.41070        6578.164564   -171.753864          2.680969
9  28923.13692       15405.032008  13518.104912         46.738032

Error Metrics:
Mean Percentage Error for entire dataset: 18.5234%


In [26]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder

# Load the trained model
model_path = "RandomForest_model.pkl"

try:
    Insurance_DecisionTree = joblib.load(model_path)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")

# Define categorical features
categorical_features = ['sex', 'smoker', 'region']

# Fit the exact same encoder used during training
encode = OneHotEncoder(drop='first', sparse_output=False)

# Simulating training data to refit the encoder
# Ensure `df` contains the same categorical values as in training
df = pd.DataFrame([
    {'sex': 'male', 'smoker': 'yes', 'region': 'northeast'},
    {'sex': 'female', 'smoker': 'no', 'region': 'southwest'},
    {'sex': 'male', 'smoker': 'no', 'region': 'southeast'},
    {'sex': 'female', 'smoker': 'yes', 'region': 'northwest'}
])  # Add real training data here

encode.fit(df[categorical_features])  # Fit on the same categories as training

# Define user input
user_input = {
    'age': 30,
    'sex': 'female',
    'bmi': 25,
    'children': 2,
    'smoker': 'no',
    'region': 'southwest'
}

# Convert input to DataFrame
user_df = pd.DataFrame([user_input])

# Apply one-hot encoding using the exact same encoder
encoded_columns = encode.transform(user_df[categorical_features])
encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

# Prepare final input for model (ensure same feature count)
X_user = np.concatenate([user_df.drop(categorical_features, axis=1).values, encoded_df.values], axis=1)

# Predict using the model
log_predicted_charge = Insurance_DecisionTree.predict(X_user)

# Reverse log1p transformation
predicted_charge = np.expm1(log_predicted_charge)

print(f"Predicted Insurance Charges: {predicted_charge[0]:.2f}")


Model loaded successfully!
Predicted Insurance Charges: 5185.19


In [27]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder

def compare_predictions(model_path, csv_path):
    """
    Loads a trained model and dataset, applies the same encoding,
    makes predictions, and compares actual vs. predicted charges.

    Args:
    - model_path (str): Path to the trained model (DecisionTreeRegressor).
    - csv_path (str): Path to the CSV file containing insurance data.

    Returns:
    - DataFrame with actual vs. predicted values and error metrics.
    """

    try:
        # Load trained model
        Insurance_DecisionTree = joblib.load(model_path)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    # Load dataset
    df = pd.read_csv(csv_path)

    # Ensure required columns exist
    required_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
    for col in required_columns:
        if col not in df.columns:
            print(f"Error: Column '{col}' is missing from dataset.")
            return

    # Define categorical features
    categorical_features = ['sex', 'smoker', 'region']

    # Fit the same OneHotEncoder (only on categorical columns from dataset)
    encode = OneHotEncoder(drop='first', sparse_output=False)
    encode.fit(df[categorical_features])

    # Apply One-Hot Encoding to dataset
    encoded_columns = encode.transform(df[categorical_features])
    encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

    # Prepare final input for model
    X = np.concatenate([df.drop(categorical_features + ['charges'], axis=1).values, encoded_df.values], axis=1)

    # Predict log-transformed charges
    log_predicted_charges = Insurance_DecisionTree.predict(X)

    # Reverse log1p transformation to get actual charge predictions
    predicted_charges = np.expm1(log_predicted_charges)

    # Calculate error metrics
    df_results = df[['charges']].copy()
    df_results['Predicted_Charges'] = predicted_charges
    df_results['Difference'] = df_results['charges'] - df_results['Predicted_Charges']
    df_results['Percentage_Error'] = (abs(df_results['Difference']) / df_results['charges']) * 100

    # Display first 10 rows
    print("\nFirst 10 Predictions vs Actual Values:")
    print(df_results.head(10))

    # Calculate Mean Percentage Error
    mean_percentage_error = df_results['Percentage_Error'].mean()
    print(f"\nError Metrics:\nMean Percentage Error for entire dataset: {mean_percentage_error:.4f}%")



# Example usage:
model_path = "RandomForest_model.pkl"
csv_path = "insurance.csv"

compare_predictions(model_path, csv_path)


Model loaded successfully!

First 10 Predictions vs Actual Values:
       charges  Predicted_Charges    Difference  Percentage_Error
0  16884.92400       17261.037492   -376.113492          2.227511
1   1725.55230        2466.988902   -741.436602         42.968075
2   4449.46200        5018.092029   -568.630029         12.779748
3  21984.47061        4342.853659  17641.616951         80.245812
4   3866.85520        4111.509669   -244.654469          6.326962
5   3756.62160        3823.812062    -67.190462          1.788587
6   8240.58960        7880.687052    359.902548          4.367437
7   7281.50560        7021.273747    260.231853          3.573874
8   6406.41070        7453.978226  -1047.567526         16.351863
9  28923.13692       15300.957056  13622.179864         47.097865

Error Metrics:
Mean Percentage Error for entire dataset: 15.8983%


In [28]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the trained model
model_path = "LinearRegression_model.pkl"

try:
    Insurance_Model = joblib.load(model_path)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")

# Define categorical features
categorical_features = ['sex', 'smoker', 'region']

# Fit the exact same encoder used during training
encode = OneHotEncoder(drop='first', sparse_output=False)

# Simulating training data to refit the encoder
# Ensure `df` contains the same categorical values as in training
df = pd.DataFrame([
    {'sex': 'male', 'smoker': 'yes', 'region': 'northeast'},
    {'sex': 'female', 'smoker': 'no', 'region': 'southwest'},
    {'sex': 'male', 'smoker': 'no', 'region': 'southeast'},
    {'sex': 'female', 'smoker': 'yes', 'region': 'northwest'}
])  # Replace with real training data if available

encode.fit(df[categorical_features])  # Fit on the same categories as training

# Define user input
user_input = {
    'age': 10,
    'sex': 'female',
    'bmi': 45,
    'children': 2,
    'smoker': 'no',
    'region': 'southwest'
}

# Convert input to DataFrame
user_df = pd.DataFrame([user_input])

# Apply one-hot encoding using the exact same encoder
encoded_columns = encode.transform(user_df[categorical_features])
encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

# Combine numerical and encoded categorical features
X_user = np.concatenate([user_df.drop(categorical_features, axis=1).values, encoded_df.values], axis=1)

# Apply StandardScaler
scaler_linear = StandardScaler()
X_user_scaled = scaler_linear.fit_transform(X_user)  # Fitting and transforming for the user input

# Predict using the model
log_predicted_charge = Insurance_Model.predict(X_user_scaled)

# Reverse log1p transformation
predicted_charge = np.expm1(log_predicted_charge)

print(f"Predicted Insurance Charges: {predicted_charge[0]:.2f}")


Model loaded successfully!
Predicted Insurance Charges: 9023.47


In [29]:
%pip install joblib

import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def compare_predictions(model_path, csv_path):
    """
    Loads a trained model and dataset, applies the same encoding and scaling,
    makes predictions, and compares actual vs. predicted charges.

    Args:
    - model_path (str): Path to the trained model (LinearRegression or DecisionTreeRegressor).
    - csv_path (str): Path to the CSV file containing insurance data.

    Returns:
    - DataFrame with actual vs. predicted values and error metrics.
    """

    try:
        # Load trained model
        Insurance_Model = joblib.load(model_path)
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    # Load dataset
    df = pd.read_csv(csv_path)

    # Ensure required columns exist
    required_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
    for col in required_columns:
        if col not in df.columns:
            print(f"Error: Column '{col}' is missing from dataset.")
            return

    # Define categorical features
    categorical_features = ['sex', 'smoker', 'region']

    # Fit OneHotEncoder (exact same categories as training)
    encode = OneHotEncoder(drop='first', sparse_output=False)
    encode.fit(df[categorical_features])

    # Apply One-Hot Encoding
    encoded_columns = encode.transform(df[categorical_features])
    encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

    # Prepare final input for model
    X = np.concatenate([df.drop(categorical_features + ['charges'], axis=1).values, encoded_df.values], axis=1)

    # Apply StandardScaler (same scaling as training)
    scaler_linear = StandardScaler()
    X_scaled = scaler_linear.fit_transform(X)

    # Predict log-transformed charges
    log_predicted_charges = Insurance_Model.predict(X_scaled)

    # Reverse log1p transformation to get actual charge predictions
    predicted_charges = np.expm1(log_predicted_charges)

    # Calculate error metrics
    df_results = df[['charges']].copy()
    df_results['Predicted_Charges'] = predicted_charges
    df_results['Difference'] = df_results['charges'] - df_results['Predicted_Charges']
    df_results['Percentage_Error'] = (abs(df_results['Difference']) / df_results['charges']) * 100

    # Display first 10 rows
    print("\nFirst 10 Predictions vs Actual Values:")
    print(df_results.head(10))

    # Calculate Mean Percentage Error
    mean_percentage_error = df_results['Percentage_Error'].mean()
    print(f"\nError Metrics:\nMean Percentage Error for entire dataset: {mean_percentage_error:.4f}%")



# Example usage:
model_path = "LinearRegression_model.pkl"
csv_path = "insurance.csv"

compare_predictions(model_path, csv_path)


Note: you may need to restart the kernel to use updated packages.
Model loaded successfully!

First 10 Predictions vs Actual Values:
       charges  Predicted_Charges    Difference  Percentage_Error
0  16884.92400       13714.197132   3170.726868         18.778449
1   1725.55230        2930.039986  -1204.487686         69.803024
2   4449.46200        4858.121050   -408.659050          9.184460
3  21984.47061        4322.081585  17662.389025         80.340297
4   3866.85520        4565.893212   -699.038012         18.077688
5   3756.62160        3971.490353   -214.868753          5.719734
6   8240.58960        8109.777262    130.812338          1.587415
7   7281.50560        7330.057214    -48.551614          0.666780
8   6406.41070        6814.816784   -408.406084          6.374959
9  28923.13692       12150.923748  16772.213172         57.988915

Error Metrics:
Mean Percentage Error for entire dataset: 27.1819%



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [30]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder

# Paths to saved models
model_pathL = "insurance_Model.pkl"
poly_path = "Final_Poly_Transformer.pkl"
scaler_path = "Final_Scaler.pkl"

# Load the trained Linear Regression model
try:
    Insurance_Model = joblib.load(model_pathL)
    print("Linear Regression Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")

# Load the saved Polynomial Transformer
try:
    final_poly = joblib.load(poly_path)
    print("Polynomial Transformer loaded successfully!")
except Exception as e:
    print(f"Error loading Polynomial Transformer: {e}")

# Load the saved Standard Scaler
try:
    final_scaler = joblib.load(scaler_path)
    print("Scaler loaded successfully!")
except Exception as e:
    print(f"Error loading Scaler: {e}")

# Define categorical features
categorical_features = ['sex', 'smoker', 'region']

# Fit OneHotEncoder (same categories as training)
encode = OneHotEncoder(drop='first', sparse_output=False)
df = pd.DataFrame([
    {'sex': 'male', 'smoker': 'yes', 'region': 'northeast'},
    {'sex': 'female', 'smoker': 'no', 'region': 'southwest'},
    {'sex': 'male', 'smoker': 'no', 'region': 'southeast'},
    {'sex': 'female', 'smoker': 'yes', 'region': 'northwest'}
])
encode.fit(df[categorical_features])

# Define user input
user_input = {
    'age': 30,
    'sex': 'female',
    'bmi': 25,
    'children': 2,
    'smoker': 'no',
    'region': 'southwest'
}

# Convert input to DataFrame
user_df = pd.DataFrame([user_input])

# Apply one-hot encoding
encoded_columns = encode.transform(user_df[categorical_features])
encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

# Combine numerical and encoded categorical features
X_user = np.concatenate([user_df.drop(categorical_features, axis=1).values, encoded_df.values], axis=1)

# Apply Polynomial Transformation
X_user_poly = final_poly.transform(X_user)

# Apply StandardScaler Transformation
X_user_scaled = final_scaler.transform(X_user_poly)

# Predict using the trained model
log_predicted_charge = Insurance_Model.predict(X_user_scaled)

# Reverse log1p transformation
predicted_charge = np.expm1(log_predicted_charge)

print(f"Predicted Insurance Charges: {predicted_charge[0]:.2f}")


Linear Regression Model loaded successfully!
Polynomial Transformer loaded successfully!
Scaler loaded successfully!
Predicted Insurance Charges: 5146.61


In [31]:
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import OneHotEncoder

def compare_predictions(model_path, poly_path, scaler_path, csv_path):
    """
    Loads a trained model and dataset, applies PolynomialFeatures transformation and scaling,
    makes predictions, and compares actual vs. predicted charges.

    Args:
    - model_path (str): Path to the trained LinearRegression model.
    - poly_path (str): Path to the saved PolynomialFeatures transformer.
    - scaler_path (str): Path to the saved StandardScaler.
    - csv_path (str): Path to the CSV file containing insurance data.

    Returns:
    - DataFrame with actual vs. predicted values and error metrics.
    """

    try:
        # Load trained model
        Insurance_Model = joblib.load(model_path)
        print("Model loaded successfully!")

        # Load Polynomial Features Transformer
        final_poly = joblib.load(poly_path)
        print("Polynomial Transformer loaded successfully!")

        # Load StandardScaler
        final_scaler = joblib.load(scaler_path)
        print("Scaler loaded successfully!")

    except Exception as e:
        print(f"Error loading model components: {e}")
        return

    # Load dataset
    df = pd.read_csv(csv_path)

    # Ensure required columns exist
    required_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
    for col in required_columns:
        if col not in df.columns:
            print(f"Error: Column '{col}' is missing from dataset.")
            return

    # Define categorical features
    categorical_features = ['sex', 'smoker', 'region']

    # Fit OneHotEncoder (use exact categories as in training)
    encode = OneHotEncoder(drop='first', sparse_output=False)
    encode.fit(df[categorical_features])

    # Apply One-Hot Encoding
    encoded_columns = encode.transform(df[categorical_features])
    encoded_df = pd.DataFrame(encoded_columns, columns=encode.get_feature_names_out(categorical_features))

    # Prepare final input for model (Numerical + Encoded Categorical)
    X = np.concatenate([df.drop(categorical_features + ['charges'], axis=1).values, encoded_df.values], axis=1)

    # Apply Polynomial Transformation
    X_poly = final_poly.transform(X)

    # Apply StandardScaler Transformation
    X_scaled = final_scaler.transform(X_poly)

    # Predict log-transformed charges
    log_predicted_charges = Insurance_Model.predict(X_scaled)

    # Reverse log1p transformation to get actual charge predictions
    predicted_charges = np.expm1(log_predicted_charges)

    # Calculate error metrics
    df_results = df[['charges']].copy()
    df_results['Predicted_Charges'] = predicted_charges
    df_results['Difference'] = df_results['charges'] - df_results['Predicted_Charges']
    df_results['Percentage_Error'] = (abs(df_results['Difference']) / df_results['charges']) * 100

    # Display first 10 rows
    print("\nFirst 10 Predictions vs Actual Values:")
    print(df_results.head(10))

    # Calculate Mean Percentage Error
    mean_percentage_error = df_results['Percentage_Error'].mean()
    print(f"\nError Metrics:\nMean Percentage Error for entire dataset: {mean_percentage_error:.4f}%")



# Example usage:
model_path = "insurance_Model.pkl"
poly_path = "Final_Poly_Transformer.pkl"
scaler_path = "Final_Scaler.pkl"
csv_path = "insurance.csv"

compare_predictions(model_path, poly_path, scaler_path, csv_path)


Model loaded successfully!
Polynomial Transformer loaded successfully!
Scaler loaded successfully!

First 10 Predictions vs Actual Values:
       charges  Predicted_Charges    Difference  Percentage_Error
0  16884.92400       22166.312318  -5281.388318         31.278721
1   1725.55230        2379.775698   -654.223398         37.913855
2   4449.46200        5440.443551   -990.981551         22.271941
3  21984.47061        4120.429611  17864.040999         81.257545
4   3866.85520        4218.150316   -351.295116          9.084776
5   3756.62160        4042.081221   -285.459621          7.598839
6   8240.58960        8786.579572   -545.989972          6.625618
7   7281.50560        8790.855058  -1509.349458         20.728535
8   6406.41070        7417.995352  -1011.584652         15.790194
9  28923.13692       13213.719680  15709.417240         54.314362

Error Metrics:
Mean Percentage Error for entire dataset: 17.9253%
