In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost import plot_importance
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import mlflow
import mlflow.sklearn
import mlflow.models
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
data = pd.read_csv("CSV/train.csv")
test_data = pd.read_csv("CSV/test.csv")

In [3]:
OHE_data = pd.get_dummies(data,columns = ['Marital Status','Occupation','Location','Property Type'],prefix=['Marital Status','Occupation','Location','Property Type'])
#test data
OHE_test_data = pd.get_dummies(test_data,columns = ['Marital Status','Occupation','Location','Property Type'],prefix=['Marital Status','Occupation','Location','Property Type'])

In [4]:
na_features = []
for col in data.columns:
    if data[col].isnull().any():
        na_features.append(col)
        
na_numerical_features = []
na_categorical_features = []

for col in na_features:
    if data[col].dtype == 'O':
        na_categorical_features.append(col)
    else:
        na_numerical_features.append(col)

In [5]:
data['Age_mean'] = data['Age'].fillna(data['Age'].mean())
data = data.drop('Age_mean',axis=1)

In [6]:
for col in na_numerical_features:
    if (data[col].skew() <= 0.5) and (data[col].skew() >= -0.5) :
        data[col] = data[col].fillna(data[col].mean())
        test_data[col] = test_data[col].fillna(test_data[col].mean())
    else:
        data[col] = data[col].fillna(data[col].median())
        test_data[col] = test_data[col].fillna(test_data[col].median())

for col in na_categorical_features:
    data[col] = data[col].fillna(data[col].mode()[0])
    test_data[col] = test_data[col].fillna(test_data[col].mode()[0])

In [7]:

#train data
OHE_data = pd.get_dummies(data,columns = ['Marital Status','Occupation','Location','Property Type'],prefix=['Marital Status','Occupation','Location','Property Type'])
#test data
OHE_test_data = pd.get_dummies(test_data,columns = ['Marital Status','Occupation','Location','Property Type'],prefix=['Marital Status','Occupation','Location','Property Type'])



#mentioning order for ordinal relationship
categories = [
    ['High School',"Bachelor's","Master's",'PhD'],
    ['Basic', 'Comprehensive', 'Premium'],
    ['Poor', 'Average', 'Good'],
    ['Rarely', 'Monthly', 'Weekly', 'Daily']
    ]

ordinal_encoder = OrdinalEncoder(categories=categories)
#train data
OHE_data[['Education Level','Policy Type', 'Customer Feedback', 'Exercise Frequency']] = ordinal_encoder.fit_transform(OHE_data[['Education Level','Policy Type', 'Customer Feedback', 'Exercise Frequency']])

#test data
OHE_test_data[['Education Level','Policy Type', 'Customer Feedback', 'Exercise Frequency']] = ordinal_encoder.fit_transform(OHE_test_data[['Education Level','Policy Type', 'Customer Feedback', 'Exercise Frequency']])

#train data
OHE_data['Smoking Status'] = np.where(OHE_data['Smoking Status'] == 'Yes',1,0)
OHE_data['Gender'] = np.where(OHE_data['Gender'] == 'Male',1,0)

#test data
OHE_test_data['Smoking Status'] = np.where(OHE_test_data['Smoking Status'] == 'Yes',1,0)
OHE_test_data['Gender'] = np.where(OHE_test_data['Gender'] == 'Male',1,0)

from sklearn.preprocessing import MinMaxScaler

scalar = MinMaxScaler()

#train data
OHE_data[['Age','Annual Income','Number of Dependents','Health Score','Previous Claims','Vehicle Age','Credit Score','Insurance Duration']] = scalar.fit_transform(OHE_data[['Age','Annual Income','Number of Dependents','Health Score','Previous Claims','Vehicle Age','Credit Score','Insurance Duration']])

#test data
OHE_test_data[['Age','Annual Income','Number of Dependents','Health Score','Previous Claims','Vehicle Age','Credit Score','Insurance Duration']] = scalar.fit_transform(OHE_test_data[['Age','Annual Income','Number of Dependents','Health Score','Previous Claims','Vehicle Age','Credit Score','Insurance Duration']])


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

class DataPreprocessor:
    def __init__(self):
        # Define the original categories for Ordinal Encoding
        self.ordinal_categories = [
            ['High School', "Bachelor's", "Master's", 'PhD'],  # Education Level
            ['Basic', 'Comprehensive', 'Premium'],  # Policy Type
            ['Poor', 'Average', 'Good'],  # Customer Feedback
            ['Rarely', 'Monthly', 'Weekly', 'Daily']  # Exercise Frequency
        ]
        
        # Initialize encoders and scalers
        self.ordinal_encoder = OrdinalEncoder(categories=self.ordinal_categories)
        self.scaler = MinMaxScaler()
        
        # Store original one-hot encoded column names for decoding
        self.one_hot_columns = ['Marital Status', 'Occupation', 'Location', 'Property Type']

    def inverse_transform(self, df_encoded):
        """
        Reverts the transformations applied to the data to obtain human-readable predictions.
        """

        # 1️⃣ **Inverse Min-Max Scaling** (Numerical Features)
        numerical_columns = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
                             'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
        df_encoded[numerical_columns] = self.scaler.inverse_transform(df_encoded[numerical_columns])

        # 2️⃣ **Inverse Binary Encoding** (`Gender` and `Smoking Status`)
        df_encoded['Smoking Status'] = df_encoded['Smoking Status'].map({1: 'Yes', 0: 'No'})
        df_encoded['Gender'] = df_encoded['Gender'].map({1: 'Male', 0: 'Female'})

        # 3️⃣ **Inverse Ordinal Encoding**
        df_encoded[['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']] = \
            self.ordinal_encoder.inverse_transform(df_encoded[['Education Level', 'Policy Type', 
                                                               'Customer Feedback', 'Exercise Frequency']])

        # 4️⃣ **Inverse One-Hot Encoding**
        decoded_ohe = self._inverse_one_hot(df_encoded, self.one_hot_columns)
        df_encoded = df_encoded.drop(columns=[col for col in df_encoded.columns if any(col.startswith(prefix) for prefix in self.one_hot_columns)])
        df_encoded = pd.concat([df_encoded, decoded_ohe], axis=1)

        return df_encoded

    def _inverse_one_hot(self, df, original_columns):
        """
        Helper function to revert One-Hot Encoding.
        """
        decoded_df = pd.DataFrame()
        for col in original_columns:
            matched_cols = [c for c in df.columns if c.startswith(col + '_')]
            decoded_df[col] = df[matched_cols].idxmax(axis=1).str[len(col) + 1:]
        return decoded_df


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

class DataPreprocessor:
    def __init__(self, train_data):
        """
        Initializes encoders using training data.
        """
        self.ordinal_categories = [
            ['High School', "Bachelor's", "Master's", 'PhD'],  # Education Level
            ['Basic', 'Comprehensive', 'Premium'],  # Policy Type
            ['Poor', 'Average', 'Good'],  # Customer Feedback
            ['Rarely', 'Monthly', 'Weekly', 'Daily']  # Exercise Frequency
        ]

        # ✅ FIX: Handle unknown categories by encoding them as -1
        self.ordinal_encoder = OrdinalEncoder(categories=self.ordinal_categories,
                                              handle_unknown='use_encoded_value', unknown_value=-1)
        
        # Fill missing values with a default category before fitting
        train_data[['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']] = \
            train_data[['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']].fillna("Unknown")

        self.ordinal_encoder.fit(train_data[['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']])
        
        # Initialize MinMaxScaler & fit on numerical columns
        self.scaler = MinMaxScaler()
        numerical_columns = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
                             'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
        self.scaler.fit(train_data[numerical_columns])

        # Store one-hot encoded column prefixes
        self.one_hot_columns = ['Marital Status', 'Occupation', 'Location', 'Property Type']
        self.ohe_columns = [col for col in train_data.columns if any(col.startswith(prefix) for prefix in self.one_hot_columns)]

    def inverse_transform(self, df_encoded):
        """
        Converts encoded predictions back to human-readable values.
        """
        df = df_encoded.copy()  # Avoid modifying original data

        # 1️⃣ **Inverse Min-Max Scaling (Numerical Features)**
        numerical_columns = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
                             'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
        df[numerical_columns] = self.scaler.inverse_transform(df[numerical_columns])

        # 2️⃣ **Inverse Binary Encoding (Gender & Smoking Status)**
        df['Smoking Status'] = df['Smoking Status'].map({1: 'Yes', 0: 'No'})
        df['Gender'] = df['Gender'].map({1: 'Male', 0: 'Female'})

        # 3️⃣ **Inverse Ordinal Encoding**
        df[['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']] = \
            self.ordinal_encoder.inverse_transform(df[['Education Level', 'Policy Type', 
                                                       'Customer Feedback', 'Exercise Frequency']])

        # 4️⃣ **Inverse One-Hot Encoding**
        decoded_ohe = self._inverse_one_hot(df)
        df.drop(columns=self.ohe_columns, inplace=True, errors='ignore')
        df = pd.concat([df, decoded_ohe], axis=1)

        return df

    def _inverse_one_hot(self, df):
        """
        Helper function to revert One-Hot Encoding to original categorical values.
        """
        decoded_df = pd.DataFrame()
        for col in self.one_hot_columns:
            matched_cols = [c for c in df.columns if c.startswith(col + '_')]
            if matched_cols:
                decoded_df[col] = df[matched_cols].idxmax(axis=1).str[len(col) + 1:]
        return decoded_df


In [10]:
# Load training data (ensure it contains all necessary columns)
data = pd.read_csv("CSV/train.csv")  

# Initialize the preprocessor with training data
preprocessor = DataPreprocessor(data)

In [11]:
import pandas as pd
import numpy as np

def transform_input(raw_data, preprocessor):
    expected_columns = [
        'Age', 'Gender', 'Annual Income', 'Marital Status', 'Education Level', 
        'Occupation', 'Health Score', 'Location', 'Policy Type', 'Number of Dependents',
        'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 
        'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type'
    ]

    # ✅ Fix: Ensure Input Matches Expected Columns
    assert len(raw_data) == len(expected_columns), f"❌ Expected {len(expected_columns)} values, got {len(raw_data)}"

    # ✅ Create DataFrame with Correct Column Names
    df = pd.DataFrame([raw_data], columns=expected_columns)

    # ✅ Convert 'Gender' and 'Smoking Status' to Binary
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Smoking Status'] = df['Smoking Status'].map({'Yes': 1, 'No': 0})

    # ✅ Fix: Apply Ordinal Encoding with `handle_unknown`
    ordinal_cols = ['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']
    df[ordinal_cols] = preprocessor.ordinal_encoder.transform(df[ordinal_cols])

    # ✅ Apply One-Hot Encoding
    df = pd.get_dummies(df, columns=['Marital Status', 'Occupation', 'Location', 'Property Type'])

    # ✅ Ensure One-Hot Encoded Columns Match Training Data
    for col in preprocessor.ohe_columns:
        if col not in df.columns:
            df[col] = 0  

    # ✅ Convert to Float Before MinMax Scaling
    df = df.astype(float)

    # ✅ Apply MinMax Scaling
    df[preprocessor.scaler.feature_names_in_] = preprocessor.scaler.transform(df[preprocessor.scaler.feature_names_in_])

    return df


In [12]:
# # Example input row
# input_data = [
#     19, 'Female', 10049, 'Married', "Bachelor's", 'Self-Employed', 
#     22.59876067, 'Urban', 'Premium', 2, 17, 372, 5,  
#     700,  # ✅ Added Credit Score
#     'Poor', 'No', 'Weekly', 'House'
# ]

# # Transform input for model
# encoded_input = transform_input(input_data, preprocessor)

# # 🔹 **Pass encoded input to model for prediction**
# predicted_premium = model.predict(encoded_input)  # Example: [2900]

# # 🔹 **Convert back to human-readable format**
# decoded_output = preprocessor.inverse_transform(encoded_input)
# decoded_output['Predicted Premium'] = predicted_premium

# print(decoded_output)


In [13]:
# Load training data (ensure it contains all necessary columns)
train_data = pd.read_csv("CSV/train.csv")  

# Initialize the preprocessor with training data
preprocessor = DataPreprocessor(train_data)

In [34]:
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
import pandas as pd
import numpy as np

# ✅ Define expected input columns (excluding 'Policy Start Date')
expected_columns = [
    'Age', 'Gender', 'Annual Income', 'Marital Status', 'Education Level', 'Occupation', 
    'Health Score', 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age', 
    'Credit Score', 'Insurance Duration', 'Number of Dependents', 'Customer Feedback', 
    'Smoking Status', 'Exercise Frequency', 'Property Type'
]

def transform_input(raw_data, ordinal_encoder, scalar, expected_model_columns):
    """Transforms raw user input into model-ready format."""
    
    assert len(raw_data) == len(expected_columns), f"❌ Expected {len(expected_columns)} values, got {len(raw_data)}"

    # ✅ Create DataFrame with Correct Column Names
    df = pd.DataFrame([raw_data], columns=expected_columns)

    # ✅ Ensure categorical columns are strings before encoding
    categorical_cols = ['Marital Status', 'Occupation', 'Location', 'Property Type']
    df[categorical_cols] = df[categorical_cols].astype(str)

    # ✅ Apply One-Hot Encoding
    df_encoded = pd.get_dummies(df, columns=categorical_cols, 
                                prefix=categorical_cols)

    # ✅ Apply Ordinal Encoding first!
    ordinal_cols = ['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']
    df_encoded[ordinal_cols] = ordinal_encoder.transform(df_encoded[ordinal_cols])

    # ✅ Encode binary columns before reordering
    df_encoded['Smoking Status'] = np.where(df_encoded['Smoking Status'] == 'Yes', 1, 0)
    df_encoded['Gender'] = np.where(df_encoded['Gender'] == 'Male', 1, 0)

    # ✅ Ensure all expected one-hot encoded columns exist
    for col in expected_model_columns:
        if col not in df_encoded.columns:
            df_encoded[col] = 0  # Add missing columns with default value 0

    # ✅ Reorder columns to match model training data
    df_encoded = df_encoded.reindex(columns=expected_model_columns)

    # ✅ Apply MinMax Scaling
    numerical_columns_scaled = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
                                'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
    df_encoded[numerical_columns_scaled] = scalar.transform(df_encoded[numerical_columns_scaled])

    return df_encoded

# ✅ Example Input Data (Remove 'Policy Start Date')
input_data_corrected = [  
    40, 'Female', 123751, 'Single', "Master's", 'Self-Employed',  
    24.9553, 'Suburban', 'Premium', 0, 8, 420, 2, 2, 'Good', 'Yes', 'Rarely', 'Condo'
]

# ✅ Fit MinMaxScaler
scalar = MinMaxScaler()
required_columns = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
                    'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
scalar.fit(OHE_data[required_columns])

# ✅ Fit Ordinal Encoder on Training Data
categories = [
    ['High School', "Bachelor's", "Master's", 'PhD'],  # Education Level
    ['Basic', 'Comprehensive', 'Premium'],  # Policy Type
    ['Poor', 'Average', 'Good'],  # Customer Feedback
    ['Rarely', 'Monthly', 'Weekly', 'Daily']  # Exercise Frequency
]
ordinal_encoder = OrdinalEncoder(categories=categories, handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(OHE_data[['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']].astype(str))

# ✅ Define expected columns for model input
expected_model_columns = [
    'Age', 'Gender', 'Annual Income', 'Education Level', 'Health Score', 'Policy Type', 
    'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Number of Dependents', 
    'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 
    'Marital Status_Divorced', 'Marital Status_Married', 'Marital Status_Single', 
    'Occupation_Employed', 'Occupation_Self-Employed', 'Occupation_Unemployed', 
    'Location_Rural', 'Location_Suburban', 'Location_Urban', 
    'Property Type_Apartment', 'Property Type_Condo', 'Property Type_House'
]
# print("Model expects:", model.feature_names_in_)
# print("Input columns:", encoded_input.columns.tolist())



# ✅ Transform input
encoded_input = transform_input(input_data_corrected, ordinal_encoder, scalar, expected_model_columns)

# ✅ Ensure correct order before prediction
encoded_input = encoded_input[[
    'Age', 'Gender', 'Annual Income', 'Number of Dependents', 'Education Level', 'Health Score',
    'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
    'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Marital Status_Divorced',
    'Marital Status_Married', 'Marital Status_Single', 'Occupation_Employed', 'Occupation_Self-Employed',
    'Occupation_Unemployed', 'Location_Rural', 'Location_Suburban', 'Location_Urban',
    'Property Type_Apartment', 'Property Type_Condo', 'Property Type_House'
]]

# ✅ Convert to float (important for model compatibility)
encoded_input = encoded_input.astype(np.float32)

# ✅ Predict premium
predicted_premium = model.predict(encoded_input)
# print(f"Final Predicted Premium: {predicted_premium[0]:.2f}")
missing_from_input = [col for col in model.feature_names_in_ if col not in encoded_input.columns]
extra_in_input = [col for col in encoded_input.columns if col not in model.feature_names_in_]
print("❌ Missing from input:", missing_from_input)
print("❌ Extra in input:", extra_in_input)

for col in missing_from_input:
    encoded_input[col] = 0  # Add missing columns with default 0
# ✅ Predict premium
predicted_premium = model.predict(encoded_input)
print(f"Final Predicted Premium: {predicted_premium[0]:.2f}")


❌ Missing from input: []
❌ Extra in input: []
Final Predicted Premium: 2225.68


In [38]:

print(encoded_input)

    Age  Gender  Annual Income  Number of Dependents  Education Level  \
0  40.0     0.0       123751.0                   2.0              2.0   

   Health Score  Policy Type  Previous Claims  Vehicle Age  Credit Score  ...  \
0     24.955299          2.0              0.0          8.0         420.0  ...   

   Marital Status_Single  Occupation_Employed  Occupation_Self-Employed  \
0                    1.0                  0.0                       1.0   

   Occupation_Unemployed  Location_Rural  Location_Suburban  Location_Urban  \
0                    0.0             0.0                1.0             0.0   

   Property Type_Apartment  Property Type_Condo  Property Type_House  
0                      0.0                  1.0                  0.0  

[1 rows x 26 columns]


In [22]:
y_train = OHE_data['Premium Amount']
x_train = OHE_data.drop(['Premium Amount', 'id', 'Policy Start Date'], axis=1)
model = xgb.XGBRegressor()
model.fit(x_train, y_train)


In [None]:
import xgboost as xgb
import pickle
import pandas as pd

# Load trained model (if saved)
# model = pickle.load(open("trained_model.pkl", "rb"))

y_train = OHE_data['Premium Amount']
x_train = OHE_data.drop(['Premium Amount', 'id', 'Policy Start Date'], axis=1)

# Train a new XGBoost model (if not loaded)
model = xgb.XGBRegressor()
model.fit(x_train, y_train)

# ✅ Example Input Data (Remove 'Policy Start Date')
input_data_corrected = [  
    40, 'Female', 123751, 'Single', "Master's", 'Self-Employed',  
    24.9553, 'Suburban', 'Premium', 0, 8, 420, 2, 2, 'Good', 'Yes', 'Rarely', 'Condo'
]

# Transform input for model
encoded_input = transform_input(input_data_corrected, ordinal_encoder, scalar)

# ✅ Ensure all required columns exist in `encoded_input`
missing_cols = set(x_train.columns) - set(encoded_input.columns)
for col in missing_cols:
    print(f"⚠️ Missing column: {col}. Adding with default value 0.")
    encoded_input[col] = 0  # Add missing columns with 0s

# ✅ Ensure column order matches training data
encoded_input = encoded_input[x_train.columns]

# Make Prediction
predicted_premium = model.predict(encoded_input)
print(f"Predicted Premium: {predicted_premium[0]}")


In [46]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ Load Test Data
test_data = pd.read_csv("CSV/train.csv")

# ✅ Define Expected Columns (Excluding 'Policy Start Date')
expected_columns = [
    'Age', 'Gender', 'Annual Income', 'Marital Status', 'Education Level', 'Occupation', 
    'Health Score', 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age', 
    'Credit Score', 'Insurance Duration', 'Number of Dependents', 'Customer Feedback', 
    'Smoking Status', 'Exercise Frequency', 'Property Type'
]

# ✅ Fit MinMaxScaler on Training Data
scaler = MinMaxScaler()
required_columns = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
                    'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']
scaler.fit(OHE_data[required_columns])  # Fit using training data

# ✅ Fit Ordinal Encoder on Training Data
categories = [
    ['High School', "Bachelor's", "Master's", 'PhD'],  # Education Level
    ['Basic', 'Comprehensive', 'Premium'],  # Policy Type
    ['Poor', 'Average', 'Good'],  # Customer Feedback
    ['Rarely', 'Monthly', 'Weekly', 'Daily']  # Exercise Frequency
]
ordinal_encoder = OrdinalEncoder(categories=categories, handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(OHE_data[['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']].astype(str))

# ✅ Define Expected Model Columns (Order Matters!)
expected_model_columns = [
    'Age', 'Gender', 'Annual Income', 'Number of Dependents', 'Education Level', 'Health Score',
    'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
    'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Marital Status_Divorced',
    'Marital Status_Married', 'Marital Status_Single', 'Occupation_Employed', 'Occupation_Self-Employed',
    'Occupation_Unemployed', 'Location_Rural', 'Location_Suburban', 'Location_Urban',
    'Property Type_Apartment', 'Property Type_Condo', 'Property Type_House'
]


# ✅ Apply Transformation to Test Data
X_test = transform_input(test_data, ordinal_encoder, scaler, expected_model_columns)

# ✅ Convert to float (important for XGBoost)
X_test = X_test.astype(np.float32)

# ✅ Load Ground Truth Values (Assuming 'Premium Amount' is in Test Data)
y_test = test_data['Premium Amount']

# ✅ Make Predictions
y_pred = model.predict(X_test)

# ✅ Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📊 Model Evaluation on `test.csv`:")
print(f"✔ Mean Absolute Error (MAE): {mae:.2f}")
print(f"✔ Mean Squared Error (MSE): {mse:.2f}")
print(f"✔ R² Score: {r2:.4f}")


📊 Model Evaluation on `test.csv`:
✔ Mean Absolute Error (MAE): 3235.71
✔ Mean Squared Error (MSE): 11890529.44
✔ R² Score: -14.8917


In [52]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ✅ Load Data
df = pd.read_csv("CSV/train.csv")  # Change this to your actual dataset


# ✅ Drop Unnecessary Columns
df = df.drop(['id', 'Policy Start Date'], axis=1)

# ✅ Define Categorical & Numerical Columns
categorical_cols = ['Marital Status', 'Occupation', 'Location', 'Property Type']
ordinal_cols = ['Education Level', 'Policy Type', 'Customer Feedback', 'Exercise Frequency']
numerical_cols = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
                  'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']

# ✅ Define Ordinal Encoding Categories
ordinal_categories = [
    ['High School', "Bachelor's", "Master's", 'PhD'],  # Education Level
    ['Basic', 'Comprehensive', 'Premium'],  # Policy Type
    ['Poor', 'Average', 'Good'],  # Customer Feedback
    ['Rarely', 'Monthly', 'Weekly', 'Daily']  # Exercise Frequency
]

# ✅ Encode Binary Features
df['Smoking Status'] = np.where(df['Smoking Status'] == 'Yes', 1, 0)
df['Gender'] = np.where(df['Gender'] == 'Male', 1, 0)

# ✅ Train-Test Split
X = df.drop(columns=['Premium Amount'])  # Features
y = df['Premium Amount']  # Target Variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ✅ Fit Encoders & Scalers on Training Data
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories, handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(X_train[ordinal_cols].astype(str))

scaler = MinMaxScaler()
scaler.fit(X_train[numerical_cols])

# ✅ Transform Data Function
def transform_input(df):
    df_encoded = df.copy()
    
    # Apply One-Hot Encoding
    df_encoded = pd.get_dummies(df_encoded, columns=categorical_cols, prefix=categorical_cols)
    
    # Apply Ordinal Encoding
    df_encoded[ordinal_cols] = ordinal_encoder.transform(df_encoded[ordinal_cols])
    
    # Apply Scaling
    df_encoded[numerical_cols] = scaler.transform(df_encoded[numerical_cols])
    
    # Ensure all expected columns exist
    for col in expected_columns:
        if col not in df_encoded.columns:
            df_encoded[col] = 0  # Fill missing columns with 0
    
    # Reorder columns
    df_encoded = df_encoded[expected_columns]
    
    return df_encoded

# ✅ Define Expected Columns (After Encoding)
expected_columns = [
    'Age', 'Gender', 'Annual Income', 'Number of Dependents', 'Education Level', 'Health Score',
    'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
    'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Marital Status_Divorced',
    'Marital Status_Married', 'Marital Status_Single', 'Occupation_Employed', 'Occupation_Self-Employed',
    'Occupation_Unemployed', 'Location_Rural', 'Location_Suburban', 'Location_Urban',
    'Property Type_Apartment', 'Property Type_Condo', 'Property Type_House'
]

# ✅ Preprocess Train & Test Data
X_train_encoded = transform_input(X_train)
X_test_encoded = transform_input(X_test)

# ✅ Train XGBoost Model
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)
model.fit(X_train_encoded, y_train)

# ✅ Predict on Test Set
y_pred = model.predict(X_test_encoded)

# ✅ Evaluate Model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"📊 Model Evaluation:")
print(f"✔ Mean Absolute Error (MAE): {mae:.2f}")
print(f"✔ Mean Squared Error (MSE): {mse:.2f}")
print(f"✔ R² Score: {r2:.4f}")

# ✅ Example Input Data (Remove 'Policy Start Date')
input_data_corrected = pd.DataFrame([{
    'Age': 40, 'Gender': 'Female', 'Annual Income': 123751, 'Marital Status': 'Single',
    'Education Level': "Master's", 'Occupation': 'Self-Employed', 'Health Score': 24.9553,
    'Location': 'Suburban', 'Policy Type': 'Premium', 'Previous Claims': 0, 'Vehicle Age': 8,
    'Credit Score': 420, 'Insurance Duration': 2, 'Number of Dependents': 2,
    'Customer Feedback': 'Good', 'Smoking Status': 'Yes', 'Exercise Frequency': 'Rarely',
    'Property Type': 'Condo'
}])

# ✅ Transform input for model
encoded_input = transform_input(input_data_corrected)

# ✅ Ensure column order matches training data
encoded_input = encoded_input[X_train_encoded.columns]

# Make Prediction
predicted_premium = model.predict(encoded_input)
print(f"💰 Predicted Premium: {predicted_premium[0]:.2f}")



📊 Model Evaluation:
✔ Mean Absolute Error (MAE): 637.66
✔ Mean Squared Error (MSE): 708317.44
✔ R² Score: 0.0521


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Gender: object, Smoking Status: object