# Importing the Required Libraries

In [3]:
# !pip install bayesian-optimization

In [4]:
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import  LabelEncoder

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import  LabelEncoder, MinMaxScaler

from sklearn.tree import DecisionTreeRegressor

from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score, root_mean_squared_log_error

from bayes_opt import BayesianOptimization

import pickle

import warnings 
warnings.filterwarnings('ignore')

In [5]:
train_file = r'C:\Users\sarav\Smart_Premium\Research_Data\train.csv'

In [6]:
data = pd.read_csv(train_file)

In [7]:
data.shape

(1200000, 21)

In [8]:
data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


# Data Preprocessing

In [9]:
data.isnull().sum()

id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64

In [10]:
numerical_features = data.select_dtypes(include = ['int64', 'float64']).columns
categorical_features = data.select_dtypes(include = 'object').columns

In [11]:
for col in numerical_features:
    data[col].fillna(data[col].mean(), inplace=True)
        
for col in categorical_features:
    data[col].fillna(data[col].mode()[0], inplace=True)

In [12]:
data.isnull().sum()

id                      0
Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
Premium Amount          0
dtype: int64

In [13]:
num_col = data[numerical_features].columns
num_col

Index(['id', 'Age', 'Annual Income', 'Number of Dependents', 'Health Score',
       'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
       'Premium Amount'],
      dtype='object')

In [14]:
# Outlier Detection and Handling

def remove_outliers_iqr(df, columns):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

    return df

In [15]:
cleaned_data = remove_outliers_iqr(data.copy(), num_col)
cleaned_data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,Employed,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,592.92435,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,96871.0,Married,2.0,Bachelor's,Employed,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [16]:
cleaned_data.to_csv('C:\\Users\\sarav\\Smart_Premium\\Research_Data\\Cleaned_data.csv', index = False)

In [17]:
# Encoding Data

def age_category(data):
    if 18 < data <= 30:
        return '18-30'
    elif 30 < data <= 40:
        return '31-40'
    elif 40 < data <= 50:
        return '41-50'
    elif 50 < data <= 64:
        return '51-64'
    else:
        return '<64'

In [18]:
def dependent_category(data):
    if data == 0:
        return '0'
    elif 0 < data <= 2:
        return '0-2'
    elif 2 < data <= 3:
        return '2-3'
    else:
        return '<3'

In [19]:
def health_category(data):
    if 0 < data <= 15:
        return '0-15'
    elif 15 < data <= 25:
        return '15-25'
    elif 25 < data <= 35:
        return '15-35'
    else:
        return '<35'

In [20]:
def claims(data):
    if 0 < data <= 1:
        return '0-1'
    elif 1 < data <= 2:
        return '1-2'
    else:
        return '<2'

In [21]:
def vehicle(data):
    if 0 < data <= 5:
        return '0-5'
    elif 5 < data <= 10:
        return '5-10'
    elif 10 < data <= 20:
        return '10-20'
    else:
        return '<20'

In [22]:
def credit(data):
    if 0 < data <= 300:
        return '0-300'
    elif 300 < data <= 600:
        return '300-600'
    elif 600 < data < 800:
        return '600-800'
    else:
        return '<800'

In [23]:
def insurance(data):
    if 0 < data <= 3:
        return '0-3'
    elif 3 < data <= 6:
        return '3-6'
    elif 6 < data < 9:
        return '6-9'
    else:
        return '<9'

In [24]:
cleaned_data['Age_Group'] = cleaned_data['Age'].apply(age_category)

cleaned_data['Dependent_Group'] = cleaned_data['Number of Dependents'].apply(dependent_category)

cleaned_data['Health_Group'] = cleaned_data['Health Score'].apply(health_category)

cleaned_data['Prev_Claims_Group'] = cleaned_data['Previous Claims'].apply(claims)

cleaned_data['Vehicle_Group'] = cleaned_data['Vehicle Age'].apply(vehicle)

cleaned_data['Credit_Group'] = cleaned_data['Credit Score'].apply(credit)

cleaned_data['Insurance_Group'] = cleaned_data['Insurance Duration'].apply(insurance)


In [25]:
mappings = {
    "Education Level":{"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3},
    "Customer Feedback":{"Poor": 0, "Average": 1, "Good": 2},
    "Exercise Frequency":{"Rarely": 0, "Weekly": 1, "Monthly": 2, "Daily": 3 },
    "Policy Type":{"Basic": 0, "Comprehensive": 1, "Premium": 2}
}

In [26]:
cleaned_data.replace(mappings, inplace = True)

In [27]:
columns_to_encode = cleaned_data[['Age_Group', 'Dependent_Group', 'Health_Group', 'Prev_Claims_Group', 'Vehicle_Group', 'Credit_Group', 'Insurance_Group', 'Gender', 'Marital Status', 'Occupation', 'Location', 'Smoking Status', 'Property Type']]

In [28]:
le = LabelEncoder()

for i in columns_to_encode.columns:
    cleaned_data[i] = le.fit_transform(cleaned_data[i])

In [29]:
cleaned_data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Exercise Frequency,Property Type,Premium Amount,Age_Group,Dependent_Group,Health_Group,Prev_Claims_Group,Vehicle_Group,Credit_Group,Insurance_Group
0,0,19.0,0,10049.0,1,1.0,1,1,22.598761,2,...,1,2,2869.0,0,1,1,1,1,1,1
1,1,39.0,0,31678.0,0,3.0,2,0,15.569731,0,...,2,2,1483.0,1,2,1,0,1,2,0
2,2,23.0,1,25602.0,0,3.0,0,1,47.177549,1,...,1,2,567.0,0,2,3,0,1,1,0
3,3,21.0,1,96871.0,1,2.0,1,0,10.938144,0,...,3,0,765.0,0,1,0,0,3,1,0
4,4,21.0,1,39651.0,2,1.0,1,1,20.376094,0,...,1,2,2022.0,0,1,1,2,2,1,1


In [30]:
encoded_data = pd.DataFrame({
    'Age': cleaned_data['Age_Group'],
    'Gender': cleaned_data['Gender'],
    'Annual Income': cleaned_data['Annual Income'],
    'Marital Status': cleaned_data['Marital Status'],
    'Number of Dependents': cleaned_data['Dependent_Group'],
    'Education Level': cleaned_data['Education Level'],
    'Occupation': cleaned_data['Occupation'],
    'Health Score': cleaned_data['Health_Group'],
    'Location': cleaned_data['Location'],
    'Policy Type': cleaned_data['Policy Type'],
    'Previous Claims': cleaned_data['Prev_Claims_Group'],
    'Vehicle Age': cleaned_data['Vehicle_Group'],
    'Credit Score': cleaned_data['Credit_Group'],
    'Insurance Duration': cleaned_data['Insurance_Group'],
    'Customer Feedback': cleaned_data['Customer Feedback'],
    'Smoking Status': cleaned_data['Smoking Status'],
    'Exercise Frequency': cleaned_data['Exercise Frequency'],
    'Property Type': cleaned_data['Property Type'],
    'Premium Amount': cleaned_data['Premium Amount']
})

In [31]:
encoded_data.head()


Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,0,10049.0,1,1,1,1,1,2,2,1,1,1,1,0,0,1,2,2869.0
1,1,0,31678.0,0,2,2,0,1,0,1,0,1,2,0,1,1,2,2,1483.0
2,0,1,25602.0,0,2,0,1,3,1,2,0,1,1,0,2,1,1,2,567.0
3,0,1,96871.0,1,1,1,0,0,0,0,0,3,1,0,0,1,3,0,765.0
4,0,1,39651.0,2,1,1,1,1,0,2,2,2,1,1,0,1,1,2,2022.0


In [32]:
encoded_data.to_csv("C:\\Users\\sarav\\Smart_Premium\\Research_Data\\Encoded_data.csv", index = False)

In [33]:
def log_transform(data, columns_to_transform):
    for col in columns_to_transform:
        data[f'{col}_log'] = np.log1p(data[col])  
        data.drop(columns=[col], inplace=True)  
        data.rename(columns = {f'{col}_log': col}, inplace=True)
    
    return data

In [34]:
transformed_data = log_transform(encoded_data, ['Annual Income'])

In [35]:
transformed_data.head()

Unnamed: 0,Age,Gender,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount,Annual Income
0,0,0,1,1,1,1,1,2,2,1,1,1,1,0,0,1,2,2869.0,9.215328
1,1,0,0,2,2,0,1,0,1,0,1,2,0,1,1,2,2,1483.0,10.363409
2,0,1,0,2,0,1,3,1,2,0,1,1,0,2,1,1,2,567.0,10.150465
3,0,1,1,1,1,0,0,0,0,0,3,1,0,0,1,3,0,765.0,11.481146
4,0,1,2,1,1,1,1,0,2,2,2,1,1,0,1,1,2,2022.0,10.587897


In [36]:
transformed_data.to_csv("C:\\Users\\sarav\\Smart_Premium\\Research_Data\\Transformed_Data.csv")

In [37]:
def scaling(data, columns_to_transform):
    scale = MinMaxScaler()
    for col in columns_to_transform:
        data[f'{col}_log'] = scale.fit_transform(data[[col]])
        data.drop(columns=[col], inplace=True)  
        data.rename(columns={f'{col}_log': col}, inplace=True)  
    
    return data

In [38]:
scaled_data = scaling(transformed_data, ['Annual Income','Premium Amount'])

In [39]:
scaled_data.head()

Unnamed: 0,Age,Gender,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Annual Income,Premium Amount
0,0,0,1,1,1,1,1,2,2,1,1,1,1,0,0,1,2,0.789969,0.955559
1,1,0,0,2,2,0,1,0,1,0,1,2,0,1,1,2,2,0.896391,0.490693
2,0,1,0,2,0,1,3,1,2,0,1,1,0,2,1,1,2,0.876652,0.183465
3,0,1,1,1,1,0,0,0,0,0,3,1,0,0,1,3,0,1.0,0.249874
4,0,1,2,1,1,1,1,0,2,2,2,1,1,0,1,1,2,0.9172,0.671474


In [40]:
scaled_data.to_csv("C:\\Users\\sarav\\Smart_Premium\\Research_Data\\Scaled_Data.csv")

In [41]:
# Encoded_dataset = log_transform(encoded_data, ['Annual Income', 'Premium Amount']).astype(int)

In [42]:
# Encoded_dataset.to_csv("C:\\Users\\sarav\\Smart_Premium\\Research_data\\Encoded_dataset.csv", index = False)

In [43]:
# Encoded_dataset.head()

# Model Building

In [44]:
X = scaled_data.drop('Premium Amount', axis = 1)
Y = scaled_data['Premium Amount']

In [45]:
# ! pip install optuna

In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from bayes_opt import BayesianOptimization
from joblib import dump
from sklearn.feature_selection import SelectKBest, f_regression

# Split the dataset
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=45)

X_train = X_train.reshape(-1, 1) if len(X_train.shape) == 1 else X_train
X_val = X_val.reshape(-1, 1) if len(X_val.shape) == 1 else X_val

Y_train = np.log1p(Y_train)
Y_val = np.log1p(Y_val)

# Feature Selection
selector = SelectKBest(f_regression, k=min(10, X_train.shape[1]))  # Keep top 10 features or all if less
X_train = selector.fit_transform(X_train, Y_train)
X_val = selector.transform(X_val)

# Model configurations
model_params = {
    "LinearRegression": {
        "model": LinearRegression(),
        "params": {}
    },
    "DecisionTreeRegressor": {
        "model": DecisionTreeRegressor(),
        "params": {
            "max_depth": (4, 10),
            "min_samples_split": (2, 10)
        }
    },
    "RandomForestRegressor": {
        "model": RandomForestRegressor(n_jobs=-1),
        "params": {
            "n_estimators": (10, 50),
            "max_depth": (4, 10),
            "min_samples_split": (2, 10)
        }
    },
    "XGBRegressor": {
        "model": XGBRegressor(n_jobs=-1),
        "params": {
            "n_estimators": (10, 50),
            "max_depth": (4, 10),
            "learning_rate": (0.01, 0.2)
        }
    }
}

# Evaluation function
def evaluate_model(model, params):
    model.set_params(**params)
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_val)

    Y_val_exp = np.expm1(Y_val)
    Y_pred_exp = np.expm1(Y_pred)

    return np.sqrt(mean_squared_error(Y_val_exp, Y_pred_exp))

# Training models
results = {}

for model_name, mp in model_params.items():
    print(f"Optimizing {model_name}...")

    def objective(**params):
        if "max_depth" in params:
            params["max_depth"] = int(params["max_depth"])
        if "n_estimators" in params:
            params["n_estimators"] = int(params["n_estimators"])
        if "min_samples_split" in params:
            params["min_samples_split"] = int(params["min_samples_split"])

        valid_params = {k: v for k, v in params.items() if k in mp["params"]}

        return -evaluate_model(mp["model"], valid_params)  # Negative for Bayesian Optimization

    if mp["params"]:
        optimizer = BayesianOptimization(
            f=objective,
            pbounds=mp["params"],
            random_state=42
        )
        optimizer.maximize(init_points=3, n_iter=5)  # Reduced search space

        best_params = {k: int(v) if "depth" in k or "n_estimators" in k else v for k, v in optimizer.max["params"].items()}
        best_rmsle = -optimizer.max["target"]  # Convert back to positive

        best_model = mp["model"].set_params(**best_params)
    else:
        best_model = mp["model"]
        best_model.fit(X_train, Y_train)
        best_rmsle = evaluate_model(best_model, {})

    Y_pred = best_model.predict(X_val)
    Y_val_exp = np.expm1(Y_val)
    Y_pred_exp = np.expm1(Y_pred)

    results[model_name] = {
        "Best Model": best_model,
        "Best Params": best_params if mp["params"] else "Default",
        "RMSLE": best_rmsle,
        "RMSE": np.sqrt(mean_squared_error(Y_val_exp, Y_pred_exp)),
        "MAE": mean_absolute_error(Y_val_exp, Y_pred_exp),
        "R2 SCORE": r2_score(Y_val_exp, Y_pred_exp)
    }

# Find best model
best_model_name = min(results, key=lambda x: results[x]["RMSLE"])
best_model_object = results[best_model_name]["Best Model"]

# Print results
for model, metrics in results.items():
    print(f"Model: {model}")
    print(f"RMSLE: {metrics['RMSLE']:.4f}")
    print(f"RMSE: {metrics['RMSE']:.4f}")
    print(f"MAE: {metrics['MAE']:.4f}")
    print(f"R2 SCORE: {metrics['R2 SCORE']:.4f}")
    print(f"Best Parameters: {metrics['Best Params']}")
    print("--" * 10)

print(f"The Best Model: {best_model_name} with RMSLE = {results[best_model_name]['RMSLE']:.4f}")


dump(best_model_object, "best_model.pkl")
print(f"Best model '{best_model_name}' saved to 'best_model.pkl'")


Optimizing LinearRegression...
Optimizing DecisionTreeRegressor...
|   iter    |  target   | max_depth | min_sa... |
-------------------------------------------------
| [39m1        [39m | [39m-0.2676  [39m | [39m6.247    [39m | [39m9.606    [39m |
| [35m2        [39m | [35m-0.2674  [39m | [35m8.392    [39m | [35m6.789    [39m |
| [39m3        [39m | [39m-0.2679  [39m | [39m4.936    [39m | [39m3.248    [39m |
| [39m4        [39m | [39m-0.2674  [39m | [39m8.405    [39m | [39m6.713    [39m |
| [39m5        [39m | [39m-0.2674  [39m | [39m9.995    [39m | [39m9.623    [39m |
| [39m6        [39m | [39m-0.2675  [39m | [39m9.981    [39m | [39m2.155    [39m |
| [39m7        [39m | [39m-0.2674  [39m | [39m9.988    [39m | [39m6.49     [39m |
| [39m8        [39m | [39m-0.2674  [39m | [39m9.939    [39m | [39m4.278    [39m |
Optimizing RandomForestRegressor...
|   iter    |  target   | max_depth | min_sa... | n_esti... |
---------------

# Save the best model

In [48]:
pickle_path = "C:\\Users\\sarav\\Smart_Premium\\pickles\\best_model.pkl"

with open(pickle_path, "wb") as file:
    pickle.dump(best_model_object, file)

print('best_model.pkl saved successfully...')

best_model.pkl saved successfully...
