## SMART PREMIUM - PREDICTING INSURANCE COST WITH MACHINE LEARNING

### INSTALL AND IMPORT REQUIRED PACKAGES

In [1]:
# !pip install pandas numpy streamlit scikit-learn xgboost mlflow matplotlib seaborn

In [2]:
# BASIC LIBRARIES
import pandas as pd
import numpy as np
import streamlit as st

# DATA VISUALIZATION
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# IGNORE WARNING MESSAGES
import warnings
warnings.filterwarnings("ignore")

# MODEL TRAINING
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_regression

# REGRESSION MODELS
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor

# MODEL EVALUATION
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# EXPERIEMENT TRACKING
import mlflow
import mlflow.sklearn

## 1) UNDERSTANDING THE DATA FOR TRAINING AND TESTING

### 1.1) LOAD AND EXPLORE THE DATASET

In [3]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [4]:
train_data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [5]:
test_data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [6]:
train_data.shape

(1200000, 21)

In [7]:
test_data.shape

(800000, 20)

In [8]:
# train_data.info()

In [9]:
# test_data.info()

In [10]:
train_data.describe()

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount
count,1200000.0,1181295.0,1155051.0,1090328.0,1125924.0,835971.0,1199994.0,1062118.0,1199999.0,1200000.0
mean,599999.5,41.14556,32745.22,2.009934,25.61391,1.002689,9.569889,592.9244,5.018219,1102.545
std,346410.3,13.53995,32179.51,1.417338,12.20346,0.98284,5.776189,149.9819,2.594331,864.9989
min,0.0,18.0,1.0,0.0,2.012237,0.0,0.0,300.0,1.0,20.0
25%,299999.8,30.0,8001.0,1.0,15.91896,0.0,5.0,468.0,3.0,514.0
50%,599999.5,41.0,23911.0,2.0,24.57865,1.0,10.0,595.0,5.0,872.0
75%,899999.2,53.0,44634.0,3.0,34.52721,2.0,15.0,721.0,7.0,1509.0
max,1199999.0,64.0,149997.0,4.0,58.97591,9.0,19.0,849.0,9.0,4999.0


In [11]:
test_data.describe()

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration
count,800000.0,787511.0,770140.0,726870.0,750551.0,557198.0,799997.0,708549.0,799998.0
mean,1600000.0,41.13644,32803.871471,2.009337,25.613036,1.004873,9.571891,592.904749,5.018949
std,230940.3,13.537829,32201.063749,1.415241,12.206882,0.982803,5.7722,150.116374,2.593759
min,1200000.0,18.0,2.0,0.0,1.646561,0.0,0.0,300.0,1.0
25%,1400000.0,30.0,8048.0,1.0,15.917353,0.0,5.0,468.0,3.0
50%,1600000.0,41.0,23981.0,2.0,24.580164,1.0,10.0,595.0,5.0
75%,1799999.0,53.0,44660.0,3.0,34.517766,2.0,15.0,721.0,7.0
max,1999999.0,64.0,149997.0,4.0,57.957351,9.0,19.0,849.0,9.0


### TARGET VARIABLE FOR TRAIN DATA

In [12]:
train_data["Premium Amount"].describe()

count    1.200000e+06
mean     1.102545e+03
std      8.649989e+02
min      2.000000e+01
25%      5.140000e+02
50%      8.720000e+02
75%      1.509000e+03
max      4.999000e+03
Name: Premium Amount, dtype: float64

### 1.2) EXPLORATORY DATA ANALYSIS

#### CHECK MISSING VALUES

In [13]:
train_data.isnull().sum()

id                           0
Age                      18705
Gender                       0
Annual Income            44949
Marital Status           18529
Number of Dependents    109672
Education Level              0
Occupation              358075
Health Score             74076
Location                     0
Policy Type                  0
Previous Claims         364029
Vehicle Age                  6
Credit Score            137882
Insurance Duration           1
Policy Start Date            0
Customer Feedback        77824
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium Amount               0
dtype: int64

In [14]:
test_data.isnull().sum()

id                           0
Age                      12489
Gender                       0
Annual Income            29860
Marital Status           12336
Number of Dependents     73130
Education Level              0
Occupation              239125
Health Score             49449
Location                     0
Policy Type                  0
Previous Claims         242802
Vehicle Age                  3
Credit Score             91451
Insurance Duration           2
Policy Start Date            0
Customer Feedback        52276
Smoking Status               0
Exercise Frequency           0
Property Type                0
dtype: int64

#### TRAINING - PLOT FOR NUMERICAL DISTRIBUTION

In [15]:
num_cols = ["Age", "Annual Income", "Health Score", "Credit Score", "Premium Amount"]
train_data[num_cols].hist(figsize=(12, 8), bins=50)
plt.show()

#### CHECK CORRELATIONS

In [124]:
corr_matrix = train_data[num_cols].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()

#### CHECK CATEGORICAL FEATURE DISTRIBUTION

In [17]:
cat_cols = ["Gender", "Marital Status", "Education Level", "Policy Type", "Property Type"]

plt.figure(figsize=(14, 10))
for i, col in enumerate(cat_cols, 1):
    plt.subplot(3, 2, i)
    sns.countplot(y=train_data[col], order=train_data[col].value_counts().index, palette="viridis")
    plt.title(f"Distribution of {col}")
plt.tight_layout()
plt.show()

In [18]:
np.random.seed(42)
tr_data = pd.DataFrame({
    "Premium Amount": np.random.normal(1100, 850, 1000).clip(20, 5000),
    "Age": np.random.normal(40, 13, 1000).clip(18, 64),
    "Annual Income": np.random.normal(32000, 32000, 1000).clip(1000, 150000),
    "Credit Score": np.random.normal(600, 150, 1000).clip(300, 850)
})

# Histograms
plt.figure(figsize=(12, 6))
for i, col in enumerate(tr_data.columns, 1):
    plt.subplot(2, 2, i)
    sns.histplot(tr_data[col], bins=30, kde=True)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

# Boxplots
plt.figure(figsize=(12, 6))
for i, col in enumerate(["Premium Amount", "Annual Income", "Credit Score"], 1):
    plt.subplot(1, 3, i)
    sns.boxplot(y=tr_data[col])
    plt.title(f'Boxplot of {col}')
plt.tight_layout()
plt.show()

### TESTING - HANDLING MISSING VALUES

In [19]:
num_features = ["Age", "Annual Income", "Number of Dependents", "Health Score", "Credit Score", "Previous Claims"]
for col in num_features:
    test_data[col].fillna(test_data[col].median(), inplace=True)

cat_features = ["Marital Status", "Occupation", "Customer Feedback"]
for col in cat_features:
    test_data[col].fillna(test_data[col].mode()[0], inplace=True)

### ENCODING CATEGORICAL VARIABLE

#### LABEL ENCODING FOR BINARY CATEGORICAL FEATURES

In [20]:
binary_cols = ["Gender", "Smoking Status", "Property Type"]

test_data["Gender"] = test_data["Gender"].map({"Male": 0, "Female": 1})
test_data["Smoking Status"] = test_data["Smoking Status"].map({"No": 0, "Yes": 1})
test_data["Property Type"] = test_data["Property Type"].map({"House": 0, "Apartment": 1})

#### ONE-HOT ENCODING FOR MULTI-CLASS FEATURES

In [21]:
multi_cat_features = ["Marital Status", "Occupation", "Policy Type", "Exercise Frequency", "Location"]
test_data = pd.get_dummies(test_data, columns=multi_cat_features, drop_first=True)

#### FEATURE SCALING

In [22]:
scaler = StandardScaler()
num_cols = ["Age", "Annual Income", "Credit Score", "Health Score", "Insurance Duration"]

test_data[num_cols] = scaler.fit_transform(test_data[num_cols])

## 2) DATA PREPROCESSING FOR TRAINING AND TESTING

### 2.1) HANDLE MISSING VALUES

In [23]:
num_cols = ['Age', 'Annual Income', 'Health Score', 'Previous Claims', 
            'Vehicle Age', 'Credit Score', 'Insurance Duration']
cat_cols = ['Marital Status', 'Occupation', 'Customer Feedback']

train_data.columns = train_data.columns.str.strip()
test_data.columns = test_data.columns.str.strip()

for col in num_cols:
    if col in train_data.columns:
        train_data[col].fillna(train_data[col].median(), inplace=True)
    if col in test_data.columns:
        test_data[col].fillna(test_data[col].median(), inplace=True)

for col in cat_cols:
    if col in train_data.columns:
        train_data[col].fillna(train_data[col].mode()[0], inplace=True)
    if col in test_data.columns:
        test_data[col].fillna(test_data[col].mode()[0], inplace=True)

### 2.2) CONVERT CATEGORICAL VARIABLE TO NUMERICAL FORM

#### LABEL ENCODING

In [24]:
binary_cols = ['Gender', 'Smoking Status']
multi_class_cols = ['Marital Status', 'Occupation', 'Location', 'Policy Type', 'Customer Feedback', 'Property Type']

for col in binary_cols:
    if col in train_data.columns:
        label_encoder = LabelEncoder()
        train_data[col] = label_encoder.fit_transform(train_data[col])

        test_data[col] = test_data[col].apply(lambda x: x if x in label_encoder.classes_ else "Unknown")
        
        label_encoder.classes_ = np.append(label_encoder.classes_, "Unknown")
        
        test_data[col] = label_encoder.transform(test_data[col])

#### ONE-HOT ENCODING

In [25]:
if "Property Type" in test_data.columns:
    test_data.drop(columns=["Property Type"], inplace=True)

expected_columns = ['Property Type_Condo', 'Property Type_House']
for col in expected_columns:
    if col not in test_data.columns:
        test_data[col] = 0

train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

In [26]:
print("Existing columns in train_data:", train_data.columns)
print("multi_class_cols:", multi_class_cols)

Existing columns in train_data: Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Customer Feedback', 'Smoking Status', 'Exercise Frequency',
       'Property Type', 'Premium Amount'],
      dtype='object')
multi_class_cols: ['Marital Status', 'Occupation', 'Location', 'Policy Type', 'Customer Feedback', 'Property Type']


In [27]:
multi_class_cols = [col for col in multi_class_cols if col in train_data.columns]
print("Updated multi_class_cols:", multi_class_cols)


Updated multi_class_cols: ['Marital Status', 'Occupation', 'Location', 'Policy Type', 'Customer Feedback', 'Property Type']


In [28]:
train_data['Number of Dependents'].fillna(train_data['Number of Dependents'].median(), inplace=True)

In [29]:
if multi_class_cols:
    train_data = pd.get_dummies(train_data, columns=multi_class_cols, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=multi_class_cols, drop_first=True)
else:
    print("No categorical columns to encode. Skipping one-hot encoding.")

In [30]:
print(set(train_data.columns) - set(test_data.columns))

{'Marital Status_Married', 'Property Type_Condo', 'Policy Type_Premium', 'Marital Status_Single', 'Property Type_House', 'Occupation_Unemployed', 'Occupation_Self-Employed', 'Location_Suburban', 'Policy Type_Comprehensive', 'Location_Urban'}


In [31]:
print("Columns in test_data:", test_data.columns)
print("Property Type columns in test_data:", [col for col in test_data.columns if 'Property Type' in col])

expected_columns = ['Property Type_Condo', 'Property Type_House']
for col in expected_columns:
    if col not in test_data.columns:
        test_data[col] = 0  

train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

Columns in test_data: Index(['id', 'Age', 'Gender', 'Annual Income', 'Number of Dependents',
       'Education Level', 'Health Score', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Smoking Status', 'Exercise Frequency', 'Premium Amount',
       'Customer Feedback_Good', 'Customer Feedback_Poor'],
      dtype='object')
Property Type columns in test_data: []


In [32]:
train_data

Unnamed: 0,id,Age,Gender,Annual Income,Number of Dependents,Education Level,Health Score,Previous Claims,Vehicle Age,Credit Score,...,Occupation_Self-Employed,Occupation_Unemployed,Location_Suburban,Location_Urban,Policy Type_Comprehensive,Policy Type_Premium,Customer Feedback_Good,Customer Feedback_Poor,Property Type_Condo,Property Type_House
0,0,19.0,0,10049.0,1.0,Bachelor's,22.598761,2.0,17.0,372.0,...,True,False,False,True,False,True,False,True,False,True
1,1,39.0,0,31678.0,3.0,Master's,15.569731,1.0,12.0,694.0,...,False,False,False,False,True,False,False,False,False,True
2,2,23.0,1,25602.0,3.0,High School,47.177549,1.0,14.0,595.0,...,True,False,True,False,False,True,True,False,False,True
3,3,21.0,1,141855.0,2.0,Bachelor's,10.938144,1.0,0.0,367.0,...,False,False,False,False,False,False,False,True,False,False
4,4,21.0,1,39651.0,1.0,Bachelor's,20.376094,0.0,8.0,598.0,...,True,False,False,False,False,True,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,0,27316.0,0.0,Master's,13.772907,1.0,5.0,372.0,...,False,True,False,True,False,True,False,True,False,False
1199996,1199996,54.0,1,35786.0,2.0,Master's,11.483482,1.0,10.0,597.0,...,True,False,False,False,True,False,False,True,False,False
1199997,1199997,19.0,1,51884.0,0.0,Master's,14.724469,0.0,19.0,595.0,...,False,False,True,False,False,False,True,False,True,False
1199998,1199998,55.0,1,23911.0,1.0,PhD,18.547381,1.0,7.0,407.0,...,False,False,True,False,False,True,False,True,False,False


In [33]:
test_data

Unnamed: 0,id,Age,Gender,Annual Income,Number of Dependents,Education Level,Health Score,Previous Claims,Vehicle Age,Credit Score,...,Occupation_Self-Employed,Occupation_Unemployed,Location_Suburban,Location_Urban,Policy Type_Comprehensive,Policy Type_Premium,Customer Feedback_Good,Customer Feedback_Poor,Property Type_Condo,Property Type_House
0,1200000,-0.977856,2,-0.953410,4.0,Bachelor's,-1.512844,1.0,19.0,0.013135,...,0,0,0,0,0,0,False,True,0,0
1,1200001,-0.754504,2,2.957034,2.0,Master's,-1.028885,1.0,14.0,-1.565320,...,0,0,0,0,0,0,True,False,0,0
2,1200002,0.436704,2,-0.486196,0.0,PhD,-0.101018,1.0,16.0,1.598669,...,0,0,0,0,0,0,False,False,0,0
3,1200003,-0.977856,2,-0.064812,3.0,PhD,-1.726078,1.0,3.0,1.251834,...,0,0,0,0,0,0,False,True,0,0
4,1200004,-1.275658,2,-0.683075,2.0,High School,-1.158870,1.0,14.0,1.145659,...,0,0,0,0,0,0,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,1999995,0.660055,2,0.199359,1.0,Bachelor's,-0.934412,1.0,8.0,-2.011251,...,0,0,0,0,0,0,False,False,0,0
799996,1999996,-0.009999,2,1.295488,0.0,Master's,-1.471599,2.0,0.0,0.013135,...,0,0,0,0,0,0,True,False,0,0
799997,1999997,-1.126757,2,0.085448,0.0,Master's,-1.599211,1.0,10.0,0.013135,...,0,0,0,0,0,0,False,True,0,0
799998,1999998,-0.531153,2,0.416783,3.0,Master's,-0.812766,2.0,17.0,-0.892884,...,0,0,0,0,0,0,False,False,0,0


In [34]:
train_data.isnull().sum()

id                           0
Age                          0
Gender                       0
Annual Income                0
Number of Dependents         0
Education Level              0
Health Score                 0
Previous Claims              0
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Smoking Status               0
Exercise Frequency           0
Premium Amount               0
Marital Status_Married       0
Marital Status_Single        0
Occupation_Self-Employed     0
Occupation_Unemployed        0
Location_Suburban            0
Location_Urban               0
Policy Type_Comprehensive    0
Policy Type_Premium          0
Customer Feedback_Good       0
Customer Feedback_Poor       0
Property Type_Condo          0
Property Type_House          0
dtype: int64

In [35]:
train_data.isnull().sum().sum()

np.int64(0)

In [36]:
test_data.isnull().sum()

id                           0
Age                          0
Gender                       0
Annual Income                0
Number of Dependents         0
Education Level              0
Health Score                 0
Previous Claims              0
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Smoking Status               0
Exercise Frequency           0
Premium Amount               0
Marital Status_Married       0
Marital Status_Single        0
Occupation_Self-Employed     0
Occupation_Unemployed        0
Location_Suburban            0
Location_Urban               0
Policy Type_Comprehensive    0
Policy Type_Premium          0
Customer Feedback_Good       0
Customer Feedback_Poor       0
Property Type_Condo          0
Property Type_House          0
dtype: int64

In [37]:
test_data.isnull().sum().sum()

np.int64(0)

In [38]:
bool_cols = train_data.select_dtypes(include=['bool']).columns
train_data[bool_cols] = train_data[bool_cols].astype(int)
test_data[bool_cols] = test_data[bool_cols].astype(int)

In [39]:
train_data['Policy Start Date'] = pd.to_datetime(train_data['Policy Start Date'], errors='coerce')
test_data['Policy Start Date'] = pd.to_datetime(test_data['Policy Start Date'], errors='coerce')


In [40]:
train_gender_mode = train_data['Gender'].mode()[0]  # Get most frequent value
test_data['Gender'] = test_data['Gender'].apply(lambda x: x if x in [0, 1] else train_gender_mode)

train_smoking_mode = train_data['Smoking Status'].mode()[0]
test_data['Smoking Status'] = test_data['Smoking Status'].apply(lambda x: x if x in [0, 1] else train_smoking_mode)


In [41]:
bool_cols = train_data.select_dtypes(include=['bool']).columns
train_data[bool_cols] = train_data[bool_cols].astype(int)
test_data[bool_cols] = test_data[bool_cols].astype(int)


### CHECKING DATATYPES

In [42]:
print(train_data.dtypes)
print(test_data.dtypes)

id                                    int64
Age                                 float64
Gender                                int64
Annual Income                       float64
Number of Dependents                float64
Education Level                      object
Health Score                        float64
Previous Claims                     float64
Vehicle Age                         float64
Credit Score                        float64
Insurance Duration                  float64
Policy Start Date            datetime64[ns]
Smoking Status                        int64
Exercise Frequency                   object
Premium Amount                      float64
Marital Status_Married                int64
Marital Status_Single                 int64
Occupation_Self-Employed              int64
Occupation_Unemployed                 int64
Location_Suburban                     int64
Location_Urban                        int64
Policy Type_Comprehensive             int64
Policy Type_Premium             

In [43]:
for col in ['Gender', 'Smoking Status']: 
    print(f"Unique values in {col} (Train):", train_data[col].unique())
    print(f"Unique values in {col} (Test):", test_data[col].unique())


Unique values in Gender (Train): [0 1]
Unique values in Gender (Test): [1]
Unique values in Smoking Status (Train): [0 1]
Unique values in Smoking Status (Test): [1]


In [44]:
for col in train_data.columns:
    if 'Marital Status' in col or 'Occupation' in col:
        print(f"Unique values in {col} (Train):", train_data[col].unique())


Unique values in Marital Status_Married (Train): [1 0]
Unique values in Marital Status_Single (Train): [0 1]
Unique values in Occupation_Self-Employed (Train): [1 0]
Unique values in Occupation_Unemployed (Train): [0 1]


In [45]:
print("Columns in Train Data:", train_data.columns)
print("Columns in Test Data:", test_data.columns)


Columns in Train Data: Index(['id', 'Age', 'Gender', 'Annual Income', 'Number of Dependents',
       'Education Level', 'Health Score', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Smoking Status', 'Exercise Frequency', 'Premium Amount',
       'Marital Status_Married', 'Marital Status_Single',
       'Occupation_Self-Employed', 'Occupation_Unemployed',
       'Location_Suburban', 'Location_Urban', 'Policy Type_Comprehensive',
       'Policy Type_Premium', 'Customer Feedback_Good',
       'Customer Feedback_Poor', 'Property Type_Condo', 'Property Type_House'],
      dtype='object')
Columns in Test Data: Index(['id', 'Age', 'Gender', 'Annual Income', 'Number of Dependents',
       'Education Level', 'Health Score', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Smoking Status', 'Exercise Frequency', 'Premium Amount',
       'Marital Status_Married', 'Marital Status

In [46]:
print("Missing values in Train Data:\n", train_data.isnull().sum())
print("Missing values in Test Data:\n", test_data.isnull().sum())


Missing values in Train Data:
 id                           0
Age                          0
Gender                       0
Annual Income                0
Number of Dependents         0
Education Level              0
Health Score                 0
Previous Claims              0
Vehicle Age                  0
Credit Score                 0
Insurance Duration           0
Policy Start Date            0
Smoking Status               0
Exercise Frequency           0
Premium Amount               0
Marital Status_Married       0
Marital Status_Single        0
Occupation_Self-Employed     0
Occupation_Unemployed        0
Location_Suburban            0
Location_Urban               0
Policy Type_Comprehensive    0
Policy Type_Premium          0
Customer Feedback_Good       0
Customer Feedback_Poor       0
Property Type_Condo          0
Property Type_House          0
dtype: int64
Missing values in Test Data:
 id                           0
Age                          0
Gender                     

In [47]:
print("Are train and test columns identical?", set(train_data.columns) == set(test_data.columns))

Are train and test columns identical? True


### 2.3) SPLIT DATA INTO TRAINING AND TESTING/EVALUATION SETS

In [48]:
X = train_data.drop(columns=['Premium Amount', 'id'])  
y = train_data['Premium Amount']

# Split into training (80%) and validation (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [49]:
X_train

Unnamed: 0,Age,Gender,Annual Income,Number of Dependents,Education Level,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,...,Occupation_Self-Employed,Occupation_Unemployed,Location_Suburban,Location_Urban,Policy Type_Comprehensive,Policy Type_Premium,Customer Feedback_Good,Customer Feedback_Poor,Property Type_Condo,Property Type_House
404339,45.0,1,91315.0,3.0,Bachelor's,43.926883,1.0,1.0,311.0,2.0,...,0,0,0,1,1,0,0,1,0,1
748487,51.0,0,5280.0,1.0,PhD,41.854227,1.0,1.0,634.0,8.0,...,0,0,0,0,0,0,0,1,1,0
435951,48.0,0,41318.0,0.0,Master's,51.750192,1.0,12.0,759.0,9.0,...,0,0,0,1,1,0,0,1,1,0
311284,43.0,0,30372.0,0.0,PhD,20.666786,2.0,8.0,460.0,1.0,...,0,1,0,1,1,0,0,0,1,0
318790,18.0,1,12891.0,2.0,Bachelor's,36.995102,1.0,18.0,831.0,3.0,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,30.0,0,2789.0,4.0,High School,15.965779,1.0,18.0,328.0,5.0,...,0,0,1,0,0,0,1,0,0,0
259178,35.0,1,72512.0,2.0,Bachelor's,23.914243,0.0,16.0,338.0,9.0,...,0,0,0,1,1,0,0,1,0,1
131932,43.0,1,85463.0,0.0,Bachelor's,25.151808,0.0,8.0,849.0,3.0,...,0,0,0,1,0,1,0,0,1,0
671155,49.0,1,5908.0,0.0,Bachelor's,41.637977,0.0,2.0,444.0,1.0,...,0,0,0,0,1,0,1,0,1,0


In [50]:
X_test

Unnamed: 0,Age,Gender,Annual Income,Number of Dependents,Education Level,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,...,Occupation_Self-Employed,Occupation_Unemployed,Location_Suburban,Location_Urban,Policy Type_Comprehensive,Policy Type_Premium,Customer Feedback_Good,Customer Feedback_Poor,Property Type_Condo,Property Type_House
372605,52.0,0,5538.0,2.0,Master's,8.093145,0.0,18.0,497.0,3.0,...,0,0,0,0,0,0,0,0,0,0
551204,60.0,0,39711.0,4.0,Bachelor's,24.578648,0.0,3.0,340.0,3.0,...,0,0,1,0,1,0,0,1,0,0
240320,58.0,0,2364.0,2.0,Bachelor's,34.452482,2.0,19.0,595.0,7.0,...,0,0,0,1,1,0,0,1,0,1
1047361,39.0,1,9094.0,0.0,Bachelor's,26.241661,1.0,17.0,761.0,5.0,...,0,0,0,0,0,0,0,1,1,0
555362,47.0,1,43189.0,2.0,PhD,26.691921,1.0,4.0,319.0,4.0,...,0,0,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458271,46.0,1,32875.0,1.0,Bachelor's,32.140638,0.0,9.0,627.0,6.0,...,0,0,0,1,0,1,1,0,0,1
1124744,39.0,1,46820.0,1.0,PhD,6.990761,2.0,14.0,614.0,5.0,...,0,0,1,0,1,0,0,0,0,0
720454,18.0,0,27596.0,1.0,Bachelor's,14.538865,0.0,6.0,678.0,3.0,...,1,0,0,0,0,1,0,1,1,0
256458,30.0,1,110140.0,0.0,Bachelor's,24.578648,1.0,8.0,524.0,6.0,...,0,0,0,0,0,0,0,0,0,1


In [51]:
y_train

404339    2386.0
748487     285.0
435951     654.0
311284    1095.0
318790    1334.0
           ...  
110268     538.0
259178     819.0
131932     403.0
671155     542.0
121958    2619.0
Name: Premium Amount, Length: 960000, dtype: float64

In [52]:
y_test

372605     2742.0
551204     1347.0
240320     2196.0
1047361     684.0
555362     1714.0
            ...  
458271     1286.0
1124744      24.0
720454      971.0
256458      562.0
423212     1484.0
Name: Premium Amount, Length: 240000, dtype: float64

In [53]:
X_train.shape

(960000, 25)

In [54]:
X_test.shape

(240000, 25)

### 2.4) FEATURE SCALING

In [55]:
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns

scaler = StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print("Mean after scaling:\n", X_train[num_cols].mean())
print("Standard deviation after scaling:\n", X_train[num_cols].std())

Mean after scaling:
 Age                          1.409687e-16
Gender                       1.289746e-16
Annual Income                7.934394e-18
Number of Dependents        -1.273944e-16
Health Score                -1.001332e-15
Previous Claims              1.328049e-16
Vehicle Age                  6.781612e-17
Credit Score                 1.247706e-17
Insurance Duration          -9.965362e-17
Smoking Status               1.581698e-17
Marital Status_Married      -6.325311e-17
Marital Status_Single       -2.754093e-17
Occupation_Self-Employed     8.852178e-18
Occupation_Unemployed       -2.320366e-17
Location_Suburban           -3.125648e-17
Location_Urban               4.339677e-17
Policy Type_Comprehensive   -9.375833e-17
Policy Type_Premium          1.442309e-16
Customer Feedback_Good       1.300071e-17
Customer Feedback_Poor       3.143596e-17
Property Type_Condo          1.615004e-17
Property Type_House         -1.318945e-17
dtype: float64
Standard deviation after scaling:
 Age  

In [56]:
num_cols = ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 
            'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']

print("Mean after scaling:\n", np.round(X_train[num_cols].mean(), 4).abs())

print("Standard deviation after scaling:\n", np.round(X_train[num_cols].std(), 4))

Mean after scaling:
 Age                     0.0
Annual Income           0.0
Number of Dependents    0.0
Health Score            0.0
Previous Claims         0.0
Vehicle Age             0.0
Credit Score            0.0
Insurance Duration      0.0
dtype: float64
Standard deviation after scaling:
 Age                     1.0
Annual Income           1.0
Number of Dependents    1.0
Health Score            1.0
Previous Claims         1.0
Vehicle Age             1.0
Credit Score            1.0
Insurance Duration      1.0
dtype: float64


In [57]:
date_cols = X_train.select_dtypes(include=['datetime64[ns]']).columns
print("Datetime columns:", date_cols)

for col in date_cols:
    X_train[col] = X_train[col].astype('int64') // 10**9 
    X_test[col] = X_test[col].astype('int64') // 10**9

Datetime columns: Index(['Policy Start Date'], dtype='object')


In [58]:
# Identify categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Apply one-hot encoding
encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
encoded_train = encoder.fit_transform(X_train[categorical_cols])
encoded_test = encoder.transform(X_test[categorical_cols])

# Convert encoded arrays into DataFrames
encoded_train_df = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(categorical_cols), index=X_train.index)
encoded_test_df = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(categorical_cols), index=X_test.index)

# Drop original categorical columns and add encoded ones
X_train = pd.concat([X_train.drop(categorical_cols, axis=1), encoded_train_df], axis=1)
X_test = pd.concat([X_test.drop(categorical_cols, axis=1), encoded_test_df], axis=1)

print("✅ Encoding completed successfully.")

✅ Encoding completed successfully.


## 3) MODEL DEVELOPMENT

### 3.1) LINEAR REGRESSION MODEL TRAINING AND EVALUATION

In [59]:
linear_model = LinearRegression()

In [60]:
linear_model

In [61]:
# Fit the model to the training data
linear_model.fit(X_train, y_train)

In [62]:
linear_prediction = linear_model.predict(X_test)

In [63]:
linear_prediction

array([1083.83018201, 1096.77303322, 1163.25680759, ..., 1025.94916875,
       1095.6002723 , 1105.74044759], shape=(240000,))

In [64]:
linear_mae = mean_absolute_error(y_test, linear_prediction)

In [65]:
linear_mae

667.2820835292639

In [66]:
linear_rmse = np.sqrt(mean_squared_error(y_test, linear_prediction))

In [67]:
linear_rmse

np.float64(863.2879733346389)

In [68]:
linear_r2 = r2_score(y_test, linear_prediction)

In [69]:
linear_r2

0.0027023955200473626

In [70]:
print("Linear Regression Results (Tuned):")
print(f"RMSE: {linear_rmse}\nMAE: {linear_mae}\nR² Score: {linear_r2}\n")

Linear Regression Results (Tuned):
RMSE: 863.2879733346389
MAE: 667.2820835292639
R² Score: 0.0027023955200473626



### 3.2) DECISION TREE MODEL TRAINING AND EVALUATION

In [71]:
decision_model = DecisionTreeRegressor(max_depth=5, min_samples_split=5, min_samples_leaf=2, random_state=42)

In [72]:
decision_model

In [73]:
decision_model.fit(X_train, y_train)

In [74]:
decision_prediction = decision_model.predict(X_test)

In [75]:
decision_prediction

array([1098.66566162, 1171.30322002, 1125.00030276, ..., 1074.91541061,
        734.85097233, 1074.91541061], shape=(240000,))

In [76]:
decision_mae = mean_absolute_error(y_test, decision_prediction)

In [77]:
decision_mae

654.1521262431141

In [78]:
decision_rmse = np.sqrt(mean_squared_error(y_test, decision_prediction))

In [79]:
decision_rmse

np.float64(853.1388455727017)

In [80]:
decision_r2 = r2_score(y_test, decision_prediction)

In [81]:
decision_r2

0.02601374449099736

In [82]:
print("Decision Tree Results (Tuned):")
print(f"RMSE: {decision_rmse}\nMAE: {decision_mae}\nR² Score: {decision_r2}")

Decision Tree Results (Tuned):
RMSE: 853.1388455727017
MAE: 654.1521262431141
R² Score: 0.02601374449099736


### 3.3) RANDOM FOREST MODEL TRAINING AND EVALUATION

In [83]:
random_model = RandomForestRegressor(n_estimators=100, max_depth=7, min_samples_split=5, min_samples_leaf=2, random_state=42)

In [84]:
random_model

In [85]:
random_model.fit(X_train, y_train)

In [86]:
random_prediction = random_model.predict(X_test)

In [87]:
random_mae = mean_absolute_error(y_test, random_prediction)

In [88]:
random_rmse = np.sqrt(mean_squared_error(y_test, random_prediction))

In [89]:
random_r2 = r2_score(y_test, random_prediction)

In [90]:
print("Random Forest Results:")
print(f"RMSE: {random_rmse}\nMAE: {random_mae}\nR² Score: {random_r2}")

Random Forest Results:
RMSE: 846.3816042319959
MAE: 644.2529314669614
R² Score: 0.041381456723150634


### 3.4) XGBOOST MODEL TRAINING AND EVALUATION

In [91]:
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42)

In [92]:
xgb_model

In [93]:
xgb_model.fit(X_train, y_train)

In [94]:
xgb_prediction = xgb_model.predict(X_test)

In [95]:
xgb_prediction

array([1175.4467, 1211.8829, 1140.1031, ..., 1070.6361, 1269.1866,
       1057.8207], shape=(240000,), dtype=float32)

In [96]:
xgb_mae = mean_absolute_error(y_test, xgb_prediction)

In [97]:
xgb_mae

647.5296768765768

In [98]:
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_prediction))

In [99]:
xgb_rmse

np.float64(846.3178934416417)

In [100]:
xgb_r2 = r2_score(y_test, xgb_prediction)

In [101]:
xgb_r2

0.041525769988161376

In [102]:
print("XGBoost Results:")
print(f"RMSE: {xgb_rmse}\nMAE: {xgb_mae}\nR² Score: {xgb_r2}")

XGBoost Results:
RMSE: 846.3178934416417
MAE: 647.5296768765768
R² Score: 0.041525769988161376


## 4) ML-PIPELINE AND ML-FLOW INTEGRATION 

### 4.1) ML-PIPELINE

In [103]:
pipeline = Pipeline([
    ('model', XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42))
])

In [104]:
pipeline

In [105]:
pipeline.fit(X_train, y_train)

In [106]:
pipeline_prediction = pipeline.predict(X_test)

In [107]:
pipeline_prediction

array([1175.4467, 1211.8829, 1140.1031, ..., 1070.6361, 1269.1866,
       1057.8207], shape=(240000,), dtype=float32)

In [108]:
pipeline_mae = mean_absolute_error(y_test, pipeline_prediction)

In [109]:
pipeline_mae

647.5296768765768

In [110]:
pipeline_rmse = np.sqrt(mean_squared_error(y_test, pipeline_prediction))

In [111]:
pipeline_rmse

np.float64(846.3178934416417)

In [112]:
pipeline_r2 = r2_score(y_test, pipeline_prediction)

In [113]:
pipeline_r2

0.041525769988161376

In [114]:
mlflow.log_param('model_type', 'XGBoost')

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



'XGBoost'

In [115]:
mlflow.log_metric('rmse', pipeline_rmse)

In [116]:
mlflow.log_metric('mae', pipeline_mae)

In [117]:
mlflow.log_metric('r2_score', pipeline_r2)

In [118]:
input_example = X_test[:1]
mlflow.sklearn.log_model(pipeline, 'xgb_model_pipeline', input_example=input_example)

<mlflow.models.model.ModelInfo at 0x1cd0cc88380>

In [119]:
print("XGBoost Pipeline Results:")
print(f"RMSE: {pipeline_rmse}\\nMAE: {pipeline_mae}\\nR² Score: {pipeline_r2}")

XGBoost Pipeline Results:
RMSE: 846.3178934416417\nMAE: 647.5296768765768\nR² Score: 0.041525769988161376


## 5) MODEL DEPLOYMENT WITH STREAMLIT 

In [120]:
latest_version = max([int(v.version) for v in mlflow.MlflowClient().get_latest_versions("xgb_model_pipeline")])

In [121]:
latest_version

4

In [122]:
model = mlflow.sklearn.load_model(f'models:/xgb_model_pipeline/{latest_version}')

In [123]:
model

### CONTINUATION IN A STREAMLIT APP