## SMART PREMIUM - PREDICTING INSURANCE COST WITH MACHINE LEARNING

### INSTALL AND IMPORT REQUIRED PACKAGES

In [82]:
# !pip install pandas numpy streamlit scikit-learn xgboost mlflow matplotlib seaborn optuna

In [3]:
# BASIC LIBRARIES
import pandas as pd
import numpy as np
import streamlit as st
import optuna


# DATA VISUALIZATION
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# IGNORE WARNING MESSAGES
import warnings
warnings.filterwarnings("ignore")

# MODEL TRAINING
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_regression

# REGRESSION MODELS
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor, plot_importance

# MODEL EVALUATION
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# EXPERIEMENT TRACKING
import mlflow
import mlflow.sklearn

## 1) UNDERSTANDING THE DATA FOR TRAINING AND TESTING

### 1.1) LOAD AND EXPLORE THE DATASET

### DIRECTORY FOR TRAIN AND TEST DATA

In [4]:
train_data = "/content/train.csv"

In [5]:
test_data = "/content/test.csv"

#### TRAIN DATA

In [6]:
train_data = pd.read_csv(train_data)
test_data = pd.read_csv(test_data)

In [7]:
train_data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


#### TEST DATA

In [8]:
test_data.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


### 1.2) PERFORM EXPLORATORY DATA ANALYSIS FOR TRAIN AND TEST DATA

In [9]:
train_data.shape

(1200000, 21)

In [10]:
train_data.columns

Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Customer Feedback', 'Smoking Status', 'Exercise Frequency',
       'Property Type', 'Premium Amount'],
      dtype='object')

In [11]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1200000 non-null  int64  
 1   Age                   1181295 non-null  float64
 2   Gender                1200000 non-null  object 
 3   Annual Income         1155051 non-null  float64
 4   Marital Status        1181471 non-null  object 
 5   Number of Dependents  1090328 non-null  float64
 6   Education Level       1200000 non-null  object 
 7   Occupation            841925 non-null   object 
 8   Health Score          1125924 non-null  float64
 9   Location              1200000 non-null  object 
 10  Policy Type           1200000 non-null  object 
 11  Previous Claims       835971 non-null   float64
 12  Vehicle Age           1199994 non-null  float64
 13  Credit Score          1062118 non-null  float64
 14  Insurance Duration    1199999 non-

In [12]:
train_data.describe()

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount
count,1200000.0,1181295.0,1155051.0,1090328.0,1125924.0,835971.0,1199994.0,1062118.0,1199999.0,1200000.0
mean,599999.5,41.14556,32745.22,2.009934,25.61391,1.002689,9.569889,592.9244,5.018219,1102.545
std,346410.3,13.53995,32179.51,1.417338,12.20346,0.98284,5.776189,149.9819,2.594331,864.9989
min,0.0,18.0,1.0,0.0,2.012237,0.0,0.0,300.0,1.0,20.0
25%,299999.8,30.0,8001.0,1.0,15.91896,0.0,5.0,468.0,3.0,514.0
50%,599999.5,41.0,23911.0,2.0,24.57865,1.0,10.0,595.0,5.0,872.0
75%,899999.2,53.0,44634.0,3.0,34.52721,2.0,15.0,721.0,7.0,1509.0
max,1199999.0,64.0,149997.0,4.0,58.97591,9.0,19.0,849.0,9.0,4999.0


In [13]:
train_data.describe(include = "object")

Unnamed: 0,Gender,Marital Status,Education Level,Occupation,Location,Policy Type,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
count,1200000,1181471,1200000,841925,1200000,1200000,1200000,1122176,1200000,1200000,1200000
unique,2,3,4,3,3,3,167381,3,2,4,3
top,Male,Single,Master's,Employed,Suburban,Premium,2020-02-08 15:21:39.134960,Average,Yes,Weekly,House
freq,602571,395391,303818,282750,401542,401846,142,377905,601873,306179,400349


In [14]:
train_data.nunique()

Unnamed: 0,0
id,1200000
Age,47
Gender,2
Annual Income,88593
Marital Status,3
Number of Dependents,5
Education Level,4
Occupation,3
Health Score,532657
Location,3


In [15]:
test_data.shape

(800000, 20)

In [16]:
test_data.columns

Index(['id', 'Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Policy Start Date',
       'Customer Feedback', 'Smoking Status', 'Exercise Frequency',
       'Property Type'],
      dtype='object')

In [17]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    800000 non-null  int64  
 1   Age                   787511 non-null  float64
 2   Gender                800000 non-null  object 
 3   Annual Income         770140 non-null  float64
 4   Marital Status        787664 non-null  object 
 5   Number of Dependents  726870 non-null  float64
 6   Education Level       800000 non-null  object 
 7   Occupation            560875 non-null  object 
 8   Health Score          750551 non-null  float64
 9   Location              800000 non-null  object 
 10  Policy Type           800000 non-null  object 
 11  Previous Claims       557198 non-null  float64
 12  Vehicle Age           799997 non-null  float64
 13  Credit Score          708549 non-null  float64
 14  Insurance Duration    799998 non-null  float64
 15  

In [18]:
test_data.describe()

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration
count,800000.0,787511.0,770140.0,726870.0,750551.0,557198.0,799997.0,708549.0,799998.0
mean,1600000.0,41.13644,32803.871471,2.009337,25.613036,1.004873,9.571891,592.904749,5.018949
std,230940.3,13.537829,32201.063749,1.415241,12.206882,0.982803,5.7722,150.116374,2.593759
min,1200000.0,18.0,2.0,0.0,1.646561,0.0,0.0,300.0,1.0
25%,1400000.0,30.0,8048.0,1.0,15.917353,0.0,5.0,468.0,3.0
50%,1600000.0,41.0,23981.0,2.0,24.580164,1.0,10.0,595.0,5.0
75%,1799999.0,53.0,44660.0,3.0,34.517766,2.0,15.0,721.0,7.0
max,1999999.0,64.0,149997.0,4.0,57.957351,9.0,19.0,849.0,9.0


In [19]:
test_data.describe(include = "object")

Unnamed: 0,Gender,Marital Status,Education Level,Occupation,Location,Policy Type,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
count,800000,787664,800000,560875,800000,800000,800000,747724,800000,800000,800000
unique,2,3,4,3,3,3,158776,3,2,4,3
top,Male,Single,Master's,Employed,Suburban,Premium,2022-08-30 15:21:39.134960,Average,Yes,Weekly,House
freq,401089,263705,202552,188574,267190,267629,98,251217,401859,204514,267151


In [20]:
test_data.nunique()

Unnamed: 0,0
id,800000
Age,47
Gender,2
Annual Income,80716
Marital Status,3
Number of Dependents,5
Education Level,4
Occupation,3
Health Score,388702
Location,3


### HANDLING MISSING VALUES

In [21]:
train_data.isnull().sum()

Unnamed: 0,0
id,0
Age,18705
Gender,0
Annual Income,44949
Marital Status,18529
Number of Dependents,109672
Education Level,0
Occupation,358075
Health Score,74076
Location,0


In [22]:
test_data.isnull().sum()

Unnamed: 0,0
id,0
Age,12489
Gender,0
Annual Income,29860
Marital Status,12336
Number of Dependents,73130
Education Level,0
Occupation,239125
Health Score,49449
Location,0


In [23]:
numerical_columns = train_data.select_dtypes(include=['number'])

In [24]:
numerical_columns

Unnamed: 0,id,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount
0,0,19.0,10049.0,1.0,22.598761,2.0,17.0,372.0,5.0,2869.0
1,1,39.0,31678.0,3.0,15.569731,1.0,12.0,694.0,2.0,1483.0
2,2,23.0,25602.0,3.0,47.177549,1.0,14.0,,3.0,567.0
3,3,21.0,141855.0,2.0,10.938144,1.0,0.0,367.0,1.0,765.0
4,4,21.0,39651.0,1.0,20.376094,0.0,8.0,598.0,4.0,2022.0
...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,27316.0,0.0,13.772907,,5.0,372.0,3.0,1303.0
1199996,1199996,54.0,35786.0,,11.483482,,10.0,597.0,4.0,821.0
1199997,1199997,19.0,51884.0,0.0,14.724469,0.0,19.0,,6.0,371.0
1199998,1199998,55.0,,1.0,18.547381,1.0,7.0,407.0,4.0,596.0


## 2) DATA PREPROCESSING

### 2.1) HANDLING MISSING VALUES

### FILL MISSING VALUES IN NUMERICAL COLUMNS WITH THE MEDIAN

In [25]:
train_data.fillna(train_data.median(numeric_only=True), inplace=True)

In [26]:
test_data.fillna(test_data.median(numeric_only=True), inplace=True)

### FILL MISSING VALUES IN CATEGORICAL COLUMNS WITH THE MODE

In [27]:
train_data.fillna(train_data.mode().iloc[0], inplace=True)

In [28]:
test_data.fillna(test_data.mode().iloc[0], inplace=True)

In [29]:
train_data.isnull().mean()

Unnamed: 0,0
id,0.0
Age,0.0
Gender,0.0
Annual Income,0.0
Marital Status,0.0
Number of Dependents,0.0
Education Level,0.0
Occupation,0.0
Health Score,0.0
Location,0.0


In [30]:
test_data.isnull().mean()

Unnamed: 0,0
id,0.0
Age,0.0
Gender,0.0
Annual Income,0.0
Marital Status,0.0
Number of Dependents,0.0
Education Level,0.0
Occupation,0.0
Health Score,0.0
Location,0.0


In [31]:
train_data.isnull().sum().sum()

0

In [32]:
test_data.isnull().sum().sum()

0

### 2.2) CONVERT CATEGORICAL VARIABLES TO NUMERICAL FORM

### INITIALIZE LABEL-ENCODER

In [33]:
label_encoder = LabelEncoder()

In [34]:
label_encoder

### CHECK FOR MISSING OR CORRUPT ENTRIES

In [35]:
print(train_data["Policy Start Date"].head(10))
print(train_data["Policy Start Date"].isna().sum())
print(train_data["Policy Start Date"].str.contains(":").sum())


0    2023-12-23 15:21:39.134960
1    2023-06-12 15:21:39.111551
2    2023-09-30 15:21:39.221386
3    2024-06-12 15:21:39.226954
4    2021-12-01 15:21:39.252145
5    2022-05-20 15:21:39.207847
6    2020-02-21 15:21:39.219432
7    2022-08-08 15:21:39.181605
8    2020-12-14 15:21:39.198406
9    2020-08-02 15:21:39.144722
Name: Policy Start Date, dtype: object
0
1200000


### REMOVE INCOMPLETE VALUES USING REGEX


In [36]:
train_data["Policy Start Date"] = train_data["Policy Start Date"].str.extract(r'(\d{4}-\d{2}-\d{2})')
test_data["Policy Start Date"] = test_data["Policy Start Date"].str.extract(r'(\d{4}-\d{2}-\d{2})')


### CONVERT TO DATETIME WHILE HANDLING ERRORS

In [37]:
train_data["Policy Start Date"] = pd.to_datetime(train_data["Policy Start Date"], errors='coerce')
test_data["Policy Start Date"] = pd.to_datetime(test_data["Policy Start Date"], errors='coerce')


### DROP ROWS

In [38]:
train_data.dropna(subset=["Policy Start Date"], inplace=True)
test_data.dropna(subset=["Policy Start Date"], inplace=True)


In [39]:
label_encoders = {}
cat_features = ["Customer Feedback", "Smoking Status", "Exercise Frequency", "Property Type"]

for col in cat_features:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    test_data[col] = le.transform(test_data[col])
    label_encoders[col] = le

train_data["Policy Start Date"] = pd.to_datetime(train_data["Policy Start Date"])
test_data["Policy Start Date"] = pd.to_datetime(test_data["Policy Start Date"])

train_data["Policy Year"] = train_data["Policy Start Date"].dt.year
train_data["Policy Month"] = train_data["Policy Start Date"].dt.month
train_data["Policy Day"] = train_data["Policy Start Date"].dt.day

test_data["Policy Year"] = test_data["Policy Start Date"].dt.year
test_data["Policy Month"] = test_data["Policy Start Date"].dt.month
test_data["Policy Day"] = test_data["Policy Start Date"].dt.day

train_data.drop(columns=["Policy Start Date"], inplace=True)
test_data.drop(columns=["Policy Start Date"], inplace=True)

In [40]:
label_encoder.fit_transform(train_data[col])

array([2, 2, 2, ..., 1, 0, 2])

In [41]:
label_encoder.fit_transform(test_data[col])

array([2, 0, 1, ..., 0, 1, 2])

In [42]:
categorical_cols = train_data.select_dtypes(include=['object']).columns

In [43]:
categorical_cols

Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
       'Policy Type'],
      dtype='object')

In [44]:
categorical_cols = train_data.select_dtypes(include=['object']).columns
categorical_cols = categorical_cols[categorical_cols != "Policy Start Date"]

In [45]:
categorical_cols

Index(['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location',
       'Policy Type'],
      dtype='object')

In [46]:
train_data.dtypes

Unnamed: 0,0
id,int64
Age,float64
Gender,object
Annual Income,float64
Marital Status,object
Number of Dependents,float64
Education Level,object
Occupation,object
Health Score,float64
Location,object


In [47]:
test_data.dtypes

Unnamed: 0,0
id,int64
Age,float64
Gender,object
Annual Income,float64
Marital Status,object
Number of Dependents,float64
Education Level,object
Occupation,object
Health Score,float64
Location,object


In [48]:
for col in ["Customer Feedback", "Smoking Status", "Exercise Frequency", "Property Type"]:
    print(f"{col}: {train_data[col].unique()}")

Customer Feedback: [2 0 1]
Smoking Status: [0 1]
Exercise Frequency: [3 1 0 2]
Property Type: [2 0 1]


In [49]:
for col in label_encoders:
    print(f"{col}: {dict(zip(label_encoders[col].classes_, range(len(label_encoders[col].classes_))))}")

Customer Feedback: {'Average': 0, 'Good': 1, 'Poor': 2}
Smoking Status: {'No': 0, 'Yes': 1}
Exercise Frequency: {'Daily': 0, 'Monthly': 1, 'Rarely': 2, 'Weekly': 3}
Property Type: {'Apartment': 0, 'Condo': 1, 'House': 2}


### 2.3)  SPLIT THE DATA INTO TRAINING AND EVALUATION SETS

In [50]:
X = train_data.drop(columns=['Premium Amount'])
y = train_data['Premium Amount']

In [51]:
X

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Policy Year,Policy Month,Policy Day
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,17.0,372.0,5.0,2,0,3,2,2023,12,23
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,Employed,15.569731,Rural,...,12.0,694.0,2.0,0,1,1,2,2023,6,12
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,14.0,595.0,3.0,1,1,3,2,2023,9,30
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,Employed,10.938144,Rural,...,0.0,367.0,1.0,2,1,0,0,2024,6,12
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,8.0,598.0,4.0,2,1,3,2,2021,12,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,Female,27316.0,Married,0.0,Master's,Unemployed,13.772907,Urban,...,5.0,372.0,3.0,2,0,0,0,2023,5,3
1199996,1199996,54.0,Male,35786.0,Divorced,2.0,Master's,Self-Employed,11.483482,Rural,...,10.0,597.0,4.0,2,0,3,0,2022,9,10
1199997,1199997,19.0,Male,51884.0,Divorced,0.0,Master's,Employed,14.724469,Suburban,...,19.0,595.0,6.0,1,0,1,1,2021,5,25
1199998,1199998,55.0,Male,23911.0,Single,1.0,PhD,Employed,18.547381,Suburban,...,7.0,407.0,4.0,2,0,0,0,2021,9,19


In [52]:
y

Unnamed: 0,Premium Amount
0,2869.0
1,1483.0
2,567.0
3,765.0
4,2022.0
...,...
1199995,1303.0
1199996,821.0
1199997,371.0
1199998,596.0


In [53]:
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
X_train

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Policy Year,Policy Month,Policy Day
404339,404339,45.0,Male,91315.0,Divorced,3.0,Bachelor's,Employed,43.926883,Urban,...,1.0,311.0,2.0,2,0,2,2,2024,8,11
748487,748487,51.0,Female,5280.0,Married,1.0,PhD,Employed,41.854227,Rural,...,1.0,634.0,8.0,2,1,1,1,2020,6,20
435951,435951,48.0,Female,41318.0,Divorced,0.0,Master's,Employed,51.750192,Urban,...,12.0,759.0,9.0,2,1,2,1,2021,2,5
311284,311284,43.0,Female,30372.0,Divorced,0.0,PhD,Unemployed,20.666786,Urban,...,8.0,460.0,1.0,0,0,1,1,2022,11,11
318790,318790,18.0,Male,12891.0,Married,2.0,Bachelor's,Employed,36.995102,Urban,...,18.0,831.0,3.0,2,1,1,0,2021,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,110268,30.0,Female,2789.0,Single,4.0,High School,Employed,15.965779,Suburban,...,18.0,328.0,5.0,1,0,2,0,2020,10,8
259178,259178,35.0,Male,72512.0,Divorced,2.0,Bachelor's,Employed,23.914243,Urban,...,16.0,338.0,9.0,2,0,0,2,2022,1,15
131932,131932,43.0,Male,85463.0,Married,0.0,Bachelor's,Employed,25.151808,Urban,...,8.0,849.0,3.0,0,1,0,1,2023,9,30
671155,671155,49.0,Male,5908.0,Divorced,0.0,Bachelor's,Employed,41.637977,Rural,...,2.0,444.0,1.0,1,1,1,1,2019,10,11


In [55]:
X_eval

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Vehicle Age,Credit Score,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Policy Year,Policy Month,Policy Day
372605,372605,52.0,Female,5538.0,Married,2.0,Master's,Employed,8.093145,Rural,...,18.0,497.0,3.0,0,0,3,0,2020,2,10
551204,551204,60.0,Female,39711.0,Married,4.0,Bachelor's,Employed,24.578648,Suburban,...,3.0,340.0,3.0,2,0,3,0,2020,10,16
240320,240320,58.0,Female,2364.0,Married,2.0,Bachelor's,Employed,34.452482,Urban,...,19.0,595.0,7.0,2,0,1,2,2021,1,2
1047361,1047361,39.0,Male,9094.0,Divorced,0.0,Bachelor's,Employed,26.241661,Rural,...,17.0,761.0,5.0,2,1,2,1,2022,5,11
555362,555362,47.0,Male,43189.0,Divorced,2.0,PhD,Employed,26.691921,Suburban,...,4.0,319.0,4.0,2,1,3,0,2020,7,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458271,458271,46.0,Male,32875.0,Divorced,1.0,Bachelor's,Employed,32.140638,Urban,...,9.0,627.0,6.0,1,1,1,2,2024,3,23
1124744,1124744,39.0,Male,46820.0,Single,1.0,PhD,Employed,6.990761,Suburban,...,14.0,614.0,5.0,0,0,0,0,2021,9,5
720454,720454,18.0,Female,27596.0,Single,1.0,Bachelor's,Self-Employed,14.538865,Rural,...,6.0,678.0,3.0,2,1,1,1,2024,7,21
256458,256458,30.0,Male,110140.0,Divorced,0.0,Bachelor's,Employed,24.578648,Rural,...,8.0,524.0,6.0,0,0,0,2,2020,1,11


In [56]:
y_train

Unnamed: 0,Premium Amount
404339,2386.0
748487,285.0
435951,654.0
311284,1095.0
318790,1334.0
...,...
110268,538.0
259178,819.0
131932,403.0
671155,542.0


In [57]:
y_eval

Unnamed: 0,Premium Amount
372605,2742.0
551204,1347.0
240320,2196.0
1047361,684.0
555362,1714.0
...,...
458271,1286.0
1124744,24.0
720454,971.0
256458,562.0


In [58]:
print(X_train.shape, X_eval.shape)
print(y_train.shape, y_eval.shape)

(960000, 22) (240000, 22)
(960000,) (240000,)


### 2.4) CONVERT CATEGORICAL TO NUMERICAL FORM

In [59]:
print("X_train Data Types:\n", X_train.dtypes)

X_train Data Types:
 id                        int64
Age                     float64
Gender                   object
Annual Income           float64
Marital Status           object
Number of Dependents    float64
Education Level          object
Occupation               object
Health Score            float64
Location                 object
Policy Type              object
Previous Claims         float64
Vehicle Age             float64
Credit Score            float64
Insurance Duration      float64
Customer Feedback         int64
Smoking Status            int64
Exercise Frequency        int64
Property Type             int64
Policy Year               int32
Policy Month              int32
Policy Day                int32
dtype: object


In [60]:
print("X_eval Data Types:\n", X_eval.dtypes)

X_eval Data Types:
 id                        int64
Age                     float64
Gender                   object
Annual Income           float64
Marital Status           object
Number of Dependents    float64
Education Level          object
Occupation               object
Health Score            float64
Location                 object
Policy Type              object
Previous Claims         float64
Vehicle Age             float64
Credit Score            float64
Insurance Duration      float64
Customer Feedback         int64
Smoking Status            int64
Exercise Frequency        int64
Property Type             int64
Policy Year               int32
Policy Month              int32
Policy Day                int32
dtype: object


### CONVERT "GENDER" TO NUMERIC VALUES

In [61]:
X_eval['Gender'] = X_eval['Gender'].astype(str)

In [62]:
X_eval['Gender']

Unnamed: 0,Gender
372605,Female
551204,Female
240320,Female
1047361,Male
555362,Male
...,...
458271,Male
1124744,Male
720454,Female
256458,Male


In [63]:
gender_map = {'Female': 0, 'Male': 1}

X_train['Gender'] = X_train['Gender'].map(gender_map)
X_eval['Gender'] = X_eval['Gender'].map(gender_map)

### LABEL ENCODING

In [64]:
label_encoder = LabelEncoder()

X_train['Gender'] = label_encoder.fit_transform(X_train['Gender'])
X_eval['Gender'] = label_encoder.transform(X_eval['Gender'])

print("Unique values in X_train['Gender']:", X_train['Gender'].unique())
print("Unique values in X_eval['Gender']:", X_eval['Gender'].unique())

Unique values in X_train['Gender']: [1 0]
Unique values in X_eval['Gender']: [0 1]


In [65]:
for col in X_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_eval[col] = le.transform(X_eval[col])

print("Final X_train Data Types:\n", X_train.dtypes)
print("Final X_eval Data Types:\n", X_eval.dtypes)

Final X_train Data Types:
 id                        int64
Age                     float64
Gender                    int64
Annual Income           float64
Marital Status            int64
Number of Dependents    float64
Education Level           int64
Occupation                int64
Health Score            float64
Location                  int64
Policy Type               int64
Previous Claims         float64
Vehicle Age             float64
Credit Score            float64
Insurance Duration      float64
Customer Feedback         int64
Smoking Status            int64
Exercise Frequency        int64
Property Type             int64
Policy Year               int32
Policy Month              int32
Policy Day                int32
dtype: object
Final X_eval Data Types:
 id                        int64
Age                     float64
Gender                    int64
Annual Income           float64
Marital Status            int64
Number of Dependents    float64
Education Level           int64
Occup

## 3) MODEL DEVELOPMENT

### 3.1) CHOOSE REGRESSION MODELS

### 3.1.1) TRAIN AND EVALUATE LINEAR REGRESSION MODEL

### INITIALIZE LINEAR-REGRESSION

In [None]:
linear_model = LinearRegression()

In [None]:
linear_model

### TRAIN THE MODEL

In [None]:
linear_model.fit(X_train, y_train)

### MAKE PREDICTIONS

In [None]:
y_pred = linear_model.predict(X_eval)

In [None]:
y_pred

array([1075.13723446, 1102.51333794, 1154.7243323 , ..., 1022.95763946,
       1086.29255632, 1101.21550951])

### EVALUATE THE MODEL

In [None]:
rmse = np.sqrt(mean_squared_error(y_eval, y_pred))
mae = mean_absolute_error(y_eval, y_pred)
r2 = r2_score(y_eval, y_pred)

In [None]:
print("📌 **Linear Regression Results:**")
print(f"🔹 RMSE: {rmse:.4f}")
print(f"🔹 MAE: {mae:.4f}")
print(f"🔹 R² Score: {r2:.4f}")

📌 **Linear Regression Results:**
🔹 RMSE: 863.2714
🔹 MAE: 667.2796
🔹 R² Score: 0.0027


### 3.1.2) TRAIN AND EVALUATE DECISION TREE MODEL

### INITIALIZE DECISION-TREE

In [None]:
tree_model = DecisionTreeRegressor(
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42
)

In [None]:
tree_model

### TRAIN THE MODEL

In [None]:
tree_model.fit(X_train, y_train)

### MAKE PREDICTIONS

In [None]:
y_pred_tree = tree_model.predict(X_eval)

In [None]:
y_pred_tree

array([1098.06183253, 1180.56870086, 1131.62388654, ..., 1065.55193936,
        851.37219751, 1065.55193936])

### EVALUATE THE MODEL

In [None]:
rmse_tree = np.sqrt(mean_squared_error(y_eval, y_pred_tree))
mae_tree = mean_absolute_error(y_eval, y_pred_tree)
r2_tree = r2_score(y_eval, y_pred_tree)

In [None]:
print("📌 **Decision Tree Regression Results:**")
print(f"🔹 RMSE: {rmse_tree:.4f}")
print(f"🔹 MAE: {mae_tree:.4f}")
print(f"🔹 R² Score: {r2_tree:.4f}")

📌 **Decision Tree Regression Results:**
🔹 RMSE: 848.4685
🔹 MAE: 645.5499
🔹 R² Score: 0.0366


### 3.1.3) TRAIN AND EVALUATE RANDOM FOREST REGRESSOR

### INITIALIZE RANDOM FOREST

In [None]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
rf_model

### TRAIN THE MODEL

In [None]:
rf_model.fit(X_train, y_train)

### FEATURE IMPORTANCE

In [None]:
final_features = X_train.columns

feature_importance = pd.DataFrame({
    'Feature': final_features,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print(feature_importance.head(10))


               Feature  Importance
8         Health Score    0.121906
3        Annual Income    0.119410
0                   id    0.106458
13        Credit Score    0.093029
1                  Age    0.075948
21          Policy Day    0.069136
12         Vehicle Age    0.060383
20        Policy Month    0.048996
14  Insurance Duration    0.043125
19         Policy Year    0.037201


### FEATURE SELECTION

In [None]:
selected_features = feature_importance["Feature"].head(10).tolist()
X_train_selected = X_train[selected_features]
X_evall_selected = X_eval[selected_features]


### MAKE PREDICTIONS

In [None]:
y_pred_rf = rf_model.predict(X_eval)

In [None]:
y_pred_rf

array([1328.46, 1190.83, 1470.97, ...,  992.21, 1583.91, 1251.64])

### EVALUATE THE MODEL

In [None]:
rmse = np.sqrt(mean_squared_error(y_eval, y_pred_rf))
mae = mean_absolute_error(y_eval, y_pred_rf)
r2 = r2_score(y_eval, y_pred_rf)

In [None]:
print(f"📌 **Random Forest Regression Results:**")
print(f"🔹 RMSE: {rmse:.4f}")
print(f"🔹 MAE: {mae:.4f}")
print(f"🔹 R² Score: {r2:.4f}")

📌 **Random Forest Regression Results:**
🔹 RMSE: 852.9248
🔹 MAE: 655.7569
🔹 R² Score: 0.0265


### 3.1.4) TRAIN AND EVALUATE XGBOOST REGRESSOR

### INITIALIZE XGBOOST

In [127]:
xgb_model = XGBRegressor(
    n_estimators=1000,
    max_depth=15,
    learning_rate=0.02,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=3,
    reg_alpha=1,
    random_state=77
)

In [67]:
xgb_model

### TRAIN THE MODEL

In [68]:
xgb_model.fit(X_train, y_train)

### MAKE PREDICTIONS

In [77]:
y_pred_xgb = xgb_model.predict(X_eval)

In [78]:
y_pred_xgb

array([1572.1481, 1259.4039, 1551.0985, ...,  922.3302, 1621.8533,
       1093.6637], dtype=float32)

### EVALUATE THE MODEL

In [79]:
rmse = np.sqrt(mean_squared_error(y_eval, y_pred_xgb))
mae = mean_absolute_error(y_eval, y_pred_xgb)
r2 = r2_score(y_eval, y_pred_xgb)

In [80]:
print(f"📌 **XGBRegressor Results:**")
print(f"🔹 RMSE: {rmse:.4f}")
print(f"🔹 MAE: {mae:.4f}")
print(f"🔹 R² Score: {r2:.4f}")

📌 **XGBRegressor Results:**
🔹 RMSE: 848.8712
🔹 MAE: 644.7293
🔹 R² Score: 0.0357


## 4) ML PIPELINE & MLFLOW INTEGRATION

### 4.1) ML PIPELINE

In [99]:
pipeline = Pipeline([
    ('model', XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42))
])

In [100]:
pipeline

In [101]:
pipeline.fit(X_train, y_train)

In [104]:
pipeline_prediction = pipeline.predict(X_eval)

In [105]:
pipeline_prediction

array([1193.758 , 1214.3677, 1152.0289, ..., 1062.3887, 1296.7191,
       1045.9384], dtype=float32)

In [110]:
pipeline_mae = mean_absolute_error(y_eval, pipeline_prediction)

In [111]:
pipeline_mae

647.2401719552358

In [112]:
pipeline_rmse = np.sqrt(mean_squared_error(y_eval, pipeline_prediction))

In [113]:
pipeline_rmse

846.020294424748

In [114]:
pipeline_r2 = r2_score(y_eval, pipeline_prediction)

In [115]:
pipeline_r2

0.04219972675959105

### 4.2) TRACK EXPERIMENTS WITH MLFLOW

In [116]:
mlflow.log_param('model_type', 'XGBoost')

'XGBoost'

In [117]:
mlflow.log_metric('rmse', pipeline_rmse)

In [118]:
mlflow.log_metric('mae', pipeline_mae)

In [119]:
mlflow.log_metric('r2_score', pipeline_r2)

In [121]:
input_example = X_eval[:1]
mlflow.sklearn.log_model(pipeline, 'xgb_model_pipeline', input_example=input_example)

<mlflow.models.model.ModelInfo at 0x78921f8dcf90>

In [122]:
print("XGBoost Pipeline Results:")
print(f"RMSE: {pipeline_rmse}\\nMAE: {pipeline_mae}\\nR² Score: {pipeline_r2}")

XGBoost Pipeline Results:
RMSE: 846.020294424748\nMAE: 647.2401719552358\nR² Score: 0.04219972675959105


## 5) MODEL DEPLOYMENT WITH STREAMLIT