# Importing the Required Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

import numpy as np

import pickle

import mlflow.sklearn

import mlflow.tracking

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import logging

In [3]:
import os

print(os.getcwd())

c:\Users\sarav\Smart_Premium\Smart_premium_ML\Scripts


# Accessing Pipeline with test dataset

In [4]:
test_file = r"C:\Users\sarav\Smart_Premium\Smart_premium_ML\Research_Data\test.csv"

test_data = pd.read_csv(test_file)

test_data

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,1999995,50.0,Female,38782.0,Married,1.0,Bachelor's,,14.498639,Rural,Premium,,8.0,309.0,2.0,2021-07-09 15:21:39.184157,Average,Yes,Daily,Condo
799996,1999996,,Female,73462.0,Single,0.0,Master's,,8.145748,Rural,Basic,2.0,0.0,,2.0,2023-03-28 15:21:39.250151,Good,No,Daily,Apartment
799997,1999997,26.0,Female,35178.0,Single,0.0,Master's,Employed,6.636583,Urban,Comprehensive,,10.0,,6.0,2019-09-30 15:21:39.132191,Poor,No,Monthly,Apartment
799998,1999998,34.0,Female,45661.0,Single,3.0,Master's,,15.937248,Urban,Premium,2.0,17.0,467.0,7.0,2022-05-09 15:21:39.253660,Average,No,Weekly,Condo


In [5]:
test_data.drop(['id','Policy Start Date'], axis=1, inplace=True)

In [6]:
for col in test_data.select_dtypes(include=['int64', 'float64']).columns:
    test_data[col].fillna(test_data[col].mean(), inplace=True)

for col in test_data.select_dtypes(include='object').columns:
    test_data[col].fillna(test_data[col].mode()[0])

In [7]:
def age_category(data):
    if 18 < data <= 30: return '18-30'
    elif 30 < data <= 40: return '31-40'
    elif 40 < data <= 50: return '41-50'
    elif 50 < data <= 64: return '51-64'
    else: return '<64'

In [8]:
def dependent_category(data):
    if data == 0: return '0'
    elif 0 < data <= 2: return '0-2'
    elif 2 < data <= 3: return '2-3'
    else: return '<3'

In [9]:
def health_category(data):
    if 0 < data <= 15: return '0-15'
    elif 15 < data <= 25: return '15-25'
    elif 25 < data <= 35: return '15-35'
    else: return '<35'

In [10]:
def claims(data):
    if 0 < data <= 1: return '0-1'
    elif 1 < data <= 2: return '1-2'
    else: return '<2'

In [11]:
def vehicle(data):
    if 0 < data <= 5: return '0-5'
    elif 5 < data <= 10: return '5-10'
    elif 10 < data <= 20: return '10-20'
    else: return '<20'

In [12]:
def credit(data):
    if 0 < data <= 300: return '0-300'
    elif 300 < data <= 600: return '300-600'
    elif 600 < data < 800: return '600-800'
    else: return '<800'

In [13]:
def insurance(data):
    if 0 < data <= 3: return '0-3'
    elif 3 < data <= 6: return '3-6'
    elif 6 < data < 9: return '6-9'
    else: return '<9'

In [14]:
test_data['Age_Group'] = test_data['Age'].apply(age_category)
test_data['Dependent_Group'] = test_data['Number of Dependents'].apply(dependent_category)
test_data['Health_Group'] = test_data['Health Score'].apply(health_category)
test_data['Prev_Claims_Group'] = test_data['Previous Claims'].apply(claims)
test_data['Vehicle_Group'] = test_data['Vehicle Age'].apply(vehicle)
test_data['Credit_Group'] = test_data['Credit Score'].apply(credit)
test_data['Insurance_Group'] = test_data['Insurance Duration'].apply(insurance)

In [15]:
mappings = {
    "Education Level": {"High School": 0, "Bachelor's": 1, "Master's": 2, "PhD": 3},
    "Customer Feedback": {"Poor": 0, "Average": 1, "Good": 2},
    "Exercise Frequency": {"Rarely": 0, "Weekly": 1, "Monthly": 2, "Daily": 3},
    "Policy Type": {"Basic": 0, "Comprehensive": 1, "Premium": 2}
}

In [17]:
test_data = test_data.replace(mappings).infer_objects(copy=False)

test_data

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,...,Smoking Status,Exercise Frequency,Property Type,Age_Group,Dependent_Group,Health_Group,Prev_Claims_Group,Vehicle_Group,Credit_Group,Insurance_Group
0,28.00000,Female,2310.0,,4.0,1,Self-Employed,7.657981,Rural,0,...,Yes,1,House,18-30,<3,0-15,1-2,10-20,300-600,0-3
1,31.00000,Female,126031.0,Married,2.0,2,Self-Employed,13.381379,Suburban,2,...,Yes,0,Apartment,31-40,0-2,0-15,1-2,10-20,300-600,6-9
2,47.00000,Female,17092.0,Divorced,0.0,3,Unemployed,24.354527,Urban,1,...,Yes,2,Condo,41-50,0,15-25,1-2,10-20,<800,<9
3,28.00000,Female,30424.0,Divorced,3.0,3,Self-Employed,5.136225,Suburban,1,...,Yes,3,House,18-30,2-3,0-15,0-1,0-5,600-800,3-6
4,24.00000,Male,10863.0,Divorced,2.0,0,Unemployed,11.844155,Suburban,2,...,No,1,House,18-30,0-2,0-15,1-2,10-20,600-800,6-9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
799995,50.00000,Female,38782.0,Married,1.0,1,,14.498639,Rural,2,...,Yes,3,Condo,41-50,0-2,0-15,1-2,5-10,300-600,0-3
799996,41.13644,Female,73462.0,Single,0.0,2,,8.145748,Rural,0,...,No,3,Apartment,41-50,0,0-15,1-2,<20,300-600,0-3
799997,26.00000,Female,35178.0,Single,0.0,2,Employed,6.636583,Urban,1,...,No,2,Apartment,18-30,0,0-15,1-2,5-10,300-600,3-6
799998,34.00000,Female,45661.0,Single,3.0,2,,15.937248,Urban,2,...,No,1,Condo,31-40,2-3,15-25,1-2,10-20,300-600,6-9


In [18]:
columns_to_encode = test_data[['Age_Group', 'Dependent_Group', 'Health_Group', 'Prev_Claims_Group',
                               'Vehicle_Group', 'Credit_Group', 'Insurance_Group', 'Gender', 'Marital Status',
                               'Occupation', 'Location', 'Smoking Status', 'Property Type']]

In [19]:
le = LabelEncoder()
for col in columns_to_encode.columns:
    test_data[col] = le.fit_transform(test_data[col])
    

In [22]:
encoded_test_data = pd.DataFrame({
    'Age': test_data['Age_Group'],
    'Gender': test_data['Gender'],
    'Annual Income': test_data['Annual Income'],
    'Marital Status': test_data['Marital Status'],
    'Number of Dependents': test_data['Dependent_Group'],
    'Education Level': test_data['Education Level'],
    'Occupation': test_data['Occupation'],
    'Health Score': test_data['Health_Group'],
    'Location': test_data['Location'],
    'Policy Type': test_data['Policy Type'],
    'Previous Claims': test_data['Prev_Claims_Group'],
    'Vehicle Age': test_data['Vehicle_Group'],
    'Credit Score': test_data['Credit_Group'],
    'Insurance Duration': test_data['Insurance_Group'],
    'Customer Feedback': test_data['Customer Feedback'],
    'Smoking Status': test_data['Smoking Status'],
    'Exercise Frequency': test_data['Exercise Frequency'],
    'Property Type': test_data['Property Type']
})

In [23]:
def log_transform(data, columns_to_transform):
    for col in columns_to_transform:
        data[f'{col}_log'] = np.log1p(data[col])
        data.drop(columns=[col], inplace=True)  
        data.rename(columns={f'{col}_log': col}, inplace=True)  
    
    return data

In [24]:
transformed_data = log_transform(encoded_test_data, ['Annual Income'])

In [25]:
def scaling(data, columns_to_transform):
    scale = MinMaxScaler()
    for col in columns_to_transform:
        data[f'{col}_log'] = scale.fit_transform(data[[col]])
        data.drop(columns=[col], inplace=True)  
        data.rename(columns={f'{col}_log': col}, inplace=True) 

In [26]:
scaled_data = scaling(transformed_data, ['Annual Income'])

# Model Prediction

In [27]:
with open("C:\\Users\\sarav\\Smart_Premium\\Smart_premium_ML\\pickles\\best_model.pkl", "rb") as file:
    model = pickle.load(file)

model

In [28]:
# Load trained model

with open("C:\\Users\\sarav\\Smart_Premium\\Smart_premium_ML\\pickles\\best_model.pkl", "rb") as file:
    model = pickle.load(file)

# Retrieve expected feature names

expected_features = model.feature_names_in_  
print("Expected features:", expected_features)

Expected features: ['Age' 'Gender' 'Annual Income' 'Marital Status' 'Number of Dependents'
 'Education Level' 'Occupation' 'Health Score' 'Location' 'Policy Type'
 'Previous Claims' 'Vehicle Age' 'Credit Score' 'Insurance Duration'
 'Customer Feedback' 'Smoking Status' 'Exercise Frequency' 'Property Type']


In [29]:
# Ensure test data contains only the expected features

encoded_test_data = encoded_test_data[expected_features]

# Verify feature alignment

print("Test Data Columns (Before Alignment):", test_data.columns)
print("Test Data Columns (After Alignment):", encoded_test_data.columns)

Test Data Columns (Before Alignment): Index(['Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type', 'Age_Group',
       'Dependent_Group', 'Health_Group', 'Prev_Claims_Group', 'Vehicle_Group',
       'Credit_Group', 'Insurance_Group'],
      dtype='object')
Test Data Columns (After Alignment): Index(['Age', 'Gender', 'Annual Income', 'Marital Status',
       'Number of Dependents', 'Education Level', 'Occupation', 'Health Score',
       'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age',
       'Credit Score', 'Insurance Duration', 'Customer Feedback',
       'Smoking Status', 'Exercise Frequency', 'Property Type'],
      dtype='object')


In [30]:
# Identify missing features

missing_features = set(expected_features) - set(encoded_test_data.columns)
if missing_features:
    print("Missing features:", missing_features)
    for feature in missing_features:
        encoded_test_data[feature] = 0  

In [37]:
predictions = model.predict(encoded_test_data)

test_data['Predicted_Premium_Amount'] = predictions
test_data.to_csv("C:\\Users\\sarav\\Smart_Premium\\Smart_premium_ML\\Research_Data\\Test_Predictions_All.csv", index=False)


In [32]:
pickle_path = "C:\\Users\\sarav\\Smart_Premium\\Smart_premium_ML\\pickles\\test_predictions.pkl"
with open(pickle_path, "wb") as file:
    pickle.dump(predictions, file)

print('test_predictions.pkl saved successfully...')

test_predictions.pkl saved successfully...


In [29]:
# To start you mlflow server -> use "mlflow server --host localhost --port 5005" in your terminal  P.S: You always forget the Crucial step

# ML Flow 

In [33]:
import pandas as pd
import pickle
import mlflow.sklearn
import logging
import warnings

In [None]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

client = mlflow.tracking.MlflowClient()
client._request_max_retries = 5  
client._request_timeout = 60

predictions = model.predict(encoded_test_data).tolist()

pd.DataFrame(predictions, columns=["Predicted Premium"]).to_csv("Test_Predictions.csv", index=False)

logging.getLogger("mlflow").setLevel(logging.ERROR)

with mlflow.start_run():
    mlflow.sklearn.log_model(model, "Insurance_Premium_Model")
    mlflow.log_params({"model_name": model.__class__.__name__})
    mlflow.log_artifact("Test_Predictions.csv")
    
    for i in range(min(5, len(predictions))): 
        mlflow.log_metric(f"Predicted Premium {i}", predictions[i])

print("Predictions logged in MLflow and saved to Test_Predictions.csv.")

🏃 View run rare-sloth-474 at: http://127.0.0.1:5000/#/experiments/654137093132497434/runs/7cd392ba520c44cab1d25ecf1431e97e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/654137093132497434
Predictions logged in MLflow and saved to Test_Predictions.csv.


# Model Registration

In [None]:
import mlflow
import mlflow.sklearn
import logging
import pandas as pd

# Set logging level to suppress unnecessary warnings

logging.getLogger("mlflow").setLevel(logging.ERROR)

# Set MLflow tracking URI

mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Set or create the experiment

experiment_name = "Smart_Premium_Experiment"
mlflow.set_experiment(experiment_name)

# Get experiment ID

experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id if experiment else mlflow.create_experiment(experiment_name)

# Ensure model and encoded_test_data are defined before running

predictions = model.predict(encoded_test_data).tolist()

# Save predictions to CSV

prediction_file = "Test_Predictions.csv"
pd.DataFrame(predictions, columns=["Predicted Premium"]).to_csv(prediction_file, index=False)

# Start an MLflow run

with mlflow.start_run(experiment_id=experiment_id, run_name="SmartPremium_Run") as run:
    
    # Log the model with automatic registration
    
    mlflow.sklearn.log_model(model, "Insurance_Premium_Model", registered_model_name="SmartPremiumModel")

    # Log parameters and artifacts
    
    mlflow.log_params({"model_name": model.__class__.__name__})
    mlflow.log_artifact(prediction_file)
    
    for i in range(min(5, len(predictions))): 
        mlflow.log_metric(f"Predicted Premium {i}", predictions[i])

print("MLflow run completed successfully! Predictions logged and saved to Test_Predictions.csv.")


Registered model 'SmartPremiumModel' already exists. Creating a new version of this model...
Created version '4' of model 'SmartPremiumModel'.


🏃 View run SmartPremium_Run at: http://127.0.0.1:5000/#/experiments/654137093132497434/runs/b9025d5ca35d417194c7ff296a904fef
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/654137093132497434
MLflow run completed successfully! Predictions logged and saved to Test_Predictions.csv.


In [None]:
import mlflow
import mlflow.sklearn
import logging
import pandas as pd

# Set logging level to suppress unnecessary warnings
logging.getLogger("mlflow").setLevel(logging.ERROR)

# Set MLflow tracking URI
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Set or create the experiment
experiment_name = "Smart_Premium_Experiment"
mlflow.set_experiment(experiment_name)

# Get experiment ID
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id if experiment else mlflow.create_experiment(experiment_name)

# Ensure model and encoded_test_data are defined before running
predictions = model.predict(encoded_test_data).tolist()

# Save predictions to CSV
prediction_file = "Test_Predictions.csv"
pd.DataFrame(predictions, columns=["Predicted Premium"]).to_csv(prediction_file, index=False)

# Start an MLflow run
with mlflow.start_run(experiment_id=experiment_id, run_name="SmartPremium_Run_1") as run:
    # Log the model with automatic registration
    mlflow.sklearn.log_model(model, "Insurance_Premium_Model", registered_model_name="SmartPremiumModel")

    # Log parameters and artifacts
    mlflow.log_params({"model_name": model.__class__.__name__})
    mlflow.log_artifact(prediction_file)
    
    for i in range(min(5, len(predictions))): 
        mlflow.log_metric(f"Predicted Premium {i}", predictions[i])

print("MLflow run completed successfully! Predictions logged and saved to Test_Predictions.csv.")
