In [3]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.1 (from mlflow)
  Downloading mlflow_skinny-3.1.1-py3-none-any.whl.metadata (30 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting waitress<4 (from mlflow)
  Downloading waitress-3.0.2-py3-none-any.whl.metadata (5.8 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading databricks_sdk-0.59.0-py3-none-any.whl.metadata (39 kB)
Collecting fastapi<1 (from mlflow-skinny==3.1.1->mlflow)
  Downloading fastapi-0.116.1-py3-none-any.whl.metadata (28 kB)
Collecting opentelemetry-api<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloading opentelemetry_api-1.35.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==3.1.1->mlflow)
  Downloadi

In [1]:
# ================================
# 📘 Insurance Premium Prediction – Linear Regression Version
# ================================

import pandas as pd
import numpy as np
import joblib
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Read the training CSV and discard the two columns this cell never models
df = pd.read_csv(
    "C:/Users/Lenovo/Documents/Guvi/Project_3/train.csv"
).drop(columns=["Policy Start Date", "Customer Feedback"])

# 2. Feature lists (raw columns only; engineered ones are appended in step 3)
numerical_cols = [
    "Age",
    "Annual Income",
    "Number of Dependents",
    "Health Score",
    "Previous Claims",
    "Vehicle Age",
    "Credit Score",
    "Insurance Duration",
]
categorical_cols = [
    "Gender",
    "Marital Status",
    "Education Level",
    "Occupation",
    "Location",
    "Policy Type",
    "Smoking Status",
    "Exercise Frequency",
    "Property Type",
]

# 3. Two row-wise ratio features
#    - "+ 1" keeps the denominator non-zero for customers without dependents
#    - an insurance duration of 0 is treated as 1 to avoid dividing by zero
dependents_plus_one = df["Number of Dependents"] + 1
safe_duration = df["Insurance Duration"].replace(0, 1)
df["Income_per_Dependent"] = df["Annual Income"] / dependents_plus_one
df["Claims_per_Year"] = df["Previous Claims"] / safe_duration
numerical_cols.extend(["Income_per_Dependent", "Claims_per_Year"])

# 4. Model inputs and log1p-transformed target
target_col = "Premium Amount"
feature_cols = numerical_cols + categorical_cols
X = df[feature_cols]
y = np.log1p(df[target_col])

# 5. Preprocessing
#    numeric columns: mean-impute, then standardise
#    categorical columns: mode-impute, then one-hot encode (handle_unknown="ignore")
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
numerical_transformer = Pipeline(steps=numeric_steps)
categorical_transformer = Pipeline(steps=categorical_steps)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# 6. Train/validation split: 20% held out, fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Linear regression pipeline (preprocessing and model fitted as one unit)
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

# 8. MLflow experiment
mlflow.set_experiment("InsurancePremiumPrediction_Linear")

with mlflow.start_run():

    # Fit preprocessing + regressor on the training split
    model_pipeline.fit(X_train, y_train)

    # Predictions come out in log1p space; expm1 brings both sides back to premium units
    y_pred_log = model_pipeline.predict(X_val)
    y_pred, y_val_original = np.expm1(y_pred_log), np.expm1(y_val)

    # Metrics on the original scale (RMSLE clamps predictions at 0 before log1p)
    squared_error = mean_squared_error(y_val_original, y_pred)
    rmse = np.sqrt(squared_error)
    mae = mean_absolute_error(y_val_original, y_pred)
    r2 = r2_score(y_val_original, y_pred)
    clamped_pred = np.maximum(y_pred, 0)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_val_original), np.log1p(clamped_pred)))

    # Log results
    mlflow.log_param("model_type", "LinearRegression")
    for metric_name, metric_value in (
        ("RMSE", rmse),
        ("MAE", mae),
        ("R2", r2),
        ("RMSLE", rmsle),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Persist the fitted pipeline locally and as an MLflow artifact
    joblib.dump(model_pipeline, "linear_model.pkl")
    mlflow.sklearn.log_model(model_pipeline, "linear_model")

# 9. Output metrics
for report_line in (
    "✅ Linear Regression Evaluation Results:",
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² Score: {r2:.4f}",
    f"RMSLE: {rmsle:.4f}",
):
    print(report_line)

# ===============================
# 📦 Predict on Unlabeled Test Set
# ===============================
# Score test.csv with the fitted pipeline and write one prediction per row.
test_unlabeled = pd.read_csv("C:/Users/Lenovo/Documents/Guvi/Project_3/test.csv")
test_unlabeled = test_unlabeled.drop(columns=["Policy Start Date", "Customer Feedback"])

# Add features: same formulas as the training frame, since the pipeline was fitted on these columns
test_unlabeled["Income_per_Dependent"] = test_unlabeled["Annual Income"] / (test_unlabeled["Number of Dependents"] + 1)
test_unlabeled["Claims_per_Year"] = test_unlabeled["Previous Claims"] / test_unlabeled["Insurance Duration"].replace(0, 1)

# Predict (the model outputs log1p(premium); expm1 converts back to premium units)
X_unlabeled = test_unlabeled[numerical_cols + categorical_cols]
test_log_preds = model_pipeline.predict(X_unlabeled)
test_preds = np.expm1(test_log_preds)

# Save predictions
# Falls back to a 0..n-1 row number when the file has no "Customer_ID" column.
# NOTE(review): confirm the actual id column name in test.csv.
submission_path = "C:/Users/Lenovo/Documents/Guvi/Project_3/submission_Linear_regression.csv"
submission = pd.DataFrame({
    "Customer_ID": test_unlabeled.get("Customer_ID", range(len(test_unlabeled))),
    "Predicted_Premium_Amount": test_preds
})
submission.to_csv(submission_path, index=False)

# Report the path that was actually written so the message cannot drift from the file name
print(f"📄 {submission_path} with Linear Regression predictions saved successfully!")


2025/07/27 10:59:26 INFO mlflow.tracking.fluent: Experiment with name 'InsurancePremiumPrediction_Linear' does not exist. Creating a new experiment.


✅ Linear Regression Evaluation Results:
RMSE: 940.14
MAE: 648.04
R² Score: -0.1828
RMSLE: 1.0892
📄 submission.csv with Linear Regression predictions saved successfully!


In [7]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------------------------------------ --- 1.3/1.5 MB 7.5 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 6.4 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [9]:
# =============================================
# 📘 Insurance Premium Prediction ML Pipeline
# =============================================

# 1️⃣ Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# 2️⃣ / 3️⃣ Load the training dataset and drop the columns that are not modelled here.
# The CSV must contain the target column 'Premium Amount'.
# 'Policy Start Date' and 'Customer Feedback' are set aside for now
# (the feedback column is free text and would need NLP to be useful).
df = pd.read_csv(
    "C:/Users/Lenovo/Documents/Guvi/Project_3/train.csv"
).drop(columns=["Policy Start Date", "Customer Feedback"])

# 4️⃣ Feature columns, selected by hand from the dataset description
numerical_cols = [
    "Age",
    "Annual Income",
    "Number of Dependents",
    "Health Score",
    "Previous Claims",
    "Vehicle Age",
    "Credit Score",
    "Insurance Duration",
]
categorical_cols = [
    "Gender",
    "Marital Status",
    "Education Level",
    "Occupation",
    "Location",
    "Policy Type",
    "Smoking Status",
    "Exercise Frequency",
    "Property Type",
]
target_col = "Premium Amount"

# 5️⃣ Features (X) and untransformed target (y)
feature_cols = numerical_cols + categorical_cols
X, y = df[feature_cols], df[target_col]

# 6️⃣ Preprocessing
# Numeric branch: mean imputation followed by standardisation.
# Categorical branch: most-frequent imputation followed by one-hot encoding.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
numerical_transformer = Pipeline(steps=numeric_steps)
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column list to its branch
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# 7️⃣ 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 8️⃣ Full pipeline: preprocessing + LightGBM regressor
lgbm_regressor = LGBMRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", lgbm_regressor),
    ]
)

# 9️⃣ Start MLflow to track experiment
mlflow.set_experiment("InsurancePremiumPrediction")

with mlflow.start_run():

    # Train the model (preprocessing and regressor are fitted together)
    model_pipeline.fit(X_train, y_train)

    # Predict on validation set (target is not log-transformed in this cell)
    y_pred = model_pipeline.predict(X_val)

    # Calculate evaluation metrics

    # Root Mean Squared Error
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    # Mean Absolute Error
    mae = mean_absolute_error(y_val, y_pred)

    # R² Score (how much variance is explained)
    r2 = r2_score(y_val, y_pred)

    # RMSLE – Root Mean Squared Log Error
    # Predictions are clamped at 0 first: the regressor is unconstrained, and
    # log1p of a value below -1 is NaN, which would poison the metric.
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_val), np.log1p(np.maximum(y_pred, 0))))

    # Log all metrics and parameters to MLflow
    # The pipeline's regressor is LGBMRegressor, so label the run accordingly.
    mlflow.log_param("model_type", "LightGBM")
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)
    mlflow.log_metric("RMSLE", rmsle)

    # Save model locally and log to MLflow
    joblib.dump(model_pipeline, "trained_model.pkl")  # Local file
    mlflow.sklearn.log_model(model_pipeline, "model")  # MLflow UI

# Final evaluation output
print("✅ Model Evaluation Results:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.4f}")
print(f"RMSLE: {rmsle:.4f}")


# ========================================
# 📦 14️⃣ Predict on Unlabeled Test Set
# ========================================
# Score test.csv with the fitted pipeline and write one prediction per row.

# Load the test data (no Premium Amount column)
test_unlabeled = pd.read_csv("C:/Users/Lenovo/Documents/Guvi/Project_3/test.csv")

# Drop unused/unstructured columns
test_unlabeled = test_unlabeled.drop(columns=["Policy Start Date", "Customer Feedback"])

# Select features only (same column lists the pipeline was fitted on)
X_unlabeled = test_unlabeled[numerical_cols + categorical_cols]

# Use trained pipeline to predict premium amounts (already in premium units in this cell)
test_predictions = model_pipeline.predict(X_unlabeled)

# Create a new DataFrame with predictions
# Falls back to a 0..n-1 row number when the file has no "Customer_ID" column.
# NOTE(review): confirm the actual id column name in test.csv.
submission = pd.DataFrame({
    "Customer_ID": test_unlabeled.get("Customer_ID", range(len(test_unlabeled))),
    "Predicted_Premium_Amount": test_predictions
})

# Save to CSV
submission_path = "C:/Users/Lenovo/Documents/Guvi/Project_3/submission_LGBM.csv"
submission.to_csv(submission_path, index=False)

# Report the path that was actually written so the message cannot drift from the file name
print(f"📄 {submission_path} with predictions saved successfully!")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 913
[LightGBM] [Info] Number of data points in the train set: 960000, number of used features: 35
[LightGBM] [Info] Start training from score 1102.505529




✅ Model Evaluation Results:
RMSE: 847.25
MAE: 646.20
R² Score: 0.0394
RMSLE: 1.1493
📄 submission.csv with predictions saved successfully!


In [3]:
 # ================================
# 📘 Insurance Premium Prediction – Decision Tree Version
# ================================

import pandas as pd
import numpy as np
import joblib
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Load the training data, minus the two columns that are not modelled
raw_train = pd.read_csv("C:/Users/Lenovo/Documents/Guvi/Project_3/train.csv")
df = raw_train.drop(columns=["Policy Start Date", "Customer Feedback"])

# 2. Raw feature lists
numerical_cols = [
    "Age",
    "Annual Income",
    "Number of Dependents",
    "Health Score",
    "Previous Claims",
    "Vehicle Age",
    "Credit Score",
    "Insurance Duration",
]
categorical_cols = [
    "Gender",
    "Marital Status",
    "Education Level",
    "Occupation",
    "Location",
    "Policy Type",
    "Smoking Status",
    "Exercise Frequency",
    "Property Type",
]

# 3. Engineered ratios; denominators are shifted/replaced so they are never zero
engineered_cols = ["Income_per_Dependent", "Claims_per_Year"]
df[engineered_cols[0]] = df["Annual Income"] / (df["Number of Dependents"] + 1)
df[engineered_cols[1]] = df["Previous Claims"] / df["Insurance Duration"].replace(0, 1)
numerical_cols.extend(engineered_cols)

# 4. Feature matrix and log1p-transformed target
target_col = "Premium Amount"
X = df[[*numerical_cols, *categorical_cols]]
y = np.log1p(df[target_col])

# 5. Preprocessing: one branch per column type, combined by a ColumnTransformer
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)
column_branches = [
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# 6. Hold out 20% of the rows for validation (seeded)
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

# 7. Preprocessing followed by a depth-limited decision tree
tree_regressor = DecisionTreeRegressor(max_depth=6, random_state=42)
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", tree_regressor),
    ]
)

# 8. Start MLflow experiment
mlflow.set_experiment("InsurancePremiumPrediction_DecisionTree")

with mlflow.start_run():

    # Train model (preprocessing and tree are fitted together)
    model_pipeline.fit(X_train, y_train)

    # Predict in log1p space, then map predictions and targets back to premium units
    y_pred_log = model_pipeline.predict(X_val)
    y_pred = np.expm1(y_pred_log)
    y_val_original = np.expm1(y_val)

    # Metrics on the original scale; RMSLE clamps predictions at 0 before log1p
    rmse = np.sqrt(mean_squared_error(y_val_original, y_pred))
    mae = mean_absolute_error(y_val_original, y_pred)
    r2 = r2_score(y_val_original, y_pred)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_val_original), np.log1p(np.maximum(y_pred, 0))))

    # Log to MLflow
    # Read the hyperparameter back from the estimator instead of repeating the literal,
    # so the logged value always matches what was actually trained.
    regressor = model_pipeline.named_steps["regressor"]
    mlflow.log_param("model_type", "DecisionTree")
    mlflow.log_param("max_depth", regressor.max_depth)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)
    mlflow.log_metric("RMSLE", rmsle)

    # Save model locally and as an MLflow artifact
    joblib.dump(model_pipeline, "decision_tree_model.pkl")
    mlflow.sklearn.log_model(model_pipeline, "decision_tree_model")

# 9. Output metrics
print("✅ Decision Tree Evaluation Results:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.4f}")
print(f"RMSLE: {rmsle:.4f}")

# ===============================
# 📦 Predict on Unlabeled Test Set
# ===============================
# Score test.csv with the fitted pipeline and write one prediction per row.
test_unlabeled = pd.read_csv("C:/Users/Lenovo/Documents/Guvi/Project_3/test.csv")
test_unlabeled = test_unlabeled.drop(columns=["Policy Start Date", "Customer Feedback"])

# Add engineered features: same formulas as the training frame, since the pipeline was fitted on these columns
test_unlabeled["Income_per_Dependent"] = test_unlabeled["Annual Income"] / (test_unlabeled["Number of Dependents"] + 1)
test_unlabeled["Claims_per_Year"] = test_unlabeled["Previous Claims"] / test_unlabeled["Insurance Duration"].replace(0, 1)

# The model outputs log1p(premium); expm1 converts back to premium units
X_unlabeled = test_unlabeled[numerical_cols + categorical_cols]
test_log_preds = model_pipeline.predict(X_unlabeled)
test_preds = np.expm1(test_log_preds)

# Save predictions
# Falls back to a 0..n-1 row number when the file has no "Customer_ID" column.
# NOTE(review): confirm the actual id column name in test.csv.
submission_path = "C:/Users/Lenovo/Documents/Guvi/Project_3/submission_decision_tree.csv"
submission = pd.DataFrame({
    "Customer_ID": test_unlabeled.get("Customer_ID", range(len(test_unlabeled))),
    "Predicted_Premium_Amount": test_preds
})
submission.to_csv(submission_path, index=False)

# Report the path that was actually written so the message cannot drift from the file name
print(f"📄 {submission_path} with Decision Tree predictions saved successfully!")


2025/07/27 11:01:21 INFO mlflow.tracking.fluent: Experiment with name 'InsurancePremiumPrediction_DecisionTree' does not exist. Creating a new experiment.


✅ Decision Tree Evaluation Results:
RMSE: 927.66
MAE: 625.68
R² Score: -0.1516
RMSLE: 1.0605
📄 submission.csv with Decision Tree predictions saved successfully!


In [5]:
# ================================
# 📘 Insurance Premium Prediction – Random Forest Version
# ================================

import pandas as pd
import numpy as np
import joblib
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Training data without the two columns this cell does not model
df = pd.read_csv(
    "C:/Users/Lenovo/Documents/Guvi/Project_3/train.csv"
).drop(columns=["Policy Start Date", "Customer Feedback"])

# 2. Raw feature columns
numerical_cols = [
    "Age",
    "Annual Income",
    "Number of Dependents",
    "Health Score",
    "Previous Claims",
    "Vehicle Age",
    "Credit Score",
    "Insurance Duration",
]
categorical_cols = [
    "Gender",
    "Marital Status",
    "Education Level",
    "Occupation",
    "Location",
    "Policy Type",
    "Smoking Status",
    "Exercise Frequency",
    "Property Type",
]

# 3. Feature engineering: income per household member and claims per insured year
#    (zero durations are replaced by 1 so the division is always defined)
household_size = df["Number of Dependents"] + 1
insured_years = df["Insurance Duration"].replace(0, 1)
df["Income_per_Dependent"] = df["Annual Income"] / household_size
df["Claims_per_Year"] = df["Previous Claims"] / insured_years
numerical_cols.extend(["Income_per_Dependent", "Claims_per_Year"])

# 4. Features and target; the target is log1p-transformed to reduce skew
target_col = "Premium Amount"
model_columns = numerical_cols + categorical_cols
X = df[model_columns]
y = np.log1p(df[target_col])

# 5. Preprocessing: impute + scale numerics, impute + one-hot encode categoricals
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
numerical_transformer = Pipeline(steps=numeric_steps)
categorical_transformer = Pipeline(steps=categorical_steps)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# 6. Seeded 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Random Forest pipeline; n_jobs=-1 is passed to the forest for parallel training
forest_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(**forest_params)),
    ]
)

# 8. MLflow experiment
mlflow.set_experiment("InsurancePremiumPrediction_RandomForest")

with mlflow.start_run():

    # Fit model (preprocessing and forest are fitted together)
    model_pipeline.fit(X_train, y_train)

    # Predict in log1p space, then map predictions and targets back to premium units
    y_pred_log = model_pipeline.predict(X_val)
    y_pred = np.expm1(y_pred_log)
    y_val_original = np.expm1(y_val)

    # Evaluation metrics on the original scale; RMSLE clamps predictions at 0 before log1p
    rmse = np.sqrt(mean_squared_error(y_val_original, y_pred))
    mae = mean_absolute_error(y_val_original, y_pred)
    r2 = r2_score(y_val_original, y_pred)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_val_original), np.log1p(np.maximum(y_pred, 0))))

    # Log to MLflow
    # Read hyperparameters back from the estimator instead of repeating the literals,
    # so the logged values always match what was actually trained.
    regressor = model_pipeline.named_steps["regressor"]
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", regressor.n_estimators)
    mlflow.log_param("max_depth", regressor.max_depth)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)
    mlflow.log_metric("RMSLE", rmsle)

    # Save model locally and as an MLflow artifact
    joblib.dump(model_pipeline, "random_forest_model.pkl")
    mlflow.sklearn.log_model(model_pipeline, "random_forest_model")

# 9. Print results
print("✅ Random Forest Evaluation Results:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.4f}")
print(f"RMSLE: {rmsle:.4f}")

# ===============================
# 📦 Predict on Unlabeled Test Set
# ===============================
# Score test.csv with the fitted pipeline and write one prediction per row.
test_unlabeled = pd.read_csv("C:/Users/Lenovo/Documents/Guvi/Project_3/test.csv")
test_unlabeled = test_unlabeled.drop(columns=["Policy Start Date", "Customer Feedback"])

# Add same engineered features as the training frame, since the pipeline was fitted on these columns
test_unlabeled["Income_per_Dependent"] = test_unlabeled["Annual Income"] / (test_unlabeled["Number of Dependents"] + 1)
test_unlabeled["Claims_per_Year"] = test_unlabeled["Previous Claims"] / test_unlabeled["Insurance Duration"].replace(0, 1)

# The model outputs log1p(premium); expm1 converts back to premium units
X_unlabeled = test_unlabeled[numerical_cols + categorical_cols]
test_log_preds = model_pipeline.predict(X_unlabeled)
test_preds = np.expm1(test_log_preds)

# Save predictions to CSV
# Falls back to a 0..n-1 row number when the file has no "Customer_ID" column.
# NOTE(review): confirm the actual id column name in test.csv.
submission_path = "C:/Users/Lenovo/Documents/Guvi/Project_3/submission_Random_forest.csv"
submission = pd.DataFrame({
    "Customer_ID": test_unlabeled.get("Customer_ID", range(len(test_unlabeled))),
    "Predicted_Premium_Amount": test_preds
})
submission.to_csv(submission_path, index=False)

# Report the path that was actually written so the message cannot drift from the file name
print(f"📄 {submission_path} with Random Forest predictions saved successfully!")


2025/07/27 11:02:39 INFO mlflow.tracking.fluent: Experiment with name 'InsurancePremiumPrediction_RandomForest' does not exist. Creating a new experiment.


✅ Random Forest Evaluation Results:
RMSE: 927.45
MAE: 623.54
R² Score: -0.1511
RMSLE: 1.0563
📄 submission.csv with Random Forest predictions saved successfully!


In [9]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 1.3/150.0 MB 8.4 MB/s eta 0:00:18
    --------------------------------------- 2.9/150.0 MB 7.6 MB/s eta 0:00:20
   - -------------------------------------- 4.5/150.0 MB 7.4 MB/s eta 0:00:20
   - -------------------------------------- 6.3/150.0 MB 7.4 MB/s eta 0:00:20
   -- ------------------------------------- 7.6/150.0 MB 7.3 MB/s eta 0:00:20
   -- ------------------------------------- 9.2/150.0 MB 7.3 MB/s eta 0:00:20
   -- ------------------------------------- 10.7/150.0 MB 7.4 MB/s eta 0:00:19
   --- ------------------------------------ 12.3/150.0 MB 7.3 MB/s eta 0:00:19
   --- ------------------------------------ 13.9/150.0 MB 7.3 MB/s eta 0:00:19
   ---- ----------------------------------- 15.5/150.0 MB 7.3 MB/s eta 0:00:

In [11]:
# =============================================
# 📘 Insurance Premium Prediction – XGBoost Version
# =============================================

import pandas as pd
import numpy as np
import joblib
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

# Load the training dataset and drop the unstructured columns in one step
df = pd.read_csv(
    "C:/Users/Lenovo/Documents/Guvi/Project_3/train.csv"
).drop(columns=["Policy Start Date", "Customer Feedback"])

# Feature lists (raw columns; engineered ones are appended below)
numerical_cols = [
    "Age",
    "Annual Income",
    "Number of Dependents",
    "Health Score",
    "Previous Claims",
    "Vehicle Age",
    "Credit Score",
    "Insurance Duration",
]
categorical_cols = [
    "Gender",
    "Marital Status",
    "Education Level",
    "Occupation",
    "Location",
    "Policy Type",
    "Smoking Status",
    "Exercise Frequency",
    "Property Type",
]

# Engineered ratio features; both denominators are kept away from zero
income_denominator = df["Number of Dependents"] + 1
claims_denominator = df["Insurance Duration"].replace(0, 1)
df["Income_per_Dependent"] = df["Annual Income"] / income_denominator
df["Claims_per_Year"] = df["Previous Claims"] / claims_denominator
numerical_cols.extend(["Income_per_Dependent", "Claims_per_Year"])

# Feature matrix and log1p-transformed target
target_col = "Premium Amount"
selected_columns = numerical_cols + categorical_cols
X = df[selected_columns]
y = np.log1p(df[target_col])

# Preprocessing: one branch per column type
numerical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Seeded 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline whose regressor hyperparameters are filled in by the search below
base_regressor = XGBRegressor(objective="reg:squarederror", random_state=42)
xgb_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", base_regressor),
    ]
)

# Candidate values per hyperparameter; the "regressor__" prefix routes each one
# to the pipeline step named "regressor"
regressor_grid = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 5, 7],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
}
param_dist = {f"regressor__{name}": values for name, values in regressor_grid.items()}

# Randomized search: 20 sampled candidates, 3-fold CV, scored by R² on the log target
search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring="r2",
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Start MLflow experiment
mlflow.set_experiment("InsurancePremiumPrediction_XGBoost")

with mlflow.start_run():

    # Train + tune: runs the randomized search, then keeps the refitted best pipeline
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    # Predict on validation in log1p space, then map back to premium units
    y_pred_log = best_model.predict(X_val)
    y_pred = np.expm1(y_pred_log)  # Inverse log1p
    y_val_original = np.expm1(y_val)  # Inverse ground truth

    # Metrics on the original scale; RMSLE clamps predictions at 0 before log1p
    rmse = np.sqrt(mean_squared_error(y_val_original, y_pred))
    mae = mean_absolute_error(y_val_original, y_pred)
    r2 = r2_score(y_val_original, y_pred)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_val_original), np.log1p(np.maximum(y_pred, 0))))

    # Log results
    # model_type is logged like in the other model cells so runs can be filtered by model family
    mlflow.log_params(search.best_params_)
    mlflow.log_param("model_type", "XGBoost_RandomizedSearch")
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)
    mlflow.log_metric("RMSLE", rmsle)

    # Save model locally and as an MLflow artifact
    joblib.dump(best_model, "best_xgb_model.pkl")
    mlflow.sklearn.log_model(best_model, "xgb_model")

# Output
print("✅ Model Evaluation Results:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R² Score: {r2:.4f}")
print(f"RMSLE: {rmsle:.4f}")

# ===============================
# 📦 Predict on Unlabeled Test Set
# ===============================
# Score test.csv with the tuned pipeline and write one prediction per row.
test_unlabeled = pd.read_csv("C:/Users/Lenovo/Documents/Guvi/Project_3/test.csv")
test_unlabeled = test_unlabeled.drop(columns=["Policy Start Date", "Customer Feedback"])

# Add same features as the training frame, since the pipeline was fitted on these columns
test_unlabeled["Income_per_Dependent"] = test_unlabeled["Annual Income"] / (test_unlabeled["Number of Dependents"] + 1)
test_unlabeled["Claims_per_Year"] = test_unlabeled["Previous Claims"] / test_unlabeled["Insurance Duration"].replace(0, 1)

# Predict
X_unlabeled = test_unlabeled[numerical_cols + categorical_cols]
test_log_preds = best_model.predict(X_unlabeled)
test_preds = np.expm1(test_log_preds)  # Inverse transform back to premium units

# Save predictions
# Falls back to a 0..n-1 row number when the file has no "Customer_ID" column.
# NOTE(review): confirm the actual id column name in test.csv.
submission_path = "C:/Users/Lenovo/Documents/Guvi/Project_3/submission_XGBoost.csv"
submission = pd.DataFrame({
    "Customer_ID": test_unlabeled.get("Customer_ID", range(len(test_unlabeled))),
    "Predicted_Premium_Amount": test_preds
})
submission.to_csv(submission_path, index=False)

# Report the path that was actually written so the message cannot drift from the file name
print(f"📄 {submission_path} with predictions saved successfully!")


2025/07/27 11:12:51 INFO mlflow.tracking.fluent: Experiment with name 'InsurancePremiumPrediction_XGBoost' does not exist. Creating a new experiment.


Fitting 3 folds for each of 20 candidates, totalling 60 fits




✅ Model Evaluation Results:
RMSE: 928.14
MAE: 624.06
R² Score: -0.1528
RMSLE: 1.0566
📄 submission.csv with predictions saved successfully!


In [5]:
import pandas as pd
import numpy as np
import joblib
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlflow.models.signature import infer_signature

# Load data
# Reads the training CSV and drops the two columns this cell does not model.
df = pd.read_csv("C:/Users/Lenovo/Documents/Guvi/Project_3/train.csv")
df = df.drop(columns=["Policy Start Date", "Customer Feedback"])

# Define features
# Raw column lists; the engineered columns are appended to numerical_cols below.
numerical_cols = [
    "Age", "Annual Income", "Number of Dependents", "Health Score",
    "Previous Claims", "Vehicle Age", "Credit Score", "Insurance Duration"
]

categorical_cols = [
    "Gender", "Marital Status", "Education Level", "Occupation",
    "Location", "Policy Type", "Smoking Status",
    "Exercise Frequency", "Property Type"
]

# Feature engineering
# Both ratios are computed from the RAW column values: this runs before the
# in-place log1p loop below, so the statement order here is significant.
df["Income_per_Dependent"] = df["Annual Income"] / (df["Number of Dependents"] + 1)
# A zero duration becomes NaN so the division yields NaN instead of inf ...
df["Claims_per_Year"] = df["Previous Claims"] / df["Insurance Duration"].replace(0, np.nan)
# ... and every NaN in the ratio is then set to 0. This also covers rows where
# "Previous Claims" or "Insurance Duration" was missing, not only zero durations.
# NOTE(review): the other model cells use .replace(0, 1) and leave missing
# values to the imputer; confirm the difference is intended.
df["Claims_per_Year"] = df["Claims_per_Year"].fillna(0)
numerical_cols += ["Income_per_Dependent", "Claims_per_Year"]

# Log transform skewed features
# Overwrites the three columns in place with log1p of their values.
# NOTE(review): this happens outside the sklearn pipeline, so any data scored
# with the fitted model presumably needs the same transform applied first.
log_transform_features = ["Annual Income", "Credit Score", "Previous Claims"]
for col in log_transform_features:
    df[col] = np.log1p(df[col])

# Prepare target
# X holds the numeric + categorical feature columns; y is log1p of the premium.
target_col = "Premium Amount"
X = df[numerical_cols + categorical_cols]
y = np.log1p(df[target_col])

# Preprocessing: impute + scale numerics; impute + dense one-hot encode categoricals
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
numerical_transformer = Pipeline(steps=numeric_steps)
categorical_transformer = Pipeline(steps=categorical_steps)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Seeded 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing followed by a random forest whose hyperparameters are searched below
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

# Small candidate grid; the "regressor__" prefix targets the forest step
forest_grid = {
    "n_estimators": [100, 150],
    "max_depth": [10, 15, 20],
    "min_samples_split": [2, 5],
    "max_features": ["sqrt", "log2"],
}
param_dist = {f"regressor__{name}": values for name, values in forest_grid.items()}

# Randomized search: 8 sampled candidates, 3-fold CV, scored by R² on the log target
search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=8,
    scoring="r2",
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Run the search and keep the refitted best pipeline
search.fit(X_train, y_train)
best_model = search.best_estimator_

# MLflow tracking: evaluate the tuned pipeline on the validation split and record the run
mlflow.set_experiment("InsurancePremiumPrediction_RF_Fast")

with mlflow.start_run():
    # Predictions are in log1p space; expm1 maps both sides back to premium units
    y_pred_log = best_model.predict(X_val)
    y_pred, y_val_original = np.expm1(y_pred_log), np.expm1(y_val)

    # Metrics on the original scale (RMSLE clamps predictions at 0 before log1p)
    squared_error = mean_squared_error(y_val_original, y_pred)
    rmse = np.sqrt(squared_error)
    mae = mean_absolute_error(y_val_original, y_pred)
    r2 = r2_score(y_val_original, y_pred)
    clamped_pred = np.maximum(y_pred, 0)
    rmsle = np.sqrt(mean_squared_error(np.log1p(y_val_original), np.log1p(clamped_pred)))

    # Parameters first, then the four metrics
    mlflow.log_params(search.best_params_)
    mlflow.log_param("model_type", "RandomForest_RandomizedSearch")
    for metric_name, metric_value in (
        ("RMSE", rmse),
        ("MAE", mae),
        ("R2", r2),
        ("RMSLE", rmsle),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Log the model with a signature inferred from the training inputs/outputs
    train_predictions = best_model.predict(X_train)
    signature = infer_signature(X_train, train_predictions)
    example_rows = X_train.iloc[:5]
    mlflow.sklearn.log_model(
        best_model,
        name="rf_model_fast",
        signature=signature,
        input_example=example_rows,
    )

    # Local copy of the fitted pipeline
    joblib.dump(best_model, "rf_model_fast.pkl")

# Print results
for report_line in (
    "✅ FAST Random Forest Evaluation Results:",
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    f"R² Score: {r2:.4f}",
    f"RMSLE: {rmsle:.4f}",
):
    print(report_line)


Fitting 3 folds for each of 8 candidates, totalling 24 fits



KeyboardInterrupt

