In [1]:
%pip install scikit-learn
%pip install pandas
%pip install skops
%pip install mlflow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip






[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting mlflow
  Downloading mlflow-2.19.0-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.19.0 (from mlflow)
  Downloading mlflow_skinny-2.19.0-py3-none-any.whl.metadata (31 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Jinja2<4,>=3.0 (from mlflow)
  Downloading jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting markdown<4,>=3.3 (from mlflow)
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Collecting matplotlib<4 (from mlflow)
  Downloading matplotlib-3.10.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting pyarrow<19,>=4.0.0 (from mlflow)
  Downloading pyarrow-18.1.0-cp


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import skops.io as sio
import mlflow
from mlflow.models import infer_signature

In [9]:
# Train one gradient-boosting regressor per insurance product, log every model
# (plus the shared preprocessor) to MLflow, and print hold-out error metrics.

# --- Load ------------------------------------------------------------------
file_path = 'data_set.csv'
# Semicolon-delimited file; malformed rows are skipped instead of aborting the load.
data = pd.read_csv(file_path, delimiter=';', on_bad_lines='skip')

features = ['Make', 'Model', 'Engine Power (HP)', 'Mileage (km)', 'Number of Accidents', 'Market Value ($)',
            'Total Owners', 'Has Dashcam', 'Vehicles in Family', 'Driving Experience', 'CAR_AGE',
            'AGE', 'HOMEKIDS', 'INCOME']
insurance_types = ['Liability Insurance', 'Theft Insurance', 'Premium Insurance', 'Repair Insurance',
                   'Premium Repair Insurance']

categorical_features = ['Make', 'Model']
numeric_features = [col for col in features if col not in categorical_features]

# --- Clean -----------------------------------------------------------------
# Textual booleans become 1/0; any other value maps to NaN and is imputed later.
if data['Has Dashcam'].dtype == 'object':
    data['Has Dashcam'] = data['Has Dashcam'].str.strip().str.lower().map({'true': 1, 'false': 0})

# Only numeric inputs and the targets are parsed as numbers. 'Make' and 'Model'
# must stay as strings: running them through this digit-only filter would wipe
# out the categories before they reach the one-hot encoder.
for col in numeric_features + insurance_types:
    if col in data.columns and data[col].dtype == 'object':
        # Decimal comma -> decimal point, then drop everything that is not a digit or a dot
        # (currency symbols, units, spaces; note this also drops a leading minus sign).
        data[col] = data[col].str.replace(',', '.').str.replace('[^0-9.]', '', regex=True)
        data[col] = pd.to_numeric(data[col], errors='coerce')

# --- Preprocessing pipeline ------------------------------------------------
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

X = data[features]

# --- Train, evaluate, log --------------------------------------------------
results = {}
# Fitted estimators keyed by insurance type (plus 'preprocessor'); a later cell
# serializes this dict, so it has to be populated here.
models = {}

with mlflow.start_run():
    # NOTE(review): the preprocessor is fitted on all rows before the train/test
    # split, so test rows influence imputation and scaling statistics. Kept as-is
    # because the logged preprocessor is meant to be fitted on the full data set;
    # confirm this is acceptable for the reported metrics.
    preprocessor.fit(X)
    preprocessed_X = preprocessor.transform(X)

    for insurance in insurance_types:
        y_all = data[insurance]

        # Numeric coercion above can leave NaN targets, which the regressor
        # rejects; train and evaluate only on rows with a known target.
        known_rows = np.flatnonzero(y_all.notna().to_numpy())
        dropped = len(y_all) - len(known_rows)
        if dropped:
            print(f"{insurance}: skipped {dropped} rows with a missing target")
        y = y_all.iloc[known_rows]
        X_known = preprocessed_X[known_rows]

        X_train, X_test, y_train, y_test = train_test_split(X_known, y, test_size=0.2, random_state=42)

        model = GradientBoostingRegressor(random_state=42, n_estimators=300, learning_rate=0.03, max_depth=7)
        model.fit(X_train, y_train)

        signature = infer_signature(X_train, model.predict(X_train))

        model_path = f"model_{insurance}"

        # Export to MLflow and register one model per insurance type.
        mlflow.sklearn.log_model(
            model,
            model_path,
            signature=signature,
            registered_model_name=f"insurance_pricing_model_{insurance}",
        )
        models[insurance] = model

        # Predictions are clipped to the observed target range before scoring.
        y_pred = model.predict(X_test)
        y_pred_clipped = np.clip(y_pred, y.min(), y.max())

        mse = mean_squared_error(y_test, y_pred_clipped)
        rmse = np.sqrt(mse)
        avg_diff = np.mean(y_pred_clipped - y_test)

        results[insurance] = {
            'MSE': mse,
            'RMSE': rmse,
            'Average Difference (Predicted - Real)': avg_diff
        }

    # Raw feature frame in, transformed matrix out.
    preprocessor_signature = infer_signature(X, preprocessed_X)

    mlflow.sklearn.log_model(
        preprocessor,
        "preprocessor",
        signature=preprocessor_signature,
    )

models['preprocessor'] = preprocessor

# --- Report ----------------------------------------------------------------
for insurance, metrics in results.items():
    print(
        f"{insurance} - MSE: {metrics['MSE']:.2f}, RMSE: {metrics['RMSE']:.2f}, Average Difference (Predicted - Real): {metrics['Average Difference (Predicted - Real)']:.2f}")



inputs: 
  ['Make': string (required), 'Model': string (required), 'Engine Power (HP)': long (required), 'Mileage (km)': long (required), 'Number of Accidents': long (required), 'Market Value ($)': long (required), 'Total Owners': long (required), 'Has Dashcam': long (required), 'Vehicles in Family': long (required), 'Driving Experience': long (required), 'CAR_AGE': long (required), 'AGE': long (required), 'HOMEKIDS': long (required), 'INCOME': long (required)]
outputs: 
  ['Liability Insurance': double (required), 'Theft Insurance': double (required), 'Premium Insurance': double (required), 'Repair Insurance': double (required), 'Premium Repair Insurance': double (required)]
params: 
  None

Liability Insurance - MSE: 427.70, RMSE: 20.68, Average Difference (Predicted - Real): -1.17
Theft Insurance - MSE: 60.22, RMSE: 7.76, Average Difference (Predicted - Real): 0.09
Premium Insurance - MSE: 46.38, RMSE: 6.81, Average Difference (Predicted - Real): 0.19
Repair Insurance - MSE: 41.43, 

Successfully registered model 'insurance'.
Created version '1' of model 'insurance'.


In [13]:
# Serialize whatever the training cell stored in `models` to a single skops file.
skops_output_path = 'model2.skops'
sio.dump(models, skops_output_path)