In [4]:
%pip install scikit-learn
%pip install pandas
%pip install skops

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting skops
  Downloading skops-0.11.0-py3-none-any.whl.metadata (6.0 kB)
Collecting huggingface-hub>=0.17.0 (from skops)
  Downloading huggingface_hub-0.27.0-py3-none-any.whl.metadata (13 kB)
Collecting tabulate>=0.8.8 (from skops)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting filelock (from huggingface-hub>=0.17.0->skops)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.17.0->skops)
  Downloading fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Collecting pyyaml>=5.1 (from huggingface-hub>=0.17.0->skops)
  Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting requests (from huggingface-hub>=0.17.0->skops)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.42.1 (from huggingface-hub>=0.17.0->skops)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting typing-extensions>=3.7.4.3 (from huggingface-hub>=0.17.0


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import skops.io as sio

In [12]:
file_path = 'data_set.csv'
data = pd.read_csv(file_path, delimiter=';', on_bad_lines='skip')

features = ['Make', 'Model', 'Engine Power (HP)', 'Mileage (km)', 'Number of Accidents', 'Market Value ($)',
            'Total Owners', 'Has Dashcam', 'Vehicles in Family', 'Driving Experience', 'CAR_AGE',
            'AGE', 'HOMEKIDS', 'INCOME']
insurance_types = ['Liability Insurance', 'Theft Insurance', 'Premium Insurance', 'Repair Insurance',
                   'Premium Repair Insurance']

if data['Has Dashcam'].dtype == 'object':
    data['Has Dashcam'] = data['Has Dashcam'].str.strip().str.lower().map({'true': 1, 'false': 0})

for col in features + insurance_types:
    if col in data.columns and data[col].dtype == 'object':
        data[col] = data[col].str.replace(',', '.').str.replace('[^0-9.]', '', regex=True)
        data[col] = pd.to_numeric(data[col], errors='coerce')

numeric_features = [col for col in features if col not in ['Make', 'Model']]
categorical_features = ['Make', 'Model']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

X = data[features]

preprocessor.fit(X)

preprocessed_X = preprocessor.transform(X)

results = {}
models = {}
for insurance in insurance_types:
    y = data[insurance]

    X_train, X_test, y_train, y_test = train_test_split(preprocessed_X, y, test_size=0.2, random_state=42)

    model = GradientBoostingRegressor(random_state=42, n_estimators=300, learning_rate=0.03, max_depth=7)
    model.fit(X_train, y_train)

    models[insurance] = model

    y_pred = model.predict(X_test)
    y_pred_clipped = np.clip(y_pred, y.min(), y.max())

    mse = mean_squared_error(y_test, y_pred_clipped)
    rmse = np.sqrt(mse)
    avg_diff = np.mean(y_pred_clipped - y_test)

    results[insurance] = {
        'MSE': mse,
        'RMSE': rmse,
        'Average Difference (Predicted - Real)': avg_diff
    }

models['preprocessor'] = preprocessor

for insurance, metrics in results.items():
    print(
        f"{insurance} - MSE: {metrics['MSE']:.2f}, RMSE: {metrics['RMSE']:.2f}, Average Difference (Predicted - Real): {metrics['Average Difference (Predicted - Real)']:.2f}")



Liability Insurance - MSE: 427.42, RMSE: 20.67, Average Difference (Predicted - Real): -1.17
Theft Insurance - MSE: 59.42, RMSE: 7.71, Average Difference (Predicted - Real): 0.10
Premium Insurance - MSE: 46.51, RMSE: 6.82, Average Difference (Predicted - Real): 0.18
Repair Insurance - MSE: 41.43, RMSE: 6.44, Average Difference (Predicted - Real): -0.09
Premium Repair Insurance - MSE: 148.79, RMSE: 12.20, Average Difference (Predicted - Real): -0.29


In [13]:
sio.dump(models, 'model2.skops')

In [3]:
single_entity = {
    'Make': 'Honda',
    'Model': 'Civic',
    'Engine Power (HP)': 202,
    'Mileage (km)': 40594,
    'Number of Accidents': 0,
    'Market Value ($)': 21440,
    'Total Owners': 1,
    'Has Dashcam': 1,
    'Vehicles in Family': 4,
    'Driving Experience': 3,
    'CAR_AGE': 18,
    'AGE': 60,
    'HOMEKIDS': 0,
    'INCOME': 67349
}

single_entity_df = pd.DataFrame([single_entity])
single_entity_preprocessed = preprocessor.transform(single_entity_df)

single_entity_predictions = {}
for insurance in insurance_types:
    y = data[insurance]
    model = GradientBoostingRegressor(random_state=42, n_estimators=300, learning_rate=0.03, max_depth=7)
    model.fit(preprocessed_X, y)

    predicted_price = np.clip(model.predict(single_entity_preprocessed)[0], y.min(), y.max())
    single_entity_predictions[insurance] = predicted_price

print("Predicted Insurance Prices for Single Entity:")
for insurance, price in single_entity_predictions.items():
    print(f"{insurance}: {price:.2f}")


NameError: name 'preprocessor' is not defined

In [7]:
single_entity_predictions = {}

for insurance in insurance_types:
    y = data[insurance]
    model = GradientBoostingRegressor(random_state=42, n_estimators=300, learning_rate=0.03, max_depth=7)
    model.fit(preprocessed_X, y)

    predicted_price = np.clip(model.predict(single_entity_preprocessed)[0], y.min(), y.max())
    single_entity_predictions[insurance] = predicted_price

print("Predicted Insurance Prices for Single Entity:")
for insurance, price in single_entity_predictions.items():
    print(f"{insurance}: {price:.2f}")


NameError: name 'insurance_types' is not defined