# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Load the semicolon-delimited dataset; malformed rows are skipped instead of
# aborting the whole load.
file_path = 'data_set.csv'
data = pd.read_csv(file_path, delimiter=';', on_bad_lines='skip')

# Input columns used to predict the insurance prices.
features = ['Make', 'Model', 'Engine Power (HP)', 'Mileage (km)', 'Number of Accidents', 'Market Value ($)',
            'Total Owners', 'Has Dashcam', 'Vehicles in Family', 'Driving Experience', 'CAR_AGE', 'MVR_PTS',
            'AGE', 'HOMEKIDS', 'INCOME', 'TRAVTIME', 'CLM_FREQ']
# Target columns: one regression target per insurance product.
insurance_types = ['Liability Insurance', 'Theft Insurance', 'Premium Insurance', 'Repair Insurance',
                   'Premium Repair Insurance']

# When the flag was read as text, map 'true'/'false' (case-insensitive,
# surrounding whitespace ignored) to 1/0. Any other text value becomes NaN.
if data['Has Dashcam'].dtype == 'object':
    data['Has Dashcam'] = data['Has Dashcam'].str.strip().str.lower().map({'true': 1, 'false': 0})

# 'Make' and 'Model' are categorical text labels. They must NOT go through the
# numeric clean-up below: the regex removes every character that is not a
# digit or a dot, which would turn e.g. 'Honda' into NaN and wipe out the
# information the one-hot encoder needs.
text_features = {'Make', 'Model'}

# Normalise numeric columns that were read as text: use '.' as the decimal
# separator, drop every remaining non-numeric character (currency symbols,
# units, spaces), then convert. Values that still cannot be parsed become NaN.
for col in features + insurance_types:
    if col in text_features or col not in data.columns:
        continue
    if data[col].dtype == 'object':
        cleaned = data[col].str.replace(',', '.', regex=False).str.replace('[^0-9.]', '', regex=True)
        data[col] = pd.to_numeric(cleaned, errors='coerce')

# Split the feature list by type: 'Make' and 'Model' are handled as
# categorical columns, every other feature as numeric.
categorical_features = ['Make', 'Model']
numeric_features = [name for name in features if name not in categorical_features]

# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent label, then one-hot
# encode. Labels not seen during fitting are ignored at transform time
# instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Raw (unprocessed) feature frame. Preprocessing is fitted inside the loop on
# the training rows only, so the hold-out metrics are not contaminated by
# statistics computed from the test rows.
X = data[features]

# Hold-out evaluation: one gradient-boosting regressor per insurance product.
results = {}
for insurance in insurance_types:
    y = data[insurance]

    # Target cells that could not be parsed as numbers are NaN; the regressor
    # cannot be fitted on them, so evaluate on rows with a known target only.
    has_target = y.notna()

    X_train, X_test, y_train, y_test = train_test_split(
        X[has_target], y[has_target], test_size=0.2, random_state=42
    )

    # Fit imputation / scaling / encoding on the training split only and
    # merely apply it to the test split.
    X_train_prep = preprocessor.fit_transform(X_train)
    X_test_prep = preprocessor.transform(X_test)

    model = GradientBoostingRegressor(random_state=42, n_estimators=300, learning_rate=0.03, max_depth=7)
    model.fit(X_train_prep, y_train)

    y_pred = model.predict(X_test_prep)
    # Keep predictions inside the target range observed during training;
    # using the full-data range would leak test targets into the predictions.
    y_pred_clipped = np.clip(y_pred, y_train.min(), y_train.max())

    mse = mean_squared_error(y_test, y_pred_clipped)
    rmse = np.sqrt(mse)
    # Signed mean error: positive means the model over-predicts on average.
    avg_diff = np.mean(y_pred_clipped - y_test)

    results[insurance] = {
        'MSE': mse,
        'RMSE': rmse,
        'Average Difference (Predicted - Real)': avg_diff
    }

# Evaluation is done: refit the shared preprocessor on every row so that the
# code further down, which trains the final models on `preprocessed_X` and
# calls `preprocessor.transform`, sees a transformer fitted on the full data.
preprocessed_X = preprocessor.fit_transform(X)

for insurance, metrics in results.items():
    print(
        f"{insurance} - MSE: {metrics['MSE']:.2f}, RMSE: {metrics['RMSE']:.2f}, Average Difference (Predicted - Real): {metrics['Average Difference (Predicted - Real)']:.2f}")

# Example customer/vehicle record; keys match the entries of `features`.
single_entity = {
    'Make': 'Honda',
    'Model': 'Civic',
    'Engine Power (HP)': 202,
    'Mileage (km)': 40594,
    'Number of Accidents': 0,
    'Market Value ($)': 21440,
    'Total Owners': 1,
    'Has Dashcam': 1,
    'Vehicles in Family': 4,
    'Driving Experience': 3,
    'CAR_AGE': 18,
    'MVR_PTS': 3,
    'AGE': 60,
    'HOMEKIDS': 0,
    'INCOME': 67349,
    'TRAVTIME': 14,
    'CLM_FREQ': 2
}

# Run the record through the preprocessor that was already fitted on the
# dataset earlier in the script.
single_entity_df = pd.DataFrame([single_entity])
single_entity_preprocessed = preprocessor.transform(single_entity_df)

# Train one final model per insurance product on all available rows and
# predict the price for the example record.
single_entity_predictions = {}
for insurance in insurance_types:
    y = data[insurance]

    # Target cells that could not be parsed as numbers are NaN and would make
    # `fit` raise; train only on rows with a known target. The mask is
    # positional, which matches the row order of `preprocessed_X`.
    has_target = y.notna().to_numpy()

    model = GradientBoostingRegressor(random_state=42, n_estimators=300, learning_rate=0.03, max_depth=7)
    model.fit(preprocessed_X[has_target], y[has_target])

    # Clamp the prediction to the observed price range (min/max skip NaN).
    predicted_price = np.clip(model.predict(single_entity_preprocessed)[0], y.min(), y.max())
    single_entity_predictions[insurance] = predicted_price

print("Predicted Insurance Prices for Single Entity:")
for insurance, price in single_entity_predictions.items():
    print(f"{insurance}: {price:.2f}")


