# ANNEXE

In [None]:
# Pairwise correlation matrix of the numeric risk characteristics
risk_columns = ['VehPower', 'DrivAge', 'BonusMalus', 'Density', 'Exposure']
claims_data.loc[:, risk_columns].corr()

In [None]:
# Monte Carlo simulation of the portfolio's aggregate loss.
#
# Every policy draws its claim count from the fitted Negative Binomial
# (r_negbin, p_negbin) and every claim draws its severity from the fitted
# Log-normal (shape, loc, scale); both fits are expected to exist already.

# Fitted parameters, grouped per distribution
params_negbin = [r_negbin, p_negbin]  # claim frequency (Negative Binomial)
params_lognorm = [shape, loc, scale]  # claim severity (Log-normal)

# Number of simulations and policies
n_simulations = 1000
n_policies = len(claims_data)

# Pre-allocate an array for total losses
total_losses = np.zeros(n_simulations)

# Policy positions, reused in every simulation to map claims back to policies
policy_index = np.arange(n_policies)

# Monte Carlo simulation loop
for i in range(n_simulations):
    # Simulate number of claims for all policies at once
    num_claims = nbinom.rvs(*params_negbin, size=n_policies)

    # Draw every claim severity of this simulation in a single call instead of
    # one scipy call per policy (the severities are i.i.d., so only the
    # grouping into policies matters)
    severities = lognorm.rvs(shape, loc=loc, scale=scale, size=int(num_claims.sum()))

    # Policy position repeated once per claim, so severities can be summed
    # per policy; policies without claims keep a total of 0
    claim_owner = np.repeat(policy_index, num_claims)
    total_claim_severity = np.bincount(claim_owner, weights=severities, minlength=n_policies)

    # Store the total loss for this simulation
    total_losses[i] = total_claim_severity.sum()

# Calculate expected total loss from the simulation results
expected_total_loss = np.mean(total_losses)

# Calculate Value-at-Risk (VaR) at different confidence levels (95% and 99%)
var_95 = np.percentile(total_losses, 95)
var_99 = np.percentile(total_losses, 99)

# Plot the total losses distribution and highlight VaR levels
plt.figure(figsize=(10, 6))
plt.hist(total_losses, bins=50, color='skyblue', alpha=0.7)
plt.axvline(var_95, color='red', linestyle='dashed', linewidth=2, label=f'VaR 95%: {var_95:.2f}')
plt.axvline(var_99, color='orange', linestyle='dashed', linewidth=2, label=f'VaR 99%: {var_99:.2f}')
plt.title('Distribution of Total Losses from Monte Carlo Simulation')
plt.xlabel('Total Losses')
plt.ylabel('Frequency')
plt.grid(True)
plt.legend()
plt.show()

# Printing Risk Measures
print(f"Expected Total Loss: {expected_total_loss:.2f}")
print(f"95th Percentile Total Loss (VaR 95%): {var_95:.2f}")
print(f"99th Percentile Total Loss (VaR 99%): {var_99:.2f}")

In [None]:
from sklearn.pipeline import Pipeline

# Work on an independent copy of claims_data; the cluster labels are written
# to this copy only
df_1 = claims_data.copy()

# Features used for clustering; every one of them is treated as categorical
categorical_columns = [
    'Age Group',
    'Vehicle Group',
    'Vehicle power',
    'Bonus / Malus',
    'Area',
    'Region',
]

# Step 1: preprocessing - one-hot encode the categorical columns
one_hot_step = ('cat', OneHotEncoder(), categorical_columns)
preprocessor = ColumnTransformer(transformers=[one_hot_step])

# Step 2: pipeline = preprocessing followed by KMeans with 3 clusters
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('kmeans', KMeans(n_clusters=3, random_state=42)),
    ]
)

# Step 3: fit preprocessing and clustering on the copied data
pipeline.fit(df_1)

# Step 4: store the label assigned to each row in 'Risk Cluster'
fitted_kmeans = pipeline.named_steps['kmeans']
df_1['Risk Cluster'] = fitted_kmeans.labels_

# Step 5: number of rows per cluster, then a preview of the labelled data
print(df_1['Risk Cluster'].value_counts())
print(df_1.head())

# Step 6: cluster summary - the features are categorical, so report the size
# of each cluster rather than feature means
cluster_summary = df_1.groupby('Risk Cluster').size()

print("Cluster Summary (Cluster Size):")
cluster_summary

# Logistic Regression: Predict the probability of filing a claim (i.e., whether a claim is made or not)

## Goal: Understand which characteristics increase or decrease the likelihood of a policyholder filing a claim (frequency)

In [None]:
# Binary target: 1 when the policy has at least one claim, 0 otherwise
claims_data['Claim_binary_variable'] = (claims_data['ClaimNb'] > 0).astype('int64')

# Replace the categorical columns by 0/1 dummy columns (first level dropped).
# NOTE: this rebinds claims_data, so the raw 'Area', 'VehBrand', 'VehGas' and
# 'Region' columns are no longer available afterwards.
claims_data = pd.get_dummies(
    claims_data,
    columns=['Area', 'VehBrand', 'VehGas', 'Region'],
    drop_first=True,
    dtype=int,
)

# Numeric predictors (IDpol and ClaimAmount are left out)
independent_vars = ['Exposure', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']

# Append the dummy columns for area, vehicle brand, fuel type and region
dummy_prefixes = ('Area_', 'VehBrand_', 'VehGas_', 'Region_')
independent_vars += [col for col in claims_data.columns if col.startswith(dummy_prefixes)]

# Design matrix with an intercept, and the binary response
X = sm.add_constant(claims_data[independent_vars])
y = claims_data['Claim_binary_variable']

# Fit the logistic regression and show the coefficient table
logit_model = sm.Logit(y, X).fit()

print(logit_model.summary())

In [None]:
# Severity model: log-normal regression of the claim amount.

# Keep policyholders who made a claim with a strictly positive amount
# (log(0) would be -inf), and copy so that adding a column below does not
# write into a view of claims_data.
has_positive_claim = (claims_data['ClaimNb'] > 0) & (claims_data['ClaimAmount'] > 0)
claims_with_claims = claims_data[has_positive_claim].copy()

# Log-transform the dependent variable (ClaimAmount)
claims_with_claims['log_ClaimAmount'] = np.log(claims_with_claims['ClaimAmount'])

# Define the independent variables (same as used for frequency)
independent_vars_severity = ['Exposure', 'VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
independent_vars_severity += [
    col for col in claims_with_claims.columns
    if col.startswith(('Area_', 'VehBrand_', 'VehGas_', 'Region_'))
]

# Add constant term
X_severity = sm.add_constant(claims_with_claims[independent_vars_severity])

# Dependent variable (log-transformed ClaimAmount)
y_severity = claims_with_claims['log_ClaimAmount']

# A log-normal severity model is a Gaussian model with identity link on
# log(ClaimAmount). The response is already on the log scale, so no log link
# is applied on top of it (that would model log(E[log(ClaimAmount)])).
glm_severity = sm.GLM(y_severity, X_severity, family=sm.families.Gaussian()).fit()

# View the summary of the severity model
print(glm_severity.summary())

#### Gaussian Mixture model for risk classification

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt
import seaborn as sns

# Categorical and numerical features used to look for latent risk classes
# (this is unsupervised: there is no target variable).
categorical_features = ['Area', 'VehGas']
numerical_features = ['VehAge', 'DrivAge', 'BonusMalus', 'VehPower']

# Observable policy characteristics fed to the mixture model.
# NOTE(review): this needs the raw 'Area' and 'VehGas' columns, which the
# logistic-regression cell replaces with dummy columns - confirm the intended
# cell order. It also rebinds the names X and preprocessor used by other cells.
X = claims_data[numerical_features + categorical_features]

# Preprocessing: standardise the numerical features and one-hot encode the
# categorical ones. sparse_threshold=0 forces a dense result, because
# GaussianMixture does not accept sparse input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
    sparse_threshold=0,
)

# Apply the transformations to X
X_preprocessed = preprocessor.fit_transform(X)

# Fit a Gaussian mixture with 3 components to identify latent risk classes
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(X_preprocessed)

# Store the predicted component of each row in the 'RiskClass' column
claims_data['RiskClass'] = gmm.predict(X_preprocessed)

# Visualise the predicted risk classes on the numerical features
sns.pairplot(claims_data, hue='RiskClass', vars=['VehAge', 'DrivAge', 'BonusMalus', 'VehPower'])
plt.show()

claims_data[['VehAge', 'DrivAge', 'BonusMalus', 'VehPower', 'Area', 'VehGas', 'RiskClass']]

In [None]:
# Refresh the GMM cluster labels on the original data (same assignment as in
# the cell that fits the mixture)
claims_data['RiskClass'] = gmm.predict(X_preprocessed)

# Keep numeric columns only, without the grouping key itself: its per-cluster
# statistics are constant and would only clutter the table
numeric_columns = claims_data.select_dtypes(include=[np.number]).columns.drop('RiskClass')

# Descriptive statistics of every numeric column within each cluster
cluster_summary = claims_data.groupby('RiskClass')[numeric_columns].describe()

cluster_summary

## TODO: add a table to the report showing the percentage of each characteristic within each cluster.

*Fréquences des caractéristiques dans chaque cluster (part en % de chaque caractéristique)*

In [None]:
# Function to calculate percentage of categorical variables in each cluster
def calculate_cluster_percentages(df, cluster_num, columns=None, cluster_col='Risk Cluster'):
    """Return the share (in %) of each category within one cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the cluster label column and the categorical columns.
    cluster_num : scalar
        Label of the cluster to describe.
    columns : iterable of str, optional
        Categorical columns to summarise. Defaults to the module-level
        ``categorical_columns`` list, looked up when the function is called.
    cluster_col : str, default 'Risk Cluster'
        Name of the column holding the cluster labels.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``columns``, indexed by the union of the
        category values seen in the cluster. A cell is NaN when that value
        does not occur in the corresponding column.
    """
    if columns is None:
        columns = categorical_columns

    cluster_data = df[df[cluster_col] == cluster_num]
    percentages = {
        col: cluster_data[col].value_counts(normalize=True) * 100
        for col in columns
    }
    return pd.DataFrame(percentages)

# Calculate percentages for each cluster.
# df_1 is used because it is the frame that received the 'Risk Cluster'
# labels from the KMeans pipeline.
cluster_0_percentages = calculate_cluster_percentages(df_1, 0)
cluster_1_percentages = calculate_cluster_percentages(df_1, 1)
cluster_2_percentages = calculate_cluster_percentages(df_1, 2)

# Combine the results into a single DataFrame, one column group per cluster
combined_df = pd.concat(
    [cluster_0_percentages, cluster_1_percentages, cluster_2_percentages],
    axis=1,
    keys=['Cluster 0', 'Cluster 1', 'Cluster 2'],
)

# NaN cells are kept as they are; apply combined_df.fillna(0) to display 0
# instead.

combined_df