In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
import pickle

# Import your Preprocessor class
from data_preprocessing import preprocessing

# === CONFIGURATION ===
# CSV file the clustering model is trained on; point this at your real training file.
TRAINING_CSV = "insuranceFraud.csv"

# Root folder for saved models; the KMeans artifacts go in a "KMeans" sub-folder.
MODEL_DIR = "models"
KMEANS_DIR = os.path.join(MODEL_DIR, "KMeans")

# Number of clusters requested from KMeans; adjust as needed.
N_CLUSTERS = 3

# === STEP 1: Load and Preprocess Data ===
print("Loading data...")
data = pd.read_csv(TRAINING_CSV)

# Columns excluded before clustering: the 'fraud_reported' label plus fields
# the original author marked as unnecessary (noted as matching what the
# prediction code drops -- TODO confirm against that code).
drop_cols = [
    'fraud_reported',
    'policy_number',
    'policy_bind_date',
    'policy_state',
    'insured_zip',
    'incident_location',
    'incident_date',
    'incident_state',
    'incident_city',
    'insured_hobbies',
    'auto_make',
    'auto_model',
    'auto_year',
    'age',
    'total_claim_amount',
]
# errors='ignore' skips any listed column that is absent from the CSV.
data = data.drop(columns=drop_cols, errors='ignore')

# Treat the '?' placeholder as a missing value.
data = data.replace('?', np.nan)

# Project preprocessor; the two constructor arguments are passed as None here
# (their meaning is defined in data_preprocessing.preprocessing).
preprocessor = preprocessing.Preprocessor(None, None)

# Impute only when the preprocessor reports missing values.
is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)
if is_null_present:
    data = preprocessor.impute_missing_values(data, cols_with_missing_values)

# Give each distinct policy_csl value an integer code starting at 1, in order
# of first appearance in the data.
if 'policy_csl' in data.columns:
    csl_map = {
        level: code
        for code, level in enumerate(data['policy_csl'].unique(), start=1)
    }
    print("policy_csl mapping:", csl_map)
    data['policy_csl'] = data['policy_csl'].map(csl_map)

# Encode all categorical columns via the project preprocessor.
data = preprocessor.encode_categorical_columns(data)

# Remove every column that still holds a NaN, then any incomplete row (none
# can remain after the column pass), and renumber the index from 0.
data = data.dropna(axis=1).dropna().reset_index(drop=True)

# Scale numerical columns via the project preprocessor.
data = preprocessor.scale_numerical_columns(data)

print("Final data shape for KMeans:", data.shape)
print("Sample data for KMeans:\n", data.head())

# === STEP 2: Train KMeans ===
print("Training KMeans...")
# n_init is set explicitly rather than left to the library default: that
# default changed from 10 to 'auto' in scikit-learn 1.4 (with a FutureWarning
# in 1.2-1.3), so an unpinned value trains a different model depending on the
# installed version. random_state keeps the centroid initialisation repeatable.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=42)
kmeans.fit(data)

# === STEP 3: Save the KMeans Model ===
# Destination file for the fitted estimator.
kmeans_path = os.path.join(KMEANS_DIR, 'KMeans.sav')

# Make sure the KMeans folder exists; an existing folder is left as it is.
os.makedirs(KMEANS_DIR, exist_ok=True)

# Serialise the fitted KMeans object with pickle.
with open(kmeans_path, mode='wb') as model_file:
    pickle.dump(kmeans, model_file)

print(f"KMeans model saved to {kmeans_path}")

# === STEP 4: Save cluster-specific models ===
# Each cluster index gets its own folder, cluster_<n>, holding a model.sav.
# Note: the object written is the same fitted KMeans instance every time,
# not a model trained separately per cluster.
for cluster in range(N_CLUSTERS):
    cluster_dir = os.path.join(KMEANS_DIR, f'cluster_{cluster}')
    cluster_path = os.path.join(cluster_dir, 'model.sav')

    # Create the per-cluster folder if it is not already present.
    os.makedirs(cluster_dir, exist_ok=True)

    with open(cluster_path, mode='wb') as model_file:
        pickle.dump(kmeans, model_file)

    print(f"Saved model for cluster {cluster} at {cluster_path}")

DEBUG: cols_with_missing_values = ['collision_type', 'authorities_contacted', 'property_damage', 'police_report_available']
DEBUG: data columns = Index(['months_as_customer', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_relationship',
       'capital-gains', 'capital-loss', 'incident_type', 'collision_type',
       'incident_severity', 'authorities_contacted',
       'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'property_damage', 'bodily_injuries', 'witnesses',
       'police_report_available', 'injury_claim', 'property_claim',
       'vehicle_claim'],
      dtype='object')
DEBUG: data shape = (1000, 24)
DEBUG: imputing column collision_type type: <class 'str'>
DEBUG: imputing column authorities_contacted type: <class 'str'>
DEBUG: imputing column property_damage type: <class 'str'>
DEBUG: imputing column police_report_available type: <class 'str'>

  data[column] = data[column].replace({'Y': 1, 'N': 0, 'YES': 1, 'NO': 0, 'Yes': 1, 'No': 0, 'yes': 1, 'no': 0})
