In [57]:
# Step 1: Imports
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib
from transformers import pipeline




In [60]:
# Load the insurance records (path is relative to the working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="insurance_data.csv")
df.head(n=5)

Unnamed: 0,customer_id,age,vehicle_type,location,claim_history,base_rate,risk_multiplier
0,C001,58,Car,Urban,2,5001,1.6
1,C002,32,Car,Rural,0,5885,1.0
2,C003,65,Car,Rural,3,4565,1.5
3,C004,19,Car,Urban,1,5534,1.3
4,C005,56,Car,Rural,1,5966,1.2


In [62]:
# Encode categorical columns
# One encoder per text column; fit_transform learns the label -> integer
# mapping and applies it in the same call.
le_vehicle, le_location = LabelEncoder(), LabelEncoder()

df = df.assign(
    vehicle_enc=le_vehicle.fit_transform(df['vehicle_type']),
    location_enc=le_location.fit_transform(df['location']),
)


In [63]:
# Target: bucket the continuous risk_multiplier into Low / Medium / High
# so the problem can be treated as a simple classification task.
def risk_category(x, low_max=1.2, medium_max=1.5):
    """Map a risk multiplier to a coarse risk category.

    Args:
        x: Risk multiplier to classify.
        low_max: Inclusive upper bound of the 'Low' bucket (default 1.2).
        medium_max: Inclusive upper bound of the 'Medium' bucket (default 1.5).

    Returns:
        'Low' when x <= low_max, 'Medium' when low_max < x <= medium_max,
        otherwise 'High'. A NaN input compares False against both bounds and
        therefore falls through to 'High'.

    Raises:
        ValueError: If low_max is greater than medium_max.
    """
    if low_max > medium_max:
        raise ValueError(
            f"low_max ({low_max}) must not exceed medium_max ({medium_max})"
        )

    # Guard clauses, checked from the lowest bucket upwards.
    if x <= low_max:
        return 'Low'
    if x <= medium_max:
        return 'Medium'
    return 'High'


In [64]:
# Derive the categorical target from the multiplier, then integer-encode it
# for the classifier.
le_risk = LabelEncoder()
df['risk_label'] = df['risk_multiplier'].apply(risk_category)
df['risk_enc'] = le_risk.fit_transform(df['risk_label'])

# Model inputs; full_quote_pipeline builds its single-row frame with these
# same columns.
feature_cols = ['age', 'vehicle_enc', 'location_enc', 'claim_history', 'base_rate']
X = df[feature_cols]
y = df['risk_enc']

In [65]:
# Step 3: Train Risk Model
# Hold out 20% of the rows; the fixed seed (42) is passed to both the split
# and the forest.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)


In [66]:
# Save model and encoders
# Writes the fitted classifier and the three fitted encoders as separate
# pickle files in the working directory.
joblib.dump(value=model, filename='risk_model.pkl')
joblib.dump(value=le_vehicle, filename='le_vehicle.pkl')
joblib.dump(value=le_location, filename='le_location.pkl')
joblib.dump(value=le_risk, filename='le_risk.pkl')

['le_risk.pkl']

In [73]:
# Using local HuggingFace model (no API key needed)
# The text-generation pipeline is built once at cell level and reused by
# llm_explain_local for every quote.
llm = pipeline(
    task="text-generation",
    model="tiiuae/falcon-7b-instruct",
    pad_token_id=0,
)
def llm_explain_local(customer, risk, premium):
    """Ask the local LLM for a customer-friendly explanation of a premium.

    Args:
        customer: Mapping providing 'age', 'vehicle_type', 'location' and
            'claim_history'; each value is interpolated into the prompt.
        risk: Predicted risk level, interpolated into the prompt.
        premium: Calculated premium, interpolated after the rupee sign.

    Returns:
        The text generated by the model, without the prompt echoed back and
        with surrounding whitespace stripped.

    Raises:
        KeyError: If `customer` lacks one of the keys listed above.
    """
    # Built line by line so the source code's indentation does not leak into
    # the text that is sent to the model.
    prompt = "\n".join([
        "Customer Profile:",
        f"Age: {customer['age']}",
        f"Vehicle: {customer['vehicle_type']}",
        f"Location: {customer['location']}",
        f"Claims History: {customer['claim_history']}",
        "",
        f"Predicted Risk Level: {risk}",
        f"Calculated Premium: ₹{premium}",
        "",
        "Explain in simple, customer-friendly terms why the premium is this amount.",
    ])

    # return_full_text=False: by default the text-generation pipeline returns
    # prompt + completion, which would repeat the whole customer profile in
    # the "explanation" shown to the user.
    outputs = llm(
        prompt,
        max_new_tokens=200,
        truncation=True,
        pad_token_id=0,
        return_full_text=False,
    )
    return outputs[0]['generated_text'].strip()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [70]:
# Step 5: Full Quote Pipeline
def full_quote_pipeline(customer):
    """Produce a risk prediction, a premium and an LLM explanation for one customer.

    Args:
        customer: Mapping providing 'age', 'vehicle_type', 'location',
            'claim_history', 'base_rate' and 'risk_multiplier'.

    Returns:
        Tuple (risk, premium, explanation):
            risk: Risk label decoded from the classifier's prediction.
            premium: base_rate * risk_multiplier * coverage factor, rounded
                to 2 decimal places.
            explanation: Text returned by llm_explain_local.

    Raises:
        KeyError: If `customer` lacks a required key, or if no coverage factor
            is configured for its vehicle type.

    NOTE(review): the premium is computed from the caller-supplied
    'risk_multiplier', not from the predicted risk level; the prediction only
    feeds the explanation. Confirm that this is intended.
    """
    vehicle_type = customer["vehicle_type"]

    # Encode features: a single-row frame with the same column names and order
    # as the training matrix X. The encoders are presumably expected to reject
    # categories they were not fitted on -- TODO confirm desired handling.
    input_df = pd.DataFrame([{
        "age": customer["age"],
        "vehicle_enc": le_vehicle.transform([vehicle_type])[0],
        "location_enc": le_location.transform([customer["location"]])[0],
        "claim_history": customer["claim_history"],
        "base_rate": customer["base_rate"]
    }])

    # Predict risk and decode the integer class back to its label
    risk_enc = model.predict(input_df)[0]
    risk = le_risk.inverse_transform([risk_enc])[0]

    # Calculate premium
    coverage_factor = {'Car': 1.2, 'Bike': 1.0}  # Adjust as needed
    if vehicle_type not in coverage_factor:
        # Same exception type as the plain dict lookup, but with a message
        # that says what is missing and what is supported.
        raise KeyError(
            f"No coverage factor configured for vehicle type {vehicle_type!r}; "
            f"supported types: {sorted(coverage_factor)}"
        )
    # Rounded to 2 decimals so float noise from the multiplication does not
    # end up in the quoted amount.
    premium = round(
        customer['base_rate'] * customer['risk_multiplier'] * coverage_factor[vehicle_type],
        2,
    )

    # LLM explanation
    explanation = llm_explain_local(customer, risk, premium)

    return risk, premium, explanation


In [None]:
# Example request: one hand-written customer pushed through the whole pipeline.
new_customer = dict(
    age=40,
    vehicle_type="Car",
    location="Urban",
    claim_history=1,
    base_rate=5500,
    risk_multiplier=1.4,
)

risk, premium, explanation = full_quote_pipeline(new_customer)

print(f"Predicted Risk: {risk}")
print(f"Final Premium: ₹ {premium}")
print(f"\nLLM Explanation:\n {explanation}")