<a href="https://colab.research.google.com/github/Sruthi-Reddy-B/Insurance_claim_prediction/blob/main/notebooks/eda_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone repo
!git clone https://github.com/Sruthi-Reddy-B/Insurance_claim_prediction.git
%cd Insurance_claim_prediction

In [None]:
#if git repo not cloned
'''import pandas as pd

# Load CSV from GitHub directly
url = "https://raw.githubusercontent.com/Sruthi-Reddy-B/Insurance_claim_prediction/main/data/sample_claims.csv"

data = pd.read_csv(url)
data.head()'''

!ls

In [None]:
# Install necessary packages
#!pip install -q pandas numpy scikit-learn matplotlib seaborn joblib
!pip install -r requirements.txt -q


# Insurance Claim Prediction — End-to-End ML Pipeline

## 1. Load Data

In [13]:
import pandas as pd

data = pd.read_csv('./data/sample_claims.csv')
data.head()

Unnamed: 0,Age,Vehicle_Age,Vehicle_Damage,Policy_Sales_Channel,Vintage,Previously_Claimed,Claim_Approved
0,25,2,Yes,152,150,0,1
1,45,5,No,26,300,1,0
2,30,1,Yes,152,200,0,1
3,50,3,No,152,100,1,0
4,35,2,Yes,26,250,0,1


## 2. Data Preprocessing

In [14]:

# Convert categorical variables
data['Vehicle_Damage'] = data['Vehicle_Damage'].map({'Yes':1,'No':0})

# Split features and target
X = data.drop('Claim_Approved', axis=1)
y = data['Claim_Approved']

# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)



ValueError: The test_size = 1 should be greater or equal to the number of classes = 2

## 3. Train Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



## 4. Save Model


In [None]:
import joblib
import os

# Ensure src folder exists
os.makedirs('./src', exist_ok=True)

# Save model
joblib.dump(model, './src/claim_model.pkl')
print("Model saved as claim_model.pkl in ./src/")



## 5. Results Visualization

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix — Insurance Claim Prediction")
plt.show()

#6. Export results

In [None]:
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.title("Confusion Matrix — Insurance Claim Prediction")
plt.savefig('./results/confusion_matrix.png', bbox_inches='tight')
print("✅ Confusion matrix saved to ./results/confusion_matrix.png")