In [5]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Load dataset
df = pd.read_csv("health_insurance_dataset_updated.csv")

# Display first few rows to check data
print(df.head())

# Check for missing values
df.dropna(inplace=True)

# Rename columns to ensure consistency
df.columns = [col.strip() for col in df.columns]  # Trim spaces

# Define features and target variable
numerical_features = ["Age", "BMI", "No. of Hospital Visits", "Claim Amount Requested (₹)"]
categorical_features = ["Gender", "Smoke", "Health Problem"]

target_column = "Approval Status"

# Check if required columns exist
print("Columns in dataset:", df.columns.tolist())

if target_column not in df.columns:
    raise KeyError(f"Target column '{target_column}' not found in dataset. Please check column names: {df.columns.tolist()}")

# Ensure feature columns exist
missing_features = [col for col in numerical_features + categorical_features if col not in df.columns]
if missing_features:
    raise KeyError(f"Missing feature columns: {missing_features}. Please check dataset.")

# Assign features and target
y = df[target_column]
X = df[numerical_features + categorical_features]

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create RandomForestClassifier model pipeline
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Save model
with open("hel_claim_model.pkl", "wb") as file:
    pickle.dump(model, file)

print("✅ Model trained and saved successfully!")

   Age  Gender   BMI Smoke Health Problem  No. of Hospital Visits  \
0   69  Female  26.3    No   Hypertension                      12   
1   32  Female  34.9   Yes       Diabetes                       5   
2   89    Male  29.8    No   Hypertension                       9   
3   78    Male  17.4    No  Heart Disease                       5   
4   38    Male  28.7    No   Hypertension                       2   

   Claim Amount Requested (₹) Approval Status  
0                      178681        Rejected  
1                      337712        Rejected  
2                      147053        Rejected  
3                       42978        Approved  
4                      301237        Rejected  
Columns in dataset: ['Age', 'Gender', 'BMI', 'Smoke', 'Health Problem', 'No. of Hospital Visits', 'Claim Amount Requested (₹)', 'Approval Status']
✅ Model trained and saved successfully!
