In [32]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [26]:
# Load the insurance CSV into `df` and print its first rows as a quick check.
csv_path = "Data/Final_insurance_data.csv"
df = pd.read_csv(csv_path)
print(df.head())

   claim_id policy_start_date  claim_date       city policy_type  \
0  CL211801        2023-01-12  2024-06-24      Delhi      Travel   
1  CL196074        2024-10-01  2025-09-20  Ahmedabad      Travel   
2  CL280133        2023-08-22  2025-08-12  Ahmedabad      Travel   
3  CL019432        2024-09-17  2024-10-01    Kolkata        Home   
4  CL232246        2024-02-19  2024-10-01    Chennai        Home   

  sales_channel vehicle_type  customer_age  annual_premium  claim_amount  \
0       Partner      Unknown          36.0         17228.0       53497.0   
1         Agent      Unknown          47.0          8940.0       41607.0   
2           App      Unknown          18.0         10861.0       58176.0   
3         Agent      Unknown          36.0          3091.0       43317.0   
4         Agent      Unknown          38.0          6660.0       79835.0   

   past_claims_count  days_since_policy_start  documents_submitted  is_fraud  \
0                0.0                    239.0         

In [27]:
# Columns removed outright before modelling.
dropped_cols = ["claim_id", "policy_start_date", "claim_date"]
# Categorical columns that get expanded into indicator columns.
categorical_cols = ["city", "policy_type", "sales_channel", "vehicle_type"]

# One-hot encode the categoricals; drop_first=True omits the first category of
# each column, leaving k-1 indicator columns per k-level feature.
df = pd.get_dummies(
    df.drop(columns=dropped_cols),
    columns=categorical_cols,
    drop_first=True,
)

In [28]:
# Separate the label from the features. Reading the label first means a
# missing "is_fraud" column fails immediately with a KeyError instead of
# being silently tolerated by the drop.
y = df["is_fraud"]

# Cast the features to float rather than int: an int cast truncates any
# fractional value in the numeric columns and raises on NaN, whereas float
# keeps numeric columns intact and still turns the indicator columns
# produced by get_dummies into 0.0 / 1.0.
X = df.drop(columns=["is_fraud"]).astype(float)


In [33]:
# Standardise every feature column using statistics computed over all rows of X.
# NOTE(review): in the visible cells the train/test split is taken from X, not
# from X_scaled, so this scaled matrix does not appear to reach any model.
# It is also fitted on rows that are later held out for testing — confirm
# whether that is intended before using X_scaled for evaluation.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [29]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable from run to run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [34]:
# Train Logistic Regression.
# The scaler is part of the pipeline, so it is fitted on the training rows
# only and is re-applied automatically to anything passed to predict().
# liblinear's regularised fit is sensitive to feature scale, so the raw
# features (amounts alongside 0/1 indicators) are standardised first.
log_model = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=5000, solver="liblinear"),
)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
log_acc = accuracy_score(y_test, log_pred)

# Train Random Forest.
# Tree splits do not depend on feature scale, so the raw features are used.
# The seed keeps the forest — and therefore the comparison below —
# reproducible across runs.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_acc = accuracy_score(y_test, rf_pred)

# Select best model by test-set accuracy; a tie goes to Logistic Regression.
best_model_name = "Random Forest" if rf_acc > log_acc else "Logistic Regression"

print("Best model:", best_model_name)

Best model: Random Forest
