In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the dataset from local storage (Colab only).
# `files` comes from the `google.colab` package, so this cell can only run where
# that package is importable.
# NOTE(review): the captured output below ("Saving insurance_claims.csv to
# insurance_claims (1).csv") suggests the file is written to the working
# directory under a de-duplicated name -- confirm before relying on a fixed name.
from google.colab import files
uploaded = files.upload()  # kept for the next cell, which reads the file name from its keys



Saving insurance_claims.csv to insurance_claims (1).csv


In [None]:
# Read the uploaded CSV into a DataFrame.
# The file name is taken from the upload result rather than hard-coded, so
# whichever file was chosen in the upload cell is the one that gets loaded.
if not uploaded:
    # An empty upload result would otherwise surface as a bare lookup error on
    # the next line, which gives the user no hint about what went wrong.
    raise ValueError("No file was uploaded. Re-run the upload cell and select a CSV file.")

filename = next(iter(uploaded))  # first uploaded file name
df = pd.read_csv(filename)


In [None]:
# Step 2: Display Basic Info About the Data
print("Dataset Overview:")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()
# Per-column count of missing values.
print("\nMissing Values Per Column:\n", df.isnull().sum())



Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 39 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   months_as_customer           1000 non-null   int64  
 1   age                          1000 non-null   int64  
 2   policy_number                1000 non-null   int64  
 3   policy_bind_date             1000 non-null   object 
 4   policy_state                 1000 non-null   object 
 5   policy_csl                   1000 non-null   object 
 6   policy_deductable            1000 non-null   int64  
 7   policy_annual_premium        1000 non-null   float64
 8   umbrella_limit               1000 non-null   int64  
 9   insured_zip                  1000 non-null   int64  
 10  insured_sex                  1000 non-null   object 
 11  insured_education_level      1000 non-null   object 
 12  insured_occupation           1000 non-null   object 
 13  i

In [None]:
# Handle Missing Values in 'fraud_reported' Column
# Encodes the target as 1/0 and fills any gaps with the most frequent class.
if "fraud_reported" in df.columns:
    print("\nBefore Handling Missing Values in fraud_reported:", df["fraud_reported"].isnull().sum())

    # Convert 'fraud_reported' from 'Y'/'N' to 1/0.
    # Only map while the column is still non-numeric: re-running this cell on an
    # already-encoded column would map every 0/1 to NaN and wipe out the target.
    if not pd.api.types.is_numeric_dtype(df["fraud_reported"]):
        df["fraud_reported"] = df["fraud_reported"].map({"Y": 1, "N": 0})

    # Fill missing fraud_reported values with the most frequent value (mode).
    # The result is assigned back to the column: calling fillna(inplace=True)
    # on df[col] operates on an intermediate object, raises a pandas warning
    # and stops updating df at all from pandas 3.0.
    most_frequent = df["fraud_reported"].mode()[0]
    df["fraud_reported"] = df["fraud_reported"].fillna(most_frequent)

    print("After Handling Missing Values in fraud_reported:", df["fraud_reported"].isnull().sum())




Before Handling Missing Values in fraud_reported: 0
After Handling Missing Values in fraud_reported: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["fraud_reported"].fillna(df["fraud_reported"].mode()[0], inplace=True)


In [None]:
# Remove columns treated as irrelevant for modelling.
# errors="ignore" makes drop() skip any listed column that is not present, so
# no explicit existence check is needed.
drop_columns = [
    "policy_number",
    "policy_bind_date",
    "incident_date",
    "incident_location",
    "insured_zip",
]
df = df.drop(columns=drop_columns, errors="ignore")


In [None]:
# Split the frame into the target vector (y) and the feature matrix (X).
# Fail fast when the target column is absent.
if "fraud_reported" not in df.columns:
    raise ValueError("Target column 'fraud_reported' not found in dataset!")

y = df["fraud_reported"]
X = df.drop(columns=["fraud_reported"])

In [None]:
# Handle Categorical Data Dynamically
# Every object-dtype column in X is one-hot encoded and replaced by its
# indicator columns; numeric columns are left untouched.
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

if categorical_cols:
    # handle_unknown="ignore": categories not seen during fit encode as all zeros.
    encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    encoded_cats = encoder.fit_transform(X[categorical_cols])
    encoded_cat_df = pd.DataFrame(
        encoded_cats,
        columns=encoder.get_feature_names_out(categorical_cols),
        # Reuse X's index: concat(axis=1) aligns on index labels, so a fresh
        # RangeIndex would misalign rows (and introduce NaNs) whenever X's
        # index is not the default 0..n-1.
        index=X.index,
    )

    # Drop original categorical columns and merge the encoded ones
    X = X.drop(columns=categorical_cols)
    X = pd.concat([X, encoded_cat_df], axis=1)

In [None]:
# Standardize the feature matrix.
# The scaler is fitted on all rows of X and then applied to those same rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Split Data into Training and Testing Sets
# The split is made on the unscaled features and the scaler is then fitted on
# the training rows only. Fitting the scaler on all rows before splitting lets
# the test rows influence the mean/variance used for preprocessing, which leaks
# test information into training and makes the reported metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # stratify keeps the class ratio in both splits
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/std from training data only
X_test = scaler.transform(X_test_raw)        # apply the training statistics to the test data

print("\nTraining Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)


Training Data Shape: (800, 161)
Testing Data Shape: (200, 161)


In [None]:
# Train and Evaluate SVM Model with Different Kernels
# For each kernel an SVC (C=1) is fitted on the training split and scored on
# the test split with accuracy, precision, recall and F1.
kernels = ["linear", "rbf", "poly", "sigmoid"]

for kernel in kernels:
    print(f"\n Training SVM with {kernel} kernel...\n")
    model = SVC(kernel=kernel, C=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0: when a model predicts no positive samples, precision
    # (and therefore F1) is undefined. Report it as 0 explicitly instead of
    # relying on the default, which yields the same value but emits an
    # UndefinedMetricWarning.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print(f"Kernel: {kernel}")
    print(f" Accuracy: {accuracy:.3f}")
    print(f" Precision: {precision:.3f}")
    print(f" Recall: {recall:.3f}")
    print(f" F1 Score: {f1:.3f}")
    print("-" * 50)

print("\n SVM Model Training & Evaluation Completed!")


 Training SVM with linear kernel...

Kernel: linear
 Accuracy: 0.785
 Precision: 0.575
 Recall: 0.469
 F1 Score: 0.517
--------------------------------------------------

 Training SVM with rbf kernel...

Kernel: rbf
 Accuracy: 0.765
 Precision: 0.545
 Recall: 0.245
 F1 Score: 0.338
--------------------------------------------------

 Training SVM with poly kernel...

Kernel: poly
 Accuracy: 0.755
 Precision: 0.000
 Recall: 0.000
 F1 Score: 0.000
--------------------------------------------------

 Training SVM with sigmoid kernel...

Kernel: sigmoid
 Accuracy: 0.780
 Precision: 0.581
 Recall: 0.367
 F1 Score: 0.450
--------------------------------------------------

 SVM Model Training & Evaluation Completed!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
