### Project

In [31]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.metrics import (
    silhouette_score, calinski_harabasz_score, davies_bouldin_score,
    confusion_matrix, roc_auc_score,average_precision_score,
    precision_score, recall_score, f1_score
)

In [5]:
df=pd.read_csv("E:\\Fraud Detection\\data\\train_data\\healthcare_fraud.csv")
df.head()

Unnamed: 0,claim_id,claim_amount,claim_type,member_age,chronic_conditions_count,length_of_stay_days,num_procedures,procedure_category,provider_specialty,days_since_policy_start,weekend_claim_flag,multiple_claims_same_day,amount_per_day_of_stay,cost_per_procedure,high_amount_flag,high_cost_per_procedure,rushed_claim,is_fraud
0,CLM100000,2446.03,outpatient,28,0,0,1,consultation,general,705,0,0,2446.03,2446.03,0,0,0,0
1,CLM100001,57383.85,hospitalization,66,3,11,10,surgery,cardiology,33,0,1,5216.71,5738.38,1,1,0,1
2,CLM100002,9185.75,dental,53,2,5,2,surgery,general,449,1,0,1837.15,4592.88,0,0,0,0
3,CLM100003,7140.03,outpatient,52,1,3,2,consultation,orthopedics,497,0,0,2380.01,3570.02,0,0,0,0
4,CLM100004,2277.99,outpatient,37,0,0,1,consultation,general,604,0,0,2277.99,2277.99,0,0,0,0


In [10]:
id_cols = ['claim_id']
label_cols = ['is_fraud']
exclude_for_training = id_cols + label_cols

feature_df = df.drop(columns=exclude_for_training)

print(f"Features for modeling: {feature_df.shape[1]}")

Features for modeling: 16


In [14]:
categorical_cols = ['claim_type', 'procedure_category', 'provider_specialty']
numerical_cols = [col for col in feature_df.columns if col not in categorical_cols]
print(f"Categorical features length: {len(categorical_cols)}")
print(f"Numerical features length: {len(numerical_cols)}")

Categorical features length: 3
Numerical features length: 13


In [15]:

feature_df_encoded = feature_df.copy()
for col in categorical_cols:
    le = LabelEncoder()
    feature_df_encoded[col] = le.fit_transform(feature_df[col].astype(str))
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_df_encoded)
print(f"Final feature matrix: {X_scaled.shape}")

Final feature matrix: (2000, 16)


In [16]:
y_true = df['is_fraud'].values
print(f"Fraud: {y_true.sum()} ({y_true.mean():.2%})") # type: ignore
print(f"Normal: {(1-y_true).sum()} ({(1-y_true).mean():.2%})") # type: ignore

Fraud: 241 (12.05%)
Normal: 1759 (87.95%)


In [17]:

# PCA for modeling
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)
print(f"Reduced from {X_scaled.shape[1]} to {X_pca.shape[1]} components")
print(f"Explained variance: {pca.explained_variance_ratio_.sum():.2%}")

# PCA for 2D visualization
pca_2d = PCA(n_components=2, random_state=42)
X_pca_2d = pca_2d.fit_transform(X_scaled)
print(f"2D PCA explained variance: {pca_2d.explained_variance_ratio_.sum():.2%}")

tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=1000)
X_tsne = tsne.fit_transform(X_scaled)

Reduced from 16 to 10 components
Explained variance: 96.36%
2D PCA explained variance: 62.44%


In [18]:
results = {}

In [20]:
best_k = None
best_silhouette = -1

for k in range(2, 11):
    kmeans_temp = KMeans(
        n_clusters=k,
        init="k-means++",
        n_init=10,
        random_state=42
    )
    labels_temp = kmeans_temp.fit_predict(X_pca)
    sil_score = silhouette_score(X_pca, labels_temp)
    if sil_score > best_silhouette:
        best_silhouette = sil_score
        best_k = k

kmeans = KMeans(
    n_clusters=best_k,
    init="k-means++",
    n_init=10,
    random_state=42
)
kmeans_labels = kmeans.fit_predict(X_pca)

kmeans_silhouette = silhouette_score(X_pca, kmeans_labels)
kmeans_calinski = calinski_harabasz_score(X_pca, kmeans_labels)
kmeans_davies = davies_bouldin_score(X_pca, kmeans_labels)

results["kmeans"] = {
    "labels": kmeans_labels,
    "silhouette": kmeans_silhouette,
    "calinski": kmeans_calinski,
    "davies": kmeans_davies,
    "n_clusters": best_k,
}


In [21]:

agglo = AgglomerativeClustering(
    n_clusters=best_k,
    linkage="ward"
)
agglo_labels = agglo.fit_predict(X_pca)

agglo_silhouette = silhouette_score(X_pca, agglo_labels)
agglo_calinski = calinski_harabasz_score(X_pca, agglo_labels)
agglo_davies = davies_bouldin_score(X_pca, agglo_labels)

results["agglomerative"] = {
    "labels": agglo_labels,
    "silhouette": agglo_silhouette,
    "calinski": agglo_calinski,
    "davies": agglo_davies,
    "n_clusters": best_k,
}

In [22]:

dbscan = DBSCAN(eps=3.0, min_samples=10)
dbscan_labels = dbscan.fit_predict(X_pca)

n_clusters_dbscan = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
n_noise = list(dbscan_labels).count(-1)

results["dbscan"] = {
    "labels": dbscan_labels,
    "n_clusters": n_clusters_dbscan,
    "n_noise": n_noise,
}


In [28]:
contamination_rate = y_true.mean() # type: ignore
print(f"Contamination rate: {contamination_rate:.4f}")

# Isolation Forest
print("1. Isolation Forest")

iso_forest = IsolationForest(contamination=contamination_rate, random_state=42, 
                             n_estimators=100, max_samples='auto')
iso_predictions = iso_forest.fit_predict(X_pca)
iso_scores = iso_forest.score_samples(X_pca)
iso_predictions_binary = (iso_predictions == -1).astype(int)

print(f"Anomalies: {iso_predictions_binary.sum()} ({iso_predictions_binary.mean():.2%})")
print(f"Score range: [{iso_scores.min():.4f}, {iso_scores.max():.4f}]")

results['isolation_forest'] = {
    'predictions': iso_predictions_binary, 'scores': iso_scores,
    'n_anomalies': iso_predictions_binary.sum()
}

# Local Outlier Factor
print("\n2. Local Outlier Factor (LOF)")

lof = LocalOutlierFactor(contamination=contamination_rate, n_neighbors=20, novelty=False)
lof_predictions = lof.fit_predict(X_pca)
lof_scores = lof.negative_outlier_factor_
lof_predictions_binary = (lof_predictions == -1).astype(int)

print(f"Anomalies: {lof_predictions_binary.sum()} ({lof_predictions_binary.mean():.2%})")
print(f"Score range: [{lof_scores.min():.4f}, {lof_scores.max():.4f}]")

results['lof'] = {
    'predictions': lof_predictions_binary, 'scores': lof_scores,
    'n_anomalies': lof_predictions_binary.sum()
}

Contamination rate: 0.1205
1. Isolation Forest
Anomalies: 241 (12.05%)
Score range: [-0.6388, -0.3522]

2. Local Outlier Factor (LOF)
Anomalies: 241 (12.05%)
Score range: [-2.9066, -0.9365]


In [29]:
evaluation_results = {}

def evaluate_clustering_for_fraud(cluster_labels, y_true):
    """Find cluster with highest fraud rate and use as anomaly cluster"""
    unique_clusters = [c for c in np.unique(cluster_labels) if c != -1]
    if len(unique_clusters) == 0:
        return None

    fraud_rates = {}
    for cluster in unique_clusters:
        mask = cluster_labels == cluster
        fraud_rates[cluster] = y_true[mask].mean()

    anomaly_cluster = max(fraud_rates, key=fraud_rates.get) # type: ignore
    predictions = (cluster_labels == anomaly_cluster).astype(int)
    return predictions

In [30]:
# Clustering models evaluation
for model_name in ['kmeans', 'agglomerative', 'dbscan']:
    cluster_labels = results[model_name]['labels']
    predictions = evaluate_clustering_for_fraud(cluster_labels, y_true)
    
    if predictions is not None:
        precision = precision_score(y_true, predictions)
        recall = recall_score(y_true, predictions)
        f1 = f1_score(y_true, predictions)
        cm = confusion_matrix(y_true, predictions)
        
        evaluation_results[model_name] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'confusion_matrix': cm
        }

# Anomaly detection models evaluation
for model_name in ['isolation_forest', 'lof']:
    predictions = results[model_name]['predictions']
    scores = results[model_name]['scores']
    
    precision = precision_score(y_true, predictions)
    recall = recall_score(y_true, predictions)
    f1 = f1_score(y_true, predictions)
    cm = confusion_matrix(y_true, predictions)
    
    try:
        roc_auc = roc_auc_score(y_true, -scores if model_name == 'lof' else scores)
    except:
        roc_auc = None
    
    evaluation_results[model_name] = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': cm
    }

# Summary table
summary_data = []
for model_name, metrics in evaluation_results.items():
    summary_data.append({
        'Model': model_name.upper().replace('_', ' '),
        'Precision': f"{metrics['precision']:.4f}",
        'Recall': f"{metrics['recall']:.4f}",
        'F1-Score': f"{metrics['f1']:.4f}",
        'ROC-AUC': f"{metrics['roc_auc']:.4f}" if metrics.get('roc_auc') else 'N/A'
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))


           Model Precision Recall F1-Score ROC-AUC
          KMEANS    0.4460 0.9253   0.6019     N/A
   AGGLOMERATIVE    0.4451 0.9253   0.6011     N/A
          DBSCAN    1.0000 0.0664   0.1245     N/A
ISOLATION FOREST    0.7344 0.7344   0.7344  0.0717
             LOF    0.0913 0.0913   0.0913  0.4421


In [34]:

output_dir = "E:\\Fraud Detection\\data\\output_data"
os.makedirs(output_dir, exist_ok=True)

sns.set_style("whitegrid")

In [35]:
# Figure 1: PCA Visualizations
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

axes[0, 0].scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=y_true, cmap='RdYlGn_r', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[0, 0].set_title('True Fraud Labels', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('PC1')
axes[0, 0].set_ylabel('PC2')

axes[0, 1].scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=results['kmeans']['labels'], cmap='viridis', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[0, 1].set_title('K-Means Clusters', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('PC1')
axes[0, 1].set_ylabel('PC2')

axes[0, 2].scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=results['agglomerative']['labels'], cmap='plasma', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[0, 2].set_title('Agglomerative Clusters', fontsize=12, fontweight='bold')
axes[0, 2].set_xlabel('PC1')
axes[0, 2].set_ylabel('PC2')

axes[1, 0].scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=results['dbscan']['labels'], cmap='tab10', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[1, 0].set_title('DBSCAN Clusters', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('PC1')
axes[1, 0].set_ylabel('PC2')

axes[1, 1].scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=results['isolation_forest']['predictions'], cmap='coolwarm', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[1, 1].set_title('Isolation Forest', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('PC1')
axes[1, 1].set_ylabel('PC2')

axes[1, 2].scatter(X_pca_2d[:, 0], X_pca_2d[:, 1], c=results['lof']['predictions'], cmap='coolwarm', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[1, 2].set_title('LOF', fontsize=12, fontweight='bold')
axes[1, 2].set_xlabel('PC1')
axes[1, 2].set_ylabel('PC2')

plt.tight_layout()
plt.savefig(f'{output_dir}/plot1_pca_visualizations.png', dpi=300, bbox_inches='tight')
plt.close()


In [36]:
# Figure 2: t-SNE Visualizations
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

axes[0].scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_true, cmap='RdYlGn_r', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[0].set_title('t-SNE: True Labels', fontsize=12, fontweight='bold')

axes[1].scatter(X_tsne[:, 0], X_tsne[:, 1], c=results['kmeans']['labels'], cmap='viridis', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[1].set_title('t-SNE: K-Means', fontsize=12, fontweight='bold')

axes[2].scatter(X_tsne[:, 0], X_tsne[:, 1], c=results['isolation_forest']['predictions'], cmap='coolwarm', alpha=0.6, s=30, edgecolors='black', linewidth=0.3)
axes[2].set_title('t-SNE: Isolation Forest', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig(f'{output_dir}/plot2_tsne_visualizations.png', dpi=300, bbox_inches='tight')
plt.close()

In [37]:
# Figure 3: Anomaly Score Distributions
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

iso_scores = results['isolation_forest']['scores']
lof_scores = results['lof']['scores']

axes[0].hist(iso_scores[y_true==0], bins=50, alpha=0.6, label='Normal', color='green')
axes[0].hist(iso_scores[y_true==1], bins=50, alpha=0.6, label='Fraud', color='red')
axes[0].set_title('Isolation Forest Scores', fontweight='bold')
axes[0].legend()
axes[0].grid(alpha=0.3)

axes[1].hist(lof_scores[y_true==0], bins=50, alpha=0.6, label='Normal', color='green')
axes[1].hist(lof_scores[y_true==1], bins=50, alpha=0.6, label='Fraud', color='red')
axes[1].set_title('LOF Scores', fontweight='bold')
axes[1].legend()
axes[1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(f'{output_dir}/plot3_anomaly_scores.png', dpi=300, bbox_inches='tight')
plt.close()

In [38]:
# Figure 4: Model Performance Comparison
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

models = list(evaluation_results.keys())
metrics_data = {
    'Precision': [evaluation_results[m]['precision'] for m in models],
    'Recall': [evaluation_results[m]['recall'] for m in models],
    'F1-Score': [evaluation_results[m]['f1'] for m in models]
}

x = np.arange(len(models))
width = 0.25

for i, (metric, values) in enumerate(metrics_data.items()):
    axes[0].bar(x + i*width, values, width, label=metric)

axes[0].set_xlabel('Model')
axes[0].set_ylabel('Score')
axes[0].set_title('Model Performance Comparison', fontweight='bold')
axes[0].set_xticks(x + width)
axes[0].set_xticklabels([m.upper().replace('_', ' ') for m in models], rotation=45, ha='right')
axes[0].legend()
axes[0].grid(alpha=0.3, axis='y')

perf_matrix = pd.DataFrame(metrics_data, index=[m.upper().replace('_', ' ') for m in models])
sns.heatmap(perf_matrix, annot=True, fmt='.3f', cmap='YlGnBu', ax=axes[1])
axes[1].set_title('Performance Heatmap', fontweight='bold')

plt.tight_layout()
plt.savefig(f'{output_dir}/plot4_performance.png', dpi=300, bbox_inches='tight')
plt.close()

In [39]:
# Figure 5: Confusion Matrices
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for idx, (model_name, metrics) in enumerate(evaluation_results.items()):
    cm = metrics['confusion_matrix']
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[idx],
                xticklabels=['Normal', 'Fraud'], yticklabels=['Normal', 'Fraud'])
    axes[idx].set_title(f'{model_name.upper().replace("_", " ")}', fontweight='bold')
    axes[idx].set_xlabel('Predicted')
    axes[idx].set_ylabel('True')

for i in range(len(evaluation_results), 6):
    axes[i].axis('off')

plt.tight_layout()
plt.savefig(f'{output_dir}/plot5_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.close()

In [40]:

# Figure 6: Cluster Analysis
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

cluster_data = []
for model_name in ['kmeans', 'agglomerative', 'dbscan']:
    labels = results[model_name]['labels']
    for label in np.unique(labels):
        if label != -1:
            mask = labels == label
            cluster_data.append({
                'Model': model_name.upper(),
                'Cluster': f"{model_name.upper()}-C{label}",
                'Size': mask.sum(),
                'Fraud_Count': y_true[mask].sum(),
                'Fraud_Rate': y_true[mask].mean() * 100,
                'Avg_Amount': df.loc[mask, 'claim_amount'].mean()
            })

cluster_df = pd.DataFrame(cluster_data)

cluster_df.plot(x='Cluster', y='Fraud_Rate', kind='bar', ax=axes[0,0], legend=False, color='red', alpha=0.7)
axes[0,0].set_title('Fraud Rate by Cluster (%)', fontweight='bold')
axes[0,0].set_ylabel('Fraud Rate (%)')
axes[0,0].grid(alpha=0.3, axis='y')

cluster_df.plot(x='Cluster', y='Size', kind='bar', ax=axes[0,1], legend=False, color='blue', alpha=0.7)
axes[0,1].set_title('Cluster Size', fontweight='bold')
axes[0,1].set_ylabel('Number of Claims')
axes[0,1].grid(alpha=0.3, axis='y')

cluster_df.plot(x='Cluster', y='Avg_Amount', kind='bar', ax=axes[1,0], legend=False, color='green', alpha=0.7)
axes[1,0].set_title('Average Claim Amount by Cluster', fontweight='bold')
axes[1,0].set_ylabel('Average Amount ($)')
axes[1,0].grid(alpha=0.3, axis='y')

cluster_df.plot(x='Cluster', y='Fraud_Count', kind='bar', ax=axes[1,1], legend=False, color='orange', alpha=0.7)
axes[1,1].set_title('Fraud Count by Cluster', fontweight='bold')
axes[1,1].set_ylabel('Number of Frauds')
axes[1,1].grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(f'{output_dir}/plot6_cluster_analysis.png', dpi=300, bbox_inches='tight')
plt.close()


In [41]:
# Extract anomaly scores
iso_scores = results['isolation_forest']['scores']
lof_scores = results['lof']['scores']

# Get top 50 most suspicious claims (lowest scores = most anomalous)
top50_iso_idx = np.argsort(iso_scores)[:50]
top50_lof_idx = np.argsort(lof_scores)[:50]

# Create report for Isolation Forest top 50
iso_report = df.iloc[top50_iso_idx].copy()
iso_report['anomaly_score'] = iso_scores[top50_iso_idx]
iso_report['rank'] = range(1, 51)
iso_report = iso_report[['rank', 'claim_id', 'claim_amount', 'claim_type', 'member_age', 
                         'num_procedures', 'length_of_stay_days', 'days_since_policy_start',
                         'multiple_claims_same_day', 'anomaly_score', 'is_fraud']]

iso_report.to_csv(f'{output_dir}/top50_isolation_forest.csv', index=False)

# Create report for LOF top 50
lof_report = df.iloc[top50_lof_idx].copy()
lof_report['anomaly_score'] = lof_scores[top50_lof_idx]
lof_report['rank'] = range(1, 51)
lof_report = lof_report[['rank', 'claim_id', 'claim_amount', 'claim_type', 'member_age',
                         'num_procedures', 'length_of_stay_days', 'days_since_policy_start',
                         'multiple_claims_same_day', 'anomaly_score', 'is_fraud']]

lof_report.to_csv(f'{output_dir}/top50_lof.csv', index=False)

print(f"Top 50 anomalies saved:")
print(f"  - {output_dir}/top50_isolation_forest.csv")
print(f"  - {output_dir}/top50_lof.csv")


Top 50 anomalies saved:
  - E:\Fraud Detection\data\output_data/top50_isolation_forest.csv
  - E:\Fraud Detection\data\output_data/top50_lof.csv


In [43]:
import pickle
import os

output_dir = "E:\\Fraud Detection\\models"
os.makedirs(output_dir, exist_ok=True)

# Save scaler
with open(f'{output_dir}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save PCA
with open(f'{output_dir}/pca.pkl', 'wb') as f:
    pickle.dump(pca, f)

# Save label encoders
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(df[col].astype(str))
    label_encoders[col] = le

with open(f'{output_dir}/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

# Get the Isolation Forest model from your results dict
iso_model = IsolationForest(contamination=0.12, random_state=42, n_estimators=100)
iso_model.fit(X_pca)

with open(f'{output_dir}/isolation_forest.pkl', 'wb') as f:
    pickle.dump(iso_model, f)

# For LOF, need to use novelty=True for new predictions
lof_for_pred = LocalOutlierFactor(contamination=0.12, novelty=True)
lof_for_pred.fit(X_pca)

with open(f'{output_dir}/lof.pkl', 'wb') as f:
    pickle.dump(lof_for_pred, f)


print("Models saved successfully!")


Models saved successfully!
