# Assignment 14 — Anomaly Detection
Gaussian Mixture Model & Isolation Forest (Credit Card Fraud)

## Import Libraries

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler


## Load Dataset (Provide path to Kaggle file)

In [None]:

# Path to the Kaggle credit-card CSV; edit this if the file lives elsewhere.
DATA_PATH = "creditcard.csv"

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Only a *missing* file triggers the fallback. Any other failure (malformed
    # CSV, permission error, ...) is a real problem and must surface instead of
    # being silently replaced by synthetic data.
    print(f"'{DATA_PATH}' not found - falling back to a small synthetic dataset.")

    # Synthetic stand-in so the notebook still runs end to end:
    # 900 "normal" rows drawn from N(0, 1) and 30 "fraud" rows drawn from N(2, 1).
    np.random.seed(42)
    feature_cols = ['v1', 'v2', 'v3', 'amount']
    normal = pd.DataFrame(np.random.normal(0, 1, (900, 4)), columns=feature_cols)
    fraud = pd.DataFrame(np.random.normal(2, 1, (30, 4)), columns=feature_cols)
    normal['Class'] = 0
    fraud['Class'] = 1
    df = pd.concat([normal, fraud], ignore_index=True)

df.head()


## Inspect Class Distribution

In [None]:

# Number of rows per label value in the 'Class' column, most frequent first.
df.loc[:, 'Class'].value_counts()


## Visualize Imbalance

In [None]:

# Bar chart with one bar per 'Class' value, showing how many rows carry it.
ax = sns.countplot(data=df, x='Class')
ax.set_title("Fraud vs Normal Distribution")
plt.show()


## Select Features & Scale

In [None]:

# Separate the ground-truth labels from the feature columns.
y = df['Class']
X = df.drop(columns=['Class'])

# Standardise every feature column; the fitted scaler is kept for later reuse.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Peek at the first five scaled rows.
X_scaled[:5]


## Gaussian Mixture Model (GMM)

In [None]:

# Fit a two-component Gaussian mixture on the scaled features and assign each
# row to its most likely component.
gmm = GaussianMixture(n_components=2, random_state=42)
gmm.fit(X_scaled)

labels = gmm.predict(X_scaled)

# Identify the fraud cluster as the *minority* component.
# Averaging all standardised features into one number per cluster (the previous
# heuristic) only works when fraud is shifted upward on every feature; on real
# data that global mean is arbitrary and it is NaN for an empty cluster.
# Cluster size is a more robust signal because anomalies are rare by assumption.
cluster_sizes = np.bincount(labels, minlength=gmm.n_components)
fraud_cluster = int(np.argmin(cluster_sizes))

# 1 = predicted fraud, 0 = predicted normal (same encoding as `y`).
gmm_pred = (labels == fraud_cluster).astype(int)

print(classification_report(y, gmm_pred))


## Isolation Forest

In [None]:

# Fit an Isolation Forest on the scaled features.
# NOTE(review): contamination=0.03 is hard-coded - confirm it matches the
# expected fraud rate of whichever dataset is actually loaded.
iso = IsolationForest(contamination=0.03, random_state=42).fit(X_scaled)

# Despite the name, `iso_scores` holds the labels returned by predict():
# -1 for outliers and 1 for inliers.
iso_scores = iso.predict(X_scaled)

# Re-encode to the dataset's convention: 1 = fraud (outlier), 0 = normal.
iso_pred = np.where(iso_scores == -1, 1, 0)

print(classification_report(y, iso_pred))


## Confusion Matrices

In [None]:

# Print one confusion matrix per detector (rows: true class, columns: predicted).
for _cm_title, _cm_pred in (
    ("GMM Confusion Matrix", gmm_pred),
    ("\nIsolation Forest Confusion Matrix", iso_pred),
):
    print(_cm_title)
    print(confusion_matrix(y, _cm_pred))


## PCA Visualization (Optional)

In [None]:

# Project the scaled features onto their first two principal components so the
# predictions of both detectors can be compared in a 2-D scatter plot.
# random_state is fixed because PCA's default 'auto' solver may select the
# randomized SVD, which would otherwise depend on the global NumPy RNG state.
pca = PCA(n_components=2, random_state=42)
X_vis = pca.fit_transform(X_scaled)

# One figure per detector; points are coloured by predicted label
# (0 = normal, 1 = fraud).
for plot_title, point_labels in (
    ("GMM — Fraud vs Normal", gmm_pred),
    ("Isolation Forest — Fraud vs Normal", iso_pred),
):
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(X_vis[:, 0], X_vis[:, 1], c=point_labels, cmap='coolwarm', s=6)
    ax.set_title(plot_title)
    ax.set_xlabel("Principal component 1")
    ax.set_ylabel("Principal component 2")
    plt.show()


## Discussion
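Write about:
- the challenges that class imbalance poses for training and evaluating the detectors
- the trade-off between false positives and false negatives
- when each method (GMM vs. Isolation Forest) works better
- the ethical risks of incorrect fraud detection.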