In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plotting defaults: seaborn "whitegrid" look first, then the
# default figure size (set afterwards so the seaborn call cannot override it).
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

In [None]:
# Read the transactions CSV from the working directory into `df`
# and display its first rows as the cell output.
df = pd.read_csv("creditcard.csv")
df.head()

In [None]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Store the target as an integer label so the palette keys 0/1 match
df['Class'] = df['Class'].astype(int)

# Bar chart of how many rows fall in each class
class_ax = sns.countplot(
    data=df,
    x='Class',
    hue='Class',
    palette={0: 'skyblue', 1: 'red'},
    legend=False,
)
class_ax.set_title("Class Distribution (0 = Normal, 1 = Fraud)")
class_ax.set_xlabel("Class")
class_ax.set_ylabel("Count")
plt.show()

# Share of rows labelled 1, expressed as a percentage
class_shares = df['Class'].value_counts(normalize=True)
fraud_ratio = class_shares[1] * 100
print(f"Fraudulent transactions: {fraud_ratio:.4f}%")

In [None]:
# Remove the 'Time' column before modelling.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after 'Time' is already gone no longer
# raises KeyError.
df = df.drop(columns=['Time'], errors='ignore')

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the frame into the target label and the feature matrix
y = df['Class']
X = df.drop(columns=['Class'])

# Learn per-feature mean/std on X, then apply the standardization to X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled

In [None]:
from sklearn.decomposition import PCA

# Project the standardized features onto 10 principal components.
# random_state is pinned so that, when scikit-learn selects a randomized SVD
# solver, the components (and every DBSCAN result derived from them) are
# reproducible across kernel restarts.
pca = PCA(n_components=10, random_state=42)
X_pca_reduced = pca.fit_transform(X_scaled)

In [None]:
# Rows per class, used as the pie wedges (wedge labels are the class values)
counts = df['Class'].value_counts()

fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(
    counts,
    labels=counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Distribution of a Target Variable')
ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

fig.tight_layout()
plt.show()


In [None]:
# Keep the target as int so it matches the integer palette keys below
df['Class'] = df['Class'].astype(int)

# One box of 'Amount' per class
box_ax = sns.boxplot(
    data=df,
    x='Class',
    y='Amount',
    hue='Class',
    palette={0: 'lightgreen', 1: 'orangered'},
    legend=False,
)
box_ax.set_title("Transaction Amount by Class")
box_ax.set_xlabel("Class")
box_ax.set_ylabel("Amount")
plt.show()


In [None]:
# Pairwise correlation between all columns of df
corr = df.corr()

# Ten columns with the largest absolute correlation to 'Class'
# ('Class' itself is part of this ranking)
top_corr = corr['Class'].abs().sort_values(ascending=False).head(10)

print("Top 10 features most correlated with Class:\n", top_corr)

# Heatmap restricted to the rows and columns of those top features
top_features = top_corr.index
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr.loc[top_features, top_features],
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    ax=ax,
)
ax.set_title("Top Correlated Features with Class")
plt.show()

In [None]:
# Histogram (5 bins) of 'Amount' with a KDE curve overlaid
amount_ax = sns.histplot(df['Amount'], bins=5, color='purple', kde=True)
amount_ax.set_title("Distribution of Transaction Amounts")
amount_ax.set_xlabel("Amount")
amount_ax.set_ylabel("Frequency")
plt.show()

In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# k-distance graph used to pick DBSCAN's eps.
# It is computed on X_pca_reduced -- the same 10-component space DBSCAN is
# fitted on below -- so the elbow read from the plot is on the right distance
# scale (distances in the full scaled feature space are not comparable).
#
# n_neighbors=5 matches min_samples=5. Because the query points are the
# training points themselves, column 0 of `distances` is each point's
# zero distance to itself, and column 4 is the distance to the 5th point of
# its neighbourhood counting the point itself.
neighbors = NearestNeighbors(n_neighbors=5)
neighbors_fit = neighbors.fit(X_pca_reduced)
distances, indices = neighbors_fit.kneighbors(X_pca_reduced)

# Sort the last-column distances ascending and plot them
distances = np.sort(distances[:, 4])
plt.plot(distances)
plt.title("k-distance graph (use elbow point as eps)")
plt.xlabel("Points sorted by distance")
plt.ylabel("5th Nearest Neighbor Distance")
plt.grid()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Seeded random subset of 10,000 PCA-reduced rows for the eps/min_samples
# search; the remaining rows of the split are discarded.
X_sample, _ = train_test_split(
    X_pca_reduced,
    train_size=10000,
    random_state=42,
)

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
import numpy as np

# Candidate DBSCAN hyper-parameters to compare on the tuning sample
eps_values = [1.5, 1.8, 2.0, 2.2, 2.4]
min_samples_values = [3, 5, 7]

for eps in eps_values:
    for min_samples in min_samples_values:
        db = DBSCAN(eps=eps, min_samples=min_samples)
        labels = db.fit_predict(X_sample)

        # Label -1 marks noise; it is not counted as a cluster
        noise = np.sum(labels == -1)
        clusters = len(set(labels)) - (1 if -1 in labels else 0)

        # Only score when at least two real clusters exist (same guard as the
        # final evaluation cell); "one cluster + noise" would otherwise be
        # scored as if noise were a second cluster. -1 is the sentinel for
        # "not scored".
        score = silhouette_score(X_sample, labels) if clusters > 1 else -1

        print(f"eps={eps}, min_samples={min_samples} → Clusters: {clusters}, Noise: {noise}, Silhouette: {score:.4f}")

In [None]:
from sklearn.model_selection import train_test_split

# Seeded random subset of 50,000 PCA-reduced rows together with their true
# 'Class' labels; the remaining rows of both splits are discarded.
X_final_sample, _, y_final_sample, _ = train_test_split(
    X_pca_reduced,
    df['Class'],
    train_size=50000,
    random_state=42,
)

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Fit DBSCAN with the chosen hyper-parameters on the 50K sample and keep the
# per-row cluster labels (-1 = noise).
db_final = DBSCAN(eps=2.2, min_samples=5).fit(X_final_sample)
labels_final = db_final.labels_

In [None]:
# Wrap the sampled PCA components in a frame and attach each row's DBSCAN label
new_df = pd.DataFrame(X_final_sample).assign(Clusters=labels_final)
print("Unique clusters found by DBSCAN:", np.unique(labels_final))

In [None]:
# Count how many points fall in each cluster.
# value_counts must be taken on the 'Clusters' column: calling it on the whole
# frame counts distinct *rows* (component values + label), not cluster sizes.
cluster_counts = new_df['Clusters'].value_counts()
print("Points per cluster:\n", cluster_counts)

# DBSCAN labels noise points -1; those are treated as anomalies
n_anomalies = (new_df['Clusters'] == -1).sum()
print(f"Total anomalies (cluster = -1): {n_anomalies}")

In [None]:
# Text flag per row: 'Anomaly' for DBSCAN noise (label -1), 'Normal' otherwise
is_noise = new_df['Clusters'].eq(-1)
new_df['Anomaly'] = is_noise.map({True: 'Anomaly', False: 'Normal'})

# Bar chart of how many rows carry each flag
anomaly_ax = sns.countplot(
    data=new_df,
    x='Anomaly',
    hue='Anomaly',
    palette={'Normal': 'blue', 'Anomaly': 'red'},
    legend=False,
)
anomaly_ax.set_title("DBSCAN-Detected Anomalies")
anomaly_ax.set_ylabel("Count")
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Project the 10-component sample down to 2 components purely for plotting
pca_vis = PCA(n_components=2)
X_pca_2d = pca_vis.fit_transform(X_final_sample)

# Scatter of the 2D projection, coloured by DBSCAN cluster label
fig, ax = plt.subplots(figsize=(8, 5))
points = ax.scatter(
    X_pca_2d[:, 0],
    X_pca_2d[:, 1],
    c=labels_final,
    cmap='tab10',
    s=5,
)
ax.set_title("DBSCAN Clustering (50K Sample, PCA-reduced)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
fig.colorbar(points, ax=ax, label='Cluster ID')
ax.grid(True)
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Two-component projection of the 50K sample
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_final_sample)

# Store the two projected coordinates as columns of new_df
new_df['PCA1'], new_df['PCA2'] = pca_result.T

# Fraction of the sample's variance captured by each of the two components
print(f"PCA Explained Variance Ratio: {pca.explained_variance_ratio_}")

In [None]:
# Scatter of the 2D PCA coordinates, one colour per DBSCAN cluster label
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=new_df,
    x='PCA1',
    y='PCA2',
    hue='Clusters',
    palette='tab10',
    alpha=0.6,
    edgecolor=None,
    legend='full',
    ax=ax,
)
ax.set_title("DBSCAN Clusters Visualized in PCA Space", fontsize=14, weight='bold')
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
# Anchor the legend outside the axes, to the right of the plot
ax.legend(title='Cluster ID', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Same 2D PCA scatter, coloured by the Anomaly flag (red) vs Normal (green)
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=new_df,
    x='PCA1',
    y='PCA2',
    hue='Anomaly',
    palette={'Normal': 'green', 'Anomaly': 'red'},
    alpha=0.6,
    edgecolor=None,
    ax=ax,
)
ax.set_title("Anomaly vs Normal Transactions in PCA Space", fontsize=14, weight='bold')
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
# Anchor the legend outside the axes, to the right of the plot
ax.legend(title='Transaction Type', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import silhouette_score
import numpy as np

# Evaluate the final clustering on a 5,000-row sub-sample of the 50K sample.
# A seeded Generator is used so the reported cluster count, noise count and
# silhouette are identical on every Restart & Run All.
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(X_final_sample), size=5000, replace=False)
X_sample = X_final_sample[sample_indices]
labels_sample = labels_final[sample_indices]

# Label -1 is DBSCAN noise and is not counted as a cluster
n_clusters = len(set(labels_sample)) - (1 if -1 in labels_sample else 0)
# Silhouette needs at least two clusters; -1 is the "not scored" sentinel
silhouette = silhouette_score(X_sample, labels_sample) if n_clusters > 1 else -1
noise_count = np.sum(labels_sample == -1)

# Final Output
print("Sampled 5000 points from 50K PCA-reduced set:")
print(f"Clusters found: {n_clusters}")
print(f"Noise points in sample: {noise_count}")
print(f"Silhouette Score: {silhouette:.4f}")

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the current X_sample down to 2 components for plotting
sample_projector = PCA(n_components=2)
X_pca_sample_2D = sample_projector.fit_transform(X_sample)

# Scatter coloured by cluster label; the figure is also written to disk
fig, ax = plt.subplots(figsize=(8, 5))
sample_points = ax.scatter(
    X_pca_sample_2D[:, 0],
    X_pca_sample_2D[:, 1],
    c=labels_sample,
    cmap='tab10',
    s=8,
    alpha=0.8,
)
ax.set_title("DBSCAN Clusters on Sample (PCA-2D Projection)", fontsize=14, weight='bold')
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.grid(True)
fig.colorbar(sample_points, ax=ax, label='Cluster ID')
fig.tight_layout()
fig.savefig("dbscan_sample_clusters.png", dpi=300)
plt.show()

In [None]:
# One-line recap of n_clusters, noise_count and silhouette as currently bound
# in the kernel (silhouette is formatted to 4 decimal places).
print(f"Clusters: {n_clusters}, Noise points: {noise_count}, Silhouette: {silhouette:.4f}")

In [None]:
# Treat DBSCAN noise (cluster label -1) as a fraud prediction: 1 for noise, 0 otherwise
new_df['Predicted_Fraud'] = new_df['Clusters'].eq(-1).astype('int64')

In [None]:
# Binary prediction per sampled row: 1 where DBSCAN marked the row as noise
# (label -1), 0 otherwise. The comparison must be made against the cluster
# labels; comparing the feature matrix X_final_sample to -1 yields a 2D array
# that says nothing about cluster membership.
y_pred = np.where(labels_final == -1, 1, 0)
y_pred.shape

In [None]:
import numpy as np

# Ground-truth labels for the 50K sample as a flat NumPy array.
# y_true is built from y_final_sample here; it was previously read before any
# cell had assigned it, which raises NameError on a fresh kernel.
# np.asarray handles a pandas Series/DataFrame as well as an ndarray.
y_true = np.asarray(y_final_sample).ravel()
y_pred = np.ravel(y_pred)  # flatten predictions to 1D as well

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import numpy as np
import matplotlib.pyplot as plt

# True labels of the 50K sample and the DBSCAN-derived prediction
# (noise label -1 -> predicted fraud = 1), both as 1D arrays
y_true = np.ravel(y_final_sample)
y_pred = np.where(labels_final == -1, 1, 0)

# Shapes check (optional)
print("Shapes:", y_true.shape, y_pred.shape)

# Compute confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 in Normal/Fraud order even if one class
# happens to be absent from both arrays, so it always matches display_labels.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Normal', 'Fraud'])

# Plot with aesthetic customizations
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, cmap='YlOrRd', colorbar=False)  # You can try 'Blues', 'Purples', etc.

# Enhance title and axis labels
plt.title("💡 Confusion Matrix: DBSCAN vs True Labels", fontsize=14, weight='bold', color='darkblue')
plt.xlabel("Predicted Label", fontsize=12)
plt.ylabel("True Label", fontsize=12)
plt.grid(False)

# Make tick labels bold and larger
ax.tick_params(axis='both', labelsize=12)
for label in ax.get_xticklabels() + ax.get_yticklabels():
    label.set_fontweight('bold')

plt.tight_layout()
plt.show()

In [None]:
# Per-class precision/recall/F1 of the DBSCAN-derived predictions.
# labels=[0, 1] fixes the class order so the two target_names always line up,
# even if one class is missing from y_true and y_pred.
print("\nClassification Report:\n", classification_report(y_true, y_pred, labels=[0, 1], target_names=["Normal", "Fraud"]))

In [None]:
import joblib
# Serialize the fitted db_final estimator to "dbscan_model.pkl" in the current
# working directory.
# NOTE(review): only fit / fit_predict are used on this estimator in the
# notebook -- confirm the saved object can actually score unseen transactions
# before relying on it downstream.
joblib.dump(db_final,"dbscan_model.pkl")