# 🔎 Data Visualization

### 📦🔧 Imports & Setup

In [None]:
# Quietly install extra packages into the notebook environment.
# NOTE(review): neither gdown nor umap is imported by the cells visible here — confirm both are still needed.
!pip install -q gdown umap-learn

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.impute import SimpleImputer
from scipy.stats import ks_2samp, chi2_contingency


### 📊 Plotting Defaults

In [None]:
# Global plot styling: seaborn's "whitegrid" style, then a 10x6 default figure size.
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (10, 6)})

#### 💾 Initial Mount

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running elsewhere.
# NOTE(review): force_remount=True presumably re-mounts even when a mount already exists — confirm.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


ValueError: mount failed

#### 📂 Load Data

In [None]:
import pandas as pd

# Locations of the three CSV files under the mounted Drive folder.
real_path = "/content/drive/MyDrive/GNCIPL_DATASET/Bank_Transaction.csv"
synthetic_path = "/content/drive/MyDrive/GNCIPL_DATASET/Synthetic_Bank_Data.csv"
augmented_path = "/content/drive/MyDrive/GNCIPL_DATASET/Augmented_data.csv"

# Read each file into its own DataFrame, in the same order as the paths above.
real, synthetic, augmented = (
    pd.read_csv(csv_path)
    for csv_path in (real_path, synthetic_path, augmented_path)
)

# Report the (rows, columns) shape of every frame as a quick sanity check.
print("Datasets loaded successfully:")
for caption, frame in (
    ("Real data shape:", real),
    ("Synthetic data shape:", synthetic),
    ("Augmented data shape:", augmented),
):
    print(caption, frame.shape)


#### 📝 Quick Overviews

In [None]:
def short_info(df, name, n=3):
    """Print a compact overview of a DataFrame.

    Shows the shape, the first ``n`` rows, the number of columns per dtype,
    and the ten columns with the highest percentage of missing values.

    Args:
        df: DataFrame to summarise.
        name: Label printed in the section header.
        n: Number of leading rows to display.

    Returns:
        None. Everything is emitted via ``print`` / ``display``.
    """
    print(f"\n=== {name} ===")
    print("shape:", df.shape)
    display(df.head(n))
    print("dtypes:")
    print(df.dtypes.value_counts())
    print("missing (%) per column (top 10):")
    # isna().mean() yields a 0-1 fraction; scale by 100 so the numbers match
    # the "(%)" promised in the heading above.
    miss = df.isna().mean().mul(100).sort_values(ascending=False).head(10)
    display(miss)

# Summarise each dataset in turn with the same overview helper.
for frame, title in (
    (real, "Real (Bank_Transaction)"),
    (synthetic, "Synthetic (Synthetic_Bank_Data)"),
    (augmented, "Augmented"),
):
    short_info(frame, title)


**Insight**

- Real data is usually highly imbalanced (few fraud cases).

- Synthetic data may balance the classes depending on generation method.

- Augmented data should show more fraud samples, improving model training.

#### ⚖️ Compare Class Distribution

In [None]:
# Compare class distribution: one Is_Fraud count plot per dataset, side by side.
datasets = [real, synthetic, augmented]
titles = ['Real (Bank_Transaction)', 'Synthetic_Bank_Data', 'Augmented_Data']

# One row of axes, one per dataset; drawing on explicit axes avoids relying
# on pyplot's implicit "current axes" state.
fig, axes = plt.subplots(1, len(datasets), figsize=(15, 5))

for ax, data, title in zip(axes, datasets, titles):
    sns.countplot(x='Is_Fraud', hue='Is_Fraud', data=data,
                  palette='coolwarm', legend=False, ax=ax)
    ax.set_title(f'{title} - Fraud vs Non-Fraud')
    ax.set_xlabel('Is_Fraud (0 = Genuine, 1 = Fraud)')
    ax.set_ylabel('Count')

# Lay the figure out once, after every subplot exists, instead of
# recomputing the layout on each loop iteration.
fig.tight_layout()
plt.show()

#### 📑 Compare Dataset Columns

In [None]:
# List every column name per dataset so schema differences are easy to spot.
for heading, frame in (
    ("Real columns:\n", real),
    ("\nSynthetic columns:\n", synthetic),
    ("\nAugmented columns:\n", augmented),
):
    print(heading, frame.columns.tolist())


#### 📈 Feature Distribution Comparison

In [None]:
# === 2️⃣ Feature Distribution Comparison ===
# Columns whose density curves are overlaid across the three datasets.
features = ['Transaction_Amount', 'Age', 'Account_Balance']

for feature in features:
    plt.figure(figsize=(10, 5))
    # Draw one filled KDE per dataset on the same axes, in a fixed order.
    for label, frame in (('Real', real), ('Synthetic', synthetic), ('Augmented', augmented)):
        sns.kdeplot(frame[feature], label=label, fill=True, alpha=0.4)
    plt.title(f'Distribution Comparison for {feature}')
    plt.legend()
    plt.show()

**Insight**

- Overlapping curves → synthetic/augmented data preserves real distribution well.

- Large deviations → indicates bias in synthetic generation.

- Helps validate quality and realism of generated data.

#### Correlation Heatmaps

In [None]:
# Correlation heatmaps: one figure per dataset, numeric columns only.
for name, data in (('Real', real), ('Synthetic', synthetic), ('Augmented', augmented)):
    plt.figure(figsize=(10, 8))
    # Pairwise correlations of the numeric columns; colours are centred on 0.
    corr_matrix = data.select_dtypes(include=np.number).corr()
    sns.heatmap(corr_matrix, cmap='coolwarm', center=0)
    plt.title(f'Correlation Heatmap - {name} Data')
    plt.show()

**Insight**

- High correlation (near ±1) → strong linear relationship between features.

- Look for differences between real and synthetic/augmented correlations → ensures synthetic data preserves patterns.

- Helps detect redundant features or multicollinearity before modeling.

---

#### Outlier Comparison using Boxplots

In [None]:
# Outlier comparison: for each feature, stack that column from all three
# datasets with a Source tag and draw one box per source.
for col in features:
    tagged_parts = [
        frame[[col]].assign(Source=tag)
        for tag, frame in (('Real', real), ('Synthetic', synthetic), ('Augmented', augmented))
    ]
    combined = pd.concat(tagged_parts)
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=combined, x='Source', hue="Source", y=col, palette='Set2', legend=False)
    plt.title(f'Outlier Comparison for {col}')
    plt.show()

**Insight**

- Boxplots highlight outliers and distribution spread.

- Compare Real vs Synthetic/Augmented to check if synthetic data preserves extremes.

- Useful for detecting anomalies and deciding if clipping or transformation is needed.

#### Dimensionality Visualization using PCA

In [None]:
# Dimensionality visualization using PCA.
#
# All three datasets are projected with ONE scaler and ONE PCA, both fitted on
# the real data, so every point lives in the same coordinate system and the
# overlaid scatter is actually comparable. Fitting a separate scaler/PCA per
# dataset would give each cloud its own axes and make the overlay meaningless.

# Use only numeric columns present in every dataset so the feature matrices
# line up column-for-column.
shared_numeric_cols = (
    real.select_dtypes(include=np.number).columns
    .intersection(synthetic.select_dtypes(include=np.number).columns)
    .intersection(augmented.select_dtypes(include=np.number).columns)
)

# Drop rows with missing values; neither the scaler nor PCA accepts NaNs.
real_numeric = real[shared_numeric_cols].dropna()
synthetic_numeric = synthetic[shared_numeric_cols].dropna()
augmented_numeric = augmented[shared_numeric_cols].dropna()

# Standardise using the real data's mean/variance as the common reference.
scaler = StandardScaler().fit(real_numeric)
real_scaled = scaler.transform(real_numeric)
synthetic_scaled = scaler.transform(synthetic_numeric)
augmented_scaled = scaler.transform(augmented_numeric)

# Learn the two principal axes from the real data, then project everything
# onto those same axes.
pca = PCA(n_components=2).fit(real_scaled)
real_pca = pca.transform(real_scaled)
synthetic_pca = pca.transform(synthetic_scaled)
augmented_pca = pca.transform(augmented_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(real_pca[:, 0], real_pca[:, 1], alpha=0.5, label='Real')
plt.scatter(synthetic_pca[:, 0], synthetic_pca[:, 1], alpha=0.5, label='Synthetic')
plt.scatter(augmented_pca[:, 0], augmented_pca[:, 1], alpha=0.5, label='Augmented')
plt.title("PCA Projection - Real vs Synthetic vs Augmented Data")
plt.legend()
plt.show()

**Insight**

- PCA reduces high-dimensional data to 2D for visualization.

- Helps see clusters, overlaps, or separation between samples.

- Compare real vs synthetic/augmented → check if synthetic data preserves structure.

- Outliers or unusual patterns become visible.

---

#### t-SNE Visualization

In [None]:
# t-SNE visualization on up to 300 leading rows from each scaled dataset.
sample_cap = 300
tsne_samples = {
    'Real': real_scaled[:sample_cap],
    'Synthetic': synthetic_scaled[:sample_cap],
    'Augmented': augmented_scaled[:sample_cap],
}
combined_scaled = np.vstack(list(tsne_samples.values()))

# Derive the labels from the actual slice sizes: a dataset with fewer than
# 300 usable rows would otherwise leave the label list longer than the
# embedded rows and break the DataFrame assignment below.
labels = [name for name, rows in tsne_samples.items() for _ in range(len(rows))]

# t-SNE needs perplexity < number of samples, so cap it for small inputs;
# with the usual 900 rows this is still 30.
tsne = TSNE(
    n_components=2,
    perplexity=min(30, len(combined_scaled) - 1),
    random_state=42,
)

tsne_result = tsne.fit_transform(combined_scaled)
tsne_df = pd.DataFrame(tsne_result, columns=['Dim1', 'Dim2'])
tsne_df['Dataset'] = labels

plt.figure(figsize=(10, 6))
sns.scatterplot(data=tsne_df, x='Dim1', y='Dim2', hue='Dataset', alpha=0.7)
plt.title("t-SNE Visualization of Real, Synthetic, and Augmented Data")
plt.legend()
plt.show()

print("Visualization completed successfully!")

**Insight**

- t-SNE captures non-linear relationships and clusters better than PCA.

- Useful to visualize high-dimensional patterns in 2D.

- Compare real vs synthetic/augmented → check if synthetic data preserves feature relationships.

- Can highlight clusters of fraud vs non-fraud transactions.