In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.patheffects as path_effects

# Publication-quality look: whitegrid base style plus serif typography.
plt.style.use('seaborn-v0_8-whitegrid')

# Applied after the style sheet so these values win over the style's defaults.
plt.rcParams.update({
    'font.family': 'serif',
    'font.serif': ['Times New Roman', 'DejaVu Serif'],  # tried in this order
    'axes.labelsize': 14,
    'axes.titlesize': 18,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 20,
})

# Synthetic sample data: one record per (state, fraud type) pair for 2022.
states = ['Telangana', 'Maharashtra', 'Andhra Pradesh', 'Odisha', 'Uttar Pradesh',
          'Bihar', 'Ladakh', 'Daman and Diu', 'Dadra and Nagar Haveli']
fraud_types = ['Identity Theft', 'Cheating by personation',
               'Online Banking Fraud', 'OTP Fraud']

_MID_VOLUME_STATES = ('Maharashtra', 'Andhra Pradesh', 'Odisha', 'Uttar Pradesh')


def _value_bounds(state_name):
    """Return the (low, high) bounds passed to np.random.randint for a state."""
    if state_name == 'Telangana':
        return 2000, 5000
    if state_name in _MID_VOLUME_STATES:
        return 500, 2000
    if state_name == 'Bihar':
        return 800, 1500
    return 10, 500


# Fixed seed + states-outer / fraud-types-inner draw order keeps the values
# reproducible from run to run.
np.random.seed(42)
data = [
    {'state': state, 'offencesubcategory': fraud_type,
     'value': np.random.randint(*_value_bounds(state)), 'year': 2022}
    for state in states
    for fraud_type in fraud_types
]

# Build the long-format frame, then pivot to one row per state and one
# column per fraud type.
fraud_data = pd.DataFrame(data)
fraud_pivot = fraud_data.pivot_table(
    index='state',
    columns='offencesubcategory',
    values='value',
    # The values are case counts, so rows sharing a (state, fraud type) pair
    # must be added together. The pivot_table default ('mean') would average
    # them and under-report totals. With one row per pair the result is the same.
    aggfunc='sum',
    fill_value=0
)

# Per-state total across all fraud types; largest totals first.
fraud_pivot['Total'] = fraud_pivot.sum(axis=1)
fraud_pivot = fraud_pivot.sort_values('Total', ascending=False)

# Feature matrix for clustering: every pivot column except the derived 'Total'.
X = fraud_pivot.drop(columns='Total').to_numpy()

# Standardize each fraud-type column before PCA / K-means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Two principal components, stored below as plotting coordinates.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# K-means is fitted on the scaled features, not on the PCA projection.
kmeans = KMeans(n_clusters=4, init='k-means++', random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)

# Attach the cluster label and the 2-D coordinates to each state row.
fraud_pivot = fraud_pivot.assign(
    cluster=clusters,
    pca1=X_pca[:, 0],
    pca2=X_pca[:, 1],
)

# Now create individual plots

# 1. Cluster Visualization
plt.figure(figsize=(14, 10))

# Colour encodes the cluster label; bubble area grows with the state's total.
bubble_area = fraud_pivot['Total'] / 10 + 150
scatter = plt.scatter(
    fraud_pivot['pca1'],
    fraud_pivot['pca2'],
    c=fraud_pivot['cluster'],
    cmap='viridis',
    s=bubble_area,
    alpha=0.8,
    edgecolors='white',
    linewidth=1.5,
)

# Label every bubble with its state name inside a rounded white box.
label_box = dict(boxstyle="round,pad=0.4", fc="white", ec="gray",
                 alpha=0.9, linewidth=1)
for state, pc1, pc2 in zip(fraud_pivot.index,
                           fraud_pivot['pca1'],
                           fraud_pivot['pca2']):
    text = plt.annotate(
        state,
        (pc1, pc2),
        fontsize=12,
        ha='center',
        va='center',
        weight='bold',
        color='black',
        bbox=dict(label_box),
    )
    # White outline around the glyphs keeps labels legible where they overlap.
    text.set_path_effects(
        [path_effects.withStroke(linewidth=3, foreground='white')])

cbar = plt.colorbar(scatter, label='Cluster')
cbar.set_label('Cluster Group', size=14, weight='bold')
cbar.ax.tick_params(labelsize=12)

# Size legend: empty scatters drawn with the same area formula as the bubbles.
sizes = [100, 500, 1000, 5000]
labels = ['100', '500', '1,000', '5,000']
legend_bubbles = [
    plt.scatter([], [], s=size/10+150, c='gray', alpha=0.6, edgecolors='white')
    for size in sizes
]

legend_style = dict(
    title='Number of Cases',
    loc='upper right',
    frameon=True,
    title_fontsize=14,
    facecolor='white',
    framealpha=0.9,
    edgecolor='lightgray',
)
plt.legend(legend_bubbles, labels, **legend_style)

plt.title('Clustering of States Based on Fraud Patterns (2022)',
          fontsize=20, pad=20, weight='bold')
plt.xlabel('Principal Component 1', fontsize=16)
plt.ylabel('Principal Component 2', fontsize=16)
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('fraud_clusters.png', dpi=300, bbox_inches='tight')
plt.close()

# 2. Fraud Type Distribution
# Six states with the largest totals (fraud_pivot is sorted by 'Total',
# descending), keeping only the per-fraud-type columns for stacking.
top_states = fraud_pivot.head(6).drop(
    ['cluster', 'pca1', 'pca2', 'Total'], axis=1)

# DataFrame.plot opens a brand-new figure unless it is handed an Axes. Create
# the figure explicitly and pass ax= so the requested size is actually used
# and no empty figure is left open behind the chart.
fig, ax = plt.subplots(figsize=(14, 8))
top_states.plot(
    kind='barh',
    stacked=True,
    colormap='viridis',
    width=0.7,
    edgecolor='white',
    linewidth=0.8,
    ax=ax
)

ax.set_title('Distribution of Fraud Types Across Top States',
             fontsize=18, pad=15, weight='bold')
ax.set_xlabel('Number of Cases', fontsize=14)
ax.set_ylabel('State', fontsize=14)
# Legend sits outside the axes on the right so it does not cover the bars.
ax.legend(
    title='Fraud Type',
    bbox_to_anchor=(1.05, 1),
    loc='upper left',
    fontsize=12,
    title_fontsize=14,
    frameon=True,
    facecolor='white',
    framealpha=0.9,
    edgecolor='lightgray'
)

# Print each state's total just past the end of its stacked bar; bar i is
# drawn at y position i, in top_states row order.
for i, state in enumerate(top_states.index):
    total = fraud_pivot.loc[state, 'Total']
    ax.text(
        total + 50,
        i,
        f"{int(total):,}",
        va='center',
        ha='left',
        fontweight='bold',
        fontsize=12,
        bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="gray", alpha=0.8)
    )

fig.tight_layout()
fig.savefig('fraud_types_distribution.png', dpi=300, bbox_inches='tight')
plt.close(fig)

# 3. Correlation Heatmap
plt.figure(figsize=(12, 10))

# Correlate only the fraud-type columns; the derived columns are dropped first.
derived_cols = ['cluster', 'pca1', 'pca2', 'Total']
corr_matrix = fraud_pivot.drop(columns=derived_cols).corr()

# True on and above the diagonal, so only the lower triangle is drawn.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

heatmap_options = dict(
    annot=True,
    cmap=cmap,
    vmin=-1,          # fixed colour range covering all possible correlations
    vmax=1,
    linewidths=0.8,
    cbar_kws={"shrink": 0.8},
    annot_kws={"size": 12, "weight": "bold"},
    fmt=".2f",
    square=True,
    mask=mask,
)
sns.heatmap(corr_matrix, **heatmap_options)

plt.title('Correlation Between Different Types of Fraud',
          fontsize=18, pad=15, weight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('fraud_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.close()

# 4. Cluster Groups Table
plt.figure(figsize=(12, 6))
plt.axis('off')
plt.title('States Grouped by Fraud Pattern Clusters',
          fontsize=18, pad=15, weight='bold')

# Map "Cluster k" -> comma-separated member states.
# The cluster count comes from the fitted model rather than a literal, and the
# per-cluster list uses its own name so the module-level `states` list is not
# overwritten with the last cluster's members.
cluster_groups = {}
for cluster in range(kmeans.n_clusters):
    member_states = fraud_pivot[fraud_pivot['cluster'] == cluster].index.tolist()
    cluster_groups[f"Cluster {cluster}"] = ", ".join(member_states)

cluster_df = pd.DataFrame(list(cluster_groups.items()),
                          columns=['Cluster', 'States'])

table = plt.table(
    cellText=cluster_df.values,
    colLabels=cluster_df.columns,
    cellLoc='center',
    loc='center',
    colWidths=[0.15, 0.75]
)
# Fixed font size with taller rows (height scaled x2).
table.auto_set_font_size(False)
table.set_fontsize(14)
table.scale(1, 2)

# Cell keys are (row, column); row 0 is the header row.
for key, cell in table.get_celld().items():
    if key[0] == 0:  # Header
        cell.set_text_props(weight='bold', color='white')
        cell.set_facecolor('#4C72B0')
    else:  # Data rows
        if key[0] % 2 == 0:  # shade every second row
            cell.set_facecolor('#F0F0F0')
        if key[1] == 0:  # Cluster column
            cell.set_text_props(weight='bold')

plt.tight_layout()
plt.savefig('fraud_cluster_groups.png', dpi=300, bbox_inches='tight')
plt.close()

# 5. Top States Bar Chart
plt.figure(figsize=(14, 8))
# Up to ten states with the largest totals, sorted ascending so the largest
# bar ends up at the top of the horizontal bar chart.
top_10 = fraud_pivot.sort_values('Total', ascending=True).tail(10)
colors = plt.cm.Reds(np.linspace(0.4, 0.8, len(top_10)))

bars = plt.barh(top_10.index, top_10['Total'], color=colors)
# Report the number of bars actually drawn: with fewer than ten states in the
# data a fixed "Top 10" title would be wrong.
plt.title(
    f'Top {len(top_10)} States with Highest Fraud-Related Cybercrimes (2022)',
    fontsize=18, pad=15, weight='bold')
plt.xlabel('Number of Cases', fontsize=14)
plt.ylabel('State', fontsize=14)
plt.grid(axis='x', linestyle='--', alpha=0.7)

# Value label to the right of each bar, offset by 2% of the longest bar.
label_offset = top_10['Total'].max() * 0.02
for bar, total in zip(bars, top_10['Total']):
    plt.text(
        bar.get_width() + label_offset,
        bar.get_y() + bar.get_height()/2,
        f"{int(total):,}",
        va='center',
        ha='left',
        fontweight='bold',
        fontsize=12
    )

plt.tight_layout()
plt.savefig('fraud_top_states.png', dpi=300, bbox_inches='tight')
plt.close()

print("Analysis complete. All visualizations have been saved as separate files.")


Analysis complete. All visualizations have been saved as separate files.


<Figure size 1400x800 with 0 Axes>