In [None]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import CCA
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from matplotlib.patches import Patch
from math import pi

# Load the cyber-crime CSV (columns used below: year, state, city, value)
file_path = r"C:\Users\91944\Desktop\Research\Crime\All datasets\cyber-crimes-from-ncrb-master-data-year-state-and-city-wise-total-number-of-cyber-crimes-committed-in-india.csv"
df = pd.read_csv(file_path)

# Row selection, expressed as a single boolean mask:
#   * year inside the 2018-2022 window (both ends inclusive)
#   * not one of the 'Total Cities' rows
#   * both 'state' and 'value' present
in_window = df['year'].between(2018, 2022)
not_rollup = df['city'] != 'Total Cities'
has_required = df['state'].notna() & df['value'].notna()
df = df[in_window & not_rollup & has_required]

# State x year table of summed counts; combinations with no rows become 0.
# The dataset holds total cyber crimes, which this analysis treats as
# IT Act violations.
per_state_year = df.groupby(['state', 'year'])['value'].sum()
agg = per_state_year.unstack(fill_value=0)

# ---------------------------
# 1. Canonical Correlation Analysis (CCA)
# ---------------------------
# Two views of each state: counts for the earlier years (X) and for the
# later years (Y).
early_years = [2018, 2019, 2020]
late_years = [2021, 2022]
X = agg[early_years].values
Y = agg[late_years].values

cca = CCA(n_components=2)
X_c, Y_c = cca.fit(X, Y).transform(X, Y)

# Biplot: one point per state in the space of the X canonical scores
plt.figure(figsize=(14, 8))
plt.scatter(X_c[:, 0], X_c[:, 1], c='blue', alpha=0.6)

# Label every point with its state name, nudged slightly to the right
for (score_1, score_2), state_name in zip(X_c, agg.index):
    plt.text(score_1 + 0.02, score_2, state_name, fontsize=8)

# One arrow per year variable, drawn from the origin to its weight vector:
# red for the X-side years, green for the Y-side years.
arrow_groups = (
    (early_years, cca.x_weights_, 'red'),
    (late_years, cca.y_weights_, 'green'),
)
for group_years, group_weights, colour in arrow_groups:
    for year_label, (w_1, w_2) in zip(group_years, group_weights):
        plt.arrow(0, 0, w_1, w_2, color=colour, width=0.005, head_width=0.05)
        # Place the label a little beyond the arrow head
        plt.text(w_1 * 1.2, w_2 * 1.2, str(year_label), color=colour, fontsize=10)

plt.title('CCA Biplot: IT Act Violations Across States (2018-2022)')
plt.xlabel('Canonical Component 1')
plt.ylabel('Canonical Component 2')
plt.grid(True)
plt.tight_layout()
plt.savefig('cca_biplot_it_act_violations.png', dpi=300)
plt.close()
print("CCA Biplot saved as 'cca_biplot_it_act_violations.png'")

# ---------------------------
# 2. Temporal Heatmap of IT Act Violations by State and Year
# ---------------------------
# Annotated heatmap of the state x year table: rows are states, columns are
# years, and every cell shows its count.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(16, 12))
sns.heatmap(agg, annot=True, fmt='g', cmap='YlGnBu', ax=heatmap_ax)
heatmap_ax.set_title('Heatmap: IT Act Violations by State and Year (2018-2022)')
heatmap_ax.set_xlabel('Year')
heatmap_ax.set_ylabel('State')
heatmap_fig.tight_layout()
heatmap_fig.savefig('heatmap_it_act_violations.png', dpi=300)
plt.close(heatmap_fig)
print("Heatmap saved as 'heatmap_it_act_violations.png'")

# ---------------------------
# 3. Radar Chart of IT Act Violation Trends for Top 5 States
# ---------------------------
# Rank states by their summed count over all year columns.
# NOTE: this adds a 'Total' column to `agg` itself; the stacked-area section
# further down drops that column by name, so it is kept on purpose.
agg['Total'] = agg.sum(axis=1)
ranked = agg.sort_values('Total', ascending=False)
top5_states = ranked.head(5).drop(columns='Total')

# One spoke per year column
categories = list(top5_states.columns)
N = len(categories)

# Evenly spaced spoke angles; the first angle is repeated at the end so each
# outline closes on itself.
angles = [k / float(N) * 2 * pi for k in range(N)]
angles += angles[:1]

plt.figure(figsize=(10,10))
ax = plt.subplot(111, polar=True)

colors = sns.color_palette('hsv', n_colors=5)

# Draw one closed, lightly filled outline per state
for colour, (state, row) in zip(colors, top5_states.iterrows()):
    ring = row.tolist()
    ring += ring[:1]  # repeat the first value to close the outline
    ax.plot(angles, ring, color=colour, linewidth=2, label=state)
    ax.fill(angles, ring, color=colour, alpha=0.25)

plt.xticks(angles[:-1], categories)
ax.set_rlabel_position(30)
plt.yticks(fontsize=8)
plt.title('Radar Chart: IT Act Violation Trends for Top 5 States (2018-2022)', size=14, y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.tight_layout()
plt.savefig('radar_chart_top5_states.png', dpi=300)
plt.close()
print("Radar chart saved as 'radar_chart_top5_states.png'")

# ---------------------------
# 4. Sankey Diagram of IT Act Violation Flows (Simplified)
# ---------------------------
# Sankey between years: sum violations per year (all states combined).
#
# By this point `agg` also carries the helper column 'Total' added for the
# radar chart (and 'Cluster' if the clustering section has already run in the
# same session). Summing those along with the years would add a bogus node to
# the diagram, so they are excluded first; errors='ignore' keeps this working
# when a helper column is absent.
year_table = agg.drop(columns=['Total', 'Cluster'], errors='ignore')
yearly_totals = year_table.sum(axis=0)

# One node per year. Each link goes from year i to year i+1 and is sized by
# the total of the year it starts from, so the final year only receives a link.
labels = [str(year) for year in yearly_totals.index]
sources = list(range(len(labels) - 1))   # every year except the last
targets = list(range(1, len(labels)))    # every year except the first
values = yearly_totals.iloc[:-1].tolist()

fig = go.Figure(data=[go.Sankey(
    node=dict(pad=15, thickness=20, line=dict(color="black", width=0.5), label=labels),
    link=dict(source=sources, target=targets, value=values)
)])

fig.update_layout(title_text="Sankey Diagram: IT Act Violation Flows Between Years (2018-2022)", font_size=10)
# NOTE(review): static image export in plotly depends on an additional export
# engine (kaleido) being installed; confirm it is available in this environment.
fig.write_image("sankey_it_act_violations.png")
print("Sankey diagram saved as 'sankey_it_act_violations.png'")

# ---------------------------
# 5. Stacked Area Chart of IT Act Violations Over Time with CCA-Based Groupings
# ---------------------------
# Cluster states based on first canonical component (X_c[:,0])
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3, random_state=42)
# X_c[:, [0]] keeps the first canonical score as a 2-D single-column array,
# which is the input shape KMeans expects.
clusters = kmeans.fit_predict(X_c[:, [0]])

agg['Cluster'] = clusters

# Aggregate violations by cluster and year: sum the states in each cluster,
# discard the helper 'Total' column (tolerating its absence), then transpose
# so that years form the rows (x-axis) and clusters the stacked columns.
cluster_year = agg.groupby('Cluster').sum().drop(columns=['Total'], errors='ignore').T

# DataFrame.plot opens a figure of its own when it is not given an axes, which
# would leave a separately created figure blank and unclosed. Create the
# figure/axes here and pass the axes in so the requested size is honoured.
area_fig, area_ax = plt.subplots(figsize=(14, 8))
cluster_year.plot.area(alpha=0.6, ax=area_ax)
area_ax.set_title('Stacked Area Chart: IT Act Violations Over Time by CCA-Based State Clusters')
area_ax.set_xlabel('Year')
area_ax.set_ylabel('Number of Violations')
area_ax.legend(title='Cluster')
area_fig.tight_layout()
area_fig.savefig('stacked_area_cca_clusters.png', dpi=300)
plt.close(area_fig)
print("Stacked area chart saved as 'stacked_area_cca_clusters.png'")

CCA Biplot saved as 'cca_biplot_it_act_violations.png'
Heatmap saved as 'heatmap_it_act_violations.png'
Radar chart saved as 'radar_chart_top5_states.png'
