In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn grid style first, then the figure size and base font
# size used whenever a cell does not pass its own figsize.
sns.set_style(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'font.size': 10,
})

print("Libraries loaded successfully")

## Load and Explore Data

In [None]:
# Read the crime records and the companion description table; in both files
# the first CSV column is used as the index.
csv_options = {'index_col': 0}
df = pd.read_csv('CrimesOnWomenData.csv', **csv_options)
desc = pd.read_csv('description.csv', **csv_options)

# Quick profile of the raw table before any cleaning.
n_records, n_columns = df.shape
print(f"Dataset: {n_records} records, {n_columns} columns")

year_col = df['Year']
print(f"\nYears covered: {year_col.min()} to {year_col.max()}")
print(f"States: {df['State'].nunique()}")

n_missing = df.isnull().sum().sum()
print(f"\nMissing values: {n_missing}")

# Replace every missing value (in any column) with 0 before the analysis.
df = df.fillna(0)
print("\nData ready for analysis")

## Task 1: States with Highest Crime Against Women

In [None]:
# Column code -> display name for each crime category. The insertion order
# of this mapping defines the column order used throughout the notebook.
crime_labels = dict([
    ('Rape', 'Rape'),
    ('K&A', 'Kidnapping & Assault'),
    ('DD', 'Dowry Deaths'),
    ('AoW', 'Assault on Women'),
    ('AoM', 'Assault on Modesty'),
    ('DV', 'Domestic Violence'),
    ('WT', 'Women Trafficking'),
])
crimes = list(crime_labels)


def _print_state_ranking(heading, ranking, top_n=15):
    """Print `heading`, then the first `top_n` entries of `ranking` as a numbered list."""
    print(heading)
    for position, (state, count) in enumerate(ranking.head(top_n).items(), 1):
        print(f"{position:2}. {state:25} {count:>10,.0f}")


# Cases per state summed over all crime columns, and the number of rows each
# state contributes to df.
cases_per_state = df.groupby('State')[crimes].sum().sum(axis=1)
rows_per_state = df.groupby('State').size()

state_totals = cases_per_state.sort_values(ascending=False)

# The "average per year" is total cases divided by the state's row count.
# NOTE(review): this equals a per-year average only if df holds exactly one
# row per state and year - confirm against the data.
state_avg = (cases_per_state / rows_per_state).sort_values(ascending=False)

_print_state_ranking("Top 15 States (Total Cases):", state_totals)
_print_state_ranking("\nTop 15 States (Avg per Year):", state_avg)

In [None]:
# Side-by-side bar charts of the 15 leading states: overall totals on the
# left, per-year averages on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# (ranking, bar colour, panel title, y-axis label) for each panel.
panels = [
    (state_totals, 'darkred', 'States with Most Crimes Against Women', 'Total Cases'),
    (state_avg, 'crimson', 'States by Average Crimes per Year', 'Avg Cases per Year'),
]

for ax, (ranking, bar_color, title, y_label) in zip(axes, panels):
    ranking.head(15).plot(kind='bar', ax=ax, color=bar_color, alpha=0.8)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_ylabel(y_label)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Task 2: Clustering Analysis

In [None]:
# One feature row per state (summed counts for each crime type), then
# standardised column by column before clustering.
state_data = df.groupby('State')[crimes].sum()
scaler = StandardScaler()
scaled = scaler.fit_transform(state_data)

# Fit K-Means for every candidate cluster count, recording the inertia and
# silhouette score of each fit for the diagnostic plots.
k_range = range(2, 11)
inertias, silhouettes = [], []

for n_clusters in k_range:
    km = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(scaled)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(scaled, km.labels_))

print("Silhouette scores by cluster count:")
for n_clusters, score in zip(k_range, silhouettes):
    print(f"  k={n_clusters}: {score:.3f}")

In [None]:
# Two diagnostics for choosing k, drawn side by side: inertia (elbow method)
# and silhouette score, both against the candidate cluster counts.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# (values, matplotlib format string, y-axis label, panel title) per panel.
diagnostics = [
    (inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (silhouettes, 'go-', 'Silhouette Score', 'Silhouette Analysis'),
]

for ax, (values, line_fmt, y_label, title) in zip(axes, diagnostics):
    ax.plot(k_range, values, line_fmt, linewidth=2, markersize=8)
    ax.set_xlabel('Number of Clusters')
    ax.set_ylabel(y_label)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Fit the final K-Means model with k=4 and tag every state with its label.
optimal_k = 4
km = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = km.fit_predict(scaled)
state_data['Cluster'] = clusters

print(f"Clusters identified: {optimal_k}\n")
for cluster_id in range(optimal_k):
    members = state_data.loc[state_data['Cluster'] == cluster_id]
    cluster_states = members.index.tolist()
    total = members[crimes].sum().sum()
    print(f"Cluster {cluster_id}: {len(cluster_states)} states, {int(total):,} total crimes")

    # List at most five member states; summarise the remainder as a count.
    member_line = ', '.join(cluster_states[:5])
    n_hidden = len(cluster_states) - 5
    if n_hidden > 0:
        member_line += f"... and {n_hidden} more"
    print(f"  {member_line}")
    print()

In [None]:
# Visualize clusters using PCA
# Project the standardised per-state features onto their first two principal
# components so the cluster assignment can be inspected in 2-D.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled)
var_pc1, var_pc2 = pca.explained_variance_ratio_

fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters, cmap='viridis',
                     s=200, alpha=0.7, edgecolors='black', linewidth=1.5)

# Label each point with its state. Rows of pca_data are in the same order as
# state_data.index because `scaled` was produced from state_data.
for (pc1, pc2), state in zip(pca_data, state_data.index):
    ax.annotate(state, (pc1, pc2),
                fontsize=8, ha='center', va='center', fontweight='bold')

ax.set_xlabel(f'PC1 ({var_pc1:.1%})')
ax.set_ylabel(f'PC2 ({var_pc2:.1%})')
ax.set_title('State Clusters - Principal Component Analysis', fontsize=13, fontweight='bold')

# Cluster ids are categorical labels, so tick the colorbar only at the ids
# that actually occur; the default continuous ticks (0.5, 1.5, ...) do not
# correspond to any cluster.
fig.colorbar(scatter, ax=ax, label='Cluster', ticks=np.unique(clusters))
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Task 3: Crime Type Analysis

In [None]:
# Per-state totals for every crime type.
crime_by_state = df.groupby('State')[crimes].sum()

# For each crime column: which state has the largest total, and that total.
leading_state = crime_by_state.idxmax()
leading_count = crime_by_state.max()

print("State with highest cases by crime type:\n")
for crime in crimes:
    label = crime_labels.get(crime, crime)
    print(f"  {label:25} - {leading_state[crime]:25} ({int(leading_count[crime]):,})")

# Share of each crime type in the grand total, largest first.
crime_totals = df[crimes].sum()
total_crimes = crime_totals.sum()
crime_dist = crime_totals.sort_values(ascending=False)
crime_share = (crime_dist / total_crimes) * 100

print(f"\n\nCrime Type Distribution (out of {int(total_crimes):,} total):")
for (crime, count), pct in zip(crime_dist.items(), crime_share):
    label = crime_labels.get(crime, crime)
    print(f"  {label:25} {pct:5.1f}% ({int(count):,})")

In [None]:
# Heatmap of crime distribution
# Rows: the 15 states with the most cases overall; columns: crime types;
# cell value: total cases of that type in that state.
fig, ax = plt.subplots(figsize=(14, 10))

# Rank states on the same table the heatmap displays, so this cell does not
# depend on state_data, which the clustering cell extends with a 'Cluster'
# column.
top_15_states = crime_by_state[crimes].sum(axis=1).nlargest(15).index

# fmt='d' only formats integers. The counts are float whenever the raw CSV
# contained missing values (fillna(0) keeps the float dtype), which would make
# seaborn raise "Unknown format code 'd'". Round and cast so the annotation
# format is always valid.
hmap_data = crime_by_state.loc[top_15_states, crimes].round().astype(int)

sns.heatmap(hmap_data, annot=True, fmt='d', cmap='YlOrRd',
            cbar_kws={'label': 'Cases'}, ax=ax, linewidths=0.5)
ax.set_title('Crime Distribution - Top 15 States', fontsize=13, fontweight='bold')
ax.set_xlabel('Crime Type')
ax.set_ylabel('State')
plt.tight_layout()
plt.show()

## Trends Over Time

In [None]:
# Yearly totals per crime type, plus the all-crime total for each year.
by_year = df.groupby('Year')[crimes].sum()
years = by_year.index
total_by_year = by_year.sum(axis=1)

fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 10))
ax_total, ax_types = axes

# Upper panel: all crime types combined, drawn as a shaded line.
ax_total.plot(years, total_by_year, marker='o', linewidth=2.5,
              markersize=8, color='darkred')
ax_total.fill_between(years, total_by_year, alpha=0.3, color='red')
ax_total.set_title('Total Crimes Over Time', fontsize=13, fontweight='bold')
ax_total.set_ylabel('Cases')
ax_total.grid(True, alpha=0.3)

# Lower panel: one line per crime type (by_year's columns are in `crimes` order).
for crime, yearly_cases in by_year.items():
    ax_types.plot(years, yearly_cases, marker='o',
                  label=crime_labels.get(crime, crime), linewidth=2)

ax_types.set_title('Crime Type Trends', fontsize=13, fontweight='bold')
ax_types.set_xlabel('Year')
ax_types.set_ylabel('Cases')
ax_types.legend(loc='best', fontsize=10)
ax_types.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Key Findings

In [None]:
# Final text summary. Every number and headline finding is derived from
# objects computed in earlier cells (df, crimes, crime_labels, state_totals,
# total_crimes, optimal_k) instead of being typed in by hand, so the summary
# cannot drift out of sync with the data or the chosen cluster count.
print("="*80)
print("ANALYSIS SUMMARY")
print("="*80)

first_year = df['Year'].min()
last_year = df['Year'].max()
# Count the distinct years present rather than assuming a fixed span.
n_years = df['Year'].nunique()

print(f"\nTotal crimes analyzed: {int(total_crimes):,}")
print(f"Date range: {first_year} - {last_year} ({n_years} years)")
print(f"States/UTs: {df['State'].nunique()}")

print("\nTop 5 high-crime states:")
for rank, (state, count) in enumerate(state_totals.head(5).items(), 1):
    pct = (count / total_crimes) * 100
    print(f"  {rank}. {state:25} {pct:5.1f}% ({int(count):,})")

# Calculate growth between the first and last year in the data.
crimes_first = df[df['Year'] == first_year][crimes].sum().sum()
crimes_last = df[df['Year'] == last_year][crimes].sum().sum()
# A percentage change is undefined when the first year has zero cases.
if crimes_first:
    growth = ((crimes_last - crimes_first) / crimes_first) * 100
    growth_text = f"{growth:+.1f}%"
else:
    growth = float('nan')
    growth_text = "n/a (no cases in first year)"

print(f"\nTemporal changes ({first_year}-{last_year}):")
print(f"  {first_year}: {int(crimes_first):,} cases")
print(f"  {last_year}: {int(crimes_last):,} cases")
print(f"  Growth: {growth_text}")

# Headline findings, computed rather than asserted.
top_state = state_totals.index[0]  # state_totals is sorted in descending order
crime_totals = df[crimes].sum()
top_crime = crime_totals.idxmax()
top_crime_pct = (crime_totals.max() / total_crimes) * 100
if crimes_last > crimes_first:
    direction = "rose"
elif crimes_last < crimes_first:
    direction = "fell"
else:
    direction = "were unchanged"

print("\nKey Insights:")
print(f"- {top_state} has the highest total number of cases")
print(f"- {crime_labels.get(top_crime, top_crime)} is the most prevalent crime type ({top_crime_pct:.0f}%)")
print(f"- {optimal_k} distinct clusters identified based on crime patterns")
print("- Significant regional variations in crime distribution")
print(f"- Reported cases {direction} from {first_year} to {last_year}")

print("="*80)