In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# 1. Load and clean data
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this exact file exists.
file_path = r"C:\Users\91944\Desktop\Research\Crime\All datasets\cyber-crimes-from-ncrb-master-data-year-state-and-city-wise-total-number-of-cyber-crimes-committed-in-india.csv"
df = pd.read_csv(file_path)

# Keep 2018-2022 (both ends inclusive), drop the 'Total Cities' rows
# (presumably the dataset's aggregate row -- confirm against the CSV), and
# drop rows that have no state or no value.
in_study_period = df['year'].between(2018, 2022)
is_single_city = df['city'] != 'Total Cities'
# .copy() makes df an independent frame rather than a slice of the raw data,
# so columns added to it later are written without SettingWithCopyWarning.
df = df[in_study_period & is_single_city].dropna(subset=['state', 'value']).copy()

# Region membership: the states that make up each side of the West/East comparison.
western_states = ['Maharashtra', 'Gujarat', 'Rajasthan']
eastern_states = ['West Bengal', 'Bihar', 'Odisha', 'Jharkhand']


def get_region(state):
    """Return 'West' or 'East' for a listed state name, otherwise None.

    Matching is exact (case- and whitespace-sensitive) against
    ``western_states`` and ``eastern_states``; the western list is checked
    first.
    """
    for label, members in (('West', western_states), ('East', eastern_states)):
        if state in members:
            return label
    return None
# Label every row with its region, then keep only the rows whose state
# belongs to one of the two regions (get_region returns None for the rest).
region_labels = df['state'].apply(get_region)
df['region'] = region_labels
df = df[region_labels.notna()]

# 2. Horizontal bar chart of total cybercrimes per state (bar-chart stand-in
#    for a choropleth), largest total first.
state_totals = df.groupby('state')['value'].sum().sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(10, 6))
# hue mirrors the y variable only so that each bar gets its own palette
# colour; the legend would be redundant and is switched off.
sns.barplot(
    x=state_totals.values,
    y=state_totals.index,
    hue=state_totals.index,
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_title('Cybercrime Density (2018-2022) in Western & Eastern States')
ax.set_xlabel('Total Cybercrimes')
ax.set_ylabel('State')
fig.tight_layout()
fig.savefig('choropleth_density_proxy.png', dpi=300)
plt.close(fig)

# 3. Scatterplot Matrix with Regression Lines
# One row per (state, year) holding that state's summed cybercrime count.
pivot = df.pivot_table(index=['state','year'], values='value', aggfunc='sum').reset_index()
# kind='reg' overlays a fitted line per state on the off-diagonal panels; the
# default kind draws points only, so the regression lines named in this
# section's heading were never actually plotted.
sns.pairplot(pivot, hue='state', kind='reg', diag_kind='kde')
plt.suptitle('Scatterplot Matrix of Cybercrimes by State and Year', y=1.02)
# The title is placed above the figure canvas (y=1.02). bbox_inches='tight'
# expands the saved area to include it; without it the title is cropped out
# of the PNG.
plt.savefig('scatterplot_matrix.png', dpi=300, bbox_inches='tight')
plt.close()

# 4. Time Series Comparison with Regional Aggregation
# One row per (region, year) with the summed cybercrime count; this frame is
# also the input of the regression section further down.
region_year = df.groupby(['region', 'year'])['value'].sum().reset_index()
plt.figure(figsize=(8,5))
sns.lineplot(data=region_year, x='year', y='value', hue='region', marker='o')
# Years are discrete: pin exactly one tick per observed year, otherwise the
# automatic locator can label fractional positions such as 2018.5.
plt.xticks(sorted(region_year['year'].unique()))
plt.title('Time Series of Cybercrimes: West vs East (2018-2022)')
plt.xlabel('Year')
plt.ylabel('Total Cybercrimes')
plt.tight_layout()
plt.savefig('regional_timeseries.png', dpi=300)
plt.close()

# 5. Stacked bars: one bar per (year, region), split into per-city segments
stacked = df.groupby(['region', 'year', 'city'])['value'].sum().reset_index()
# pivot_table aggregates with its default (mean), but every (year, region,
# city) cell holds a single already-summed row, so values pass through
# unchanged; combinations with no data become 0.
pivot_stacked = stacked.pivot_table(
    index=['year', 'region'],
    columns='city',
    values='value',
    fill_value=0,
)
ax = pivot_stacked.plot.bar(stacked=True, figsize=(14, 7), colormap='tab20')
ax.set_title('Stacked Bar Chart of Cybercrime Distribution by City (Grouped by Region)')
ax.set_xlabel('Year, Region')
ax.set_ylabel('Number of Cybercrimes')
fig = ax.get_figure()
fig.tight_layout()
fig.savefig('stacked_bar_by_region.png', dpi=300)
plt.close(fig)

# 6. Multiple regression: yearly regional total ~ year + region dummy
# region_code is a 0/1 dummy (West=0, East=1), so its coefficient is the
# fitted East-minus-West offset at a given year.
reg_data = region_year.assign(
    region_code=region_year['region'].map({'West': 0, 'East': 1})
)
X = reg_data[['year', 'region_code']]
y = reg_data['value']
model = LinearRegression().fit(X, y)
# coef_ follows the column order of X: year first, then region_code.
year_coef, region_coef = model.coef_
print("Regression coefficients:")
print(f"Intercept: {model.intercept_:.2f}")
print(f"Year coefficient: {year_coef:.2f}")
print(f"Region (East=1) coefficient: {region_coef:.2f}")

Regression coefficients:
Intercept: -832446.40
Year coefficient: 414.15
Region (East=1) coefficient: -3773.80
