In [13]:
# Import libraries
import os
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

In [14]:
# Installs nbformat into the kernel's environment (%pip targets the running kernel).
# NOTE(review): presumably needed so plotly's fig.show() can render inside the
# notebook — confirm, and consider pinning a version once the working one is known.
%pip install nbformat


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [15]:
# Cell 2: Load and inspect the raw data, print summary statistics, clean the
# data, then plot global yearly trends.

# Single source of truth for where the project lives. The default is the
# location this notebook was written against; set CYBER_PROJECT_ROOT in the
# environment to run the notebook from a different checkout.
PROJECT_ROOT = Path(os.environ.get(
    "CYBER_PROJECT_ROOT",
    "/Users/ravikantsaraf/Desktop/data Analysis Project",
))
RAW_CSV = PROJECT_ROOT / "data" / "raw" / "cyber_threats.csv"
CLEANED_CSV = PROJECT_ROOT / "data" / "processed" / "cyber_threats_cleaned.csv"
SRC_DIR = PROJECT_ROOT / "src"

# --- Inspect the raw data -------------------------------------------------
df = pd.read_csv(RAW_CSV)
print("Data shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())

print("Summary statistics:")
print(df.describe())

# --- Clean ------------------------------------------------------------------
# Make the project's src/ directory importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
if str(SRC_DIR) not in sys.path:
    sys.path.append(str(SRC_DIR))

# Project-local import: must come after the sys.path setup above.
from data_cleaner import clean_cyber_data

# Paths are passed as plain strings, exactly as before, because the types
# clean_cyber_data accepts are not visible from this notebook.
# NOTE(review): the arguments look like (input CSV, output CSV) — confirm
# against data_cleaner. The return value is used below as a DataFrame.
cleaned_df = clean_cyber_data(str(RAW_CSV), str(CLEANED_CSV))

# --- Global trends over time ------------------------------------------------
# Per year: total financial loss, total affected users, mean resolution time.
yearly_stats = cleaned_df.groupby('Year').agg({
    'Financial Loss (in Million $)': 'sum',
    'Number of Affected Users': 'sum',
    'Incident Resolution Time (in Hours)': 'mean'
}).reset_index()

# 2x2 grid holding three panels; the bottom-right cell is left empty.
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Total Financial Loss by Year', 'Total Affected Users by Year',
                    'Mean Resolution Time by Year')
)

fig.add_trace(
    go.Bar(x=yearly_stats['Year'], y=yearly_stats['Financial Loss (in Million $)'], name="Financial Loss"),
    row=1, col=1
)

fig.add_trace(
    go.Bar(x=yearly_stats['Year'], y=yearly_stats['Number of Affected Users'], name="Affected Users"),
    row=1, col=2
)

fig.add_trace(
    go.Scatter(x=yearly_stats['Year'], y=yearly_stats['Incident Resolution Time (in Hours)'],
               name="Resolution Time", mode='lines+markers'),
    row=2, col=1
)

# The legend is hidden because each panel already carries its own title.
fig.update_layout(height=600, showlegend=False, title_text="Global Cybersecurity Trends Over Time")
fig.show()





Data shape: (3000, 10)

Column names:
['Country', 'Year', 'Attack Type', 'Target Industry', 'Financial Loss (in Million $)', 'Number of Affected Users', 'Attack Source', 'Security Vulnerability Type', 'Defense Mechanism Used', 'Incident Resolution Time (in Hours)']

Data types:
Country                                 object
Year                                     int64
Attack Type                             object
Target Industry                         object
Financial Loss (in Million $)          float64
Number of Affected Users                 int64
Attack Source                           object
Security Vulnerability Type             object
Defense Mechanism Used                  object
Incident Resolution Time (in Hours)      int64
dtype: object

Missing values:
Country                                0
Year                                   0
Attack Type                            0
Target Industry                        0
Financial Loss (in Million $)          0
Number of Affec

In [16]:
# --- Countries: the ten with the largest summed financial loss ---
country_loss = (
    cleaned_df
    .groupby('Country', as_index=False)['Financial Loss (in Million $)']
    .sum()
    .sort_values('Financial Loss (in Million $)', ascending=False)
    .head(10)
)

fig = px.bar(
    data_frame=country_loss,
    x='Country',
    y='Financial Loss (in Million $)',
    title='Top 10 Countries by Financial Loss from Cyber Attacks',
)
fig.show()

# --- Industries: per-industry totals (loss, users) and mean resolution time ---
industry_aggregations = {
    'Financial Loss (in Million $)': 'sum',
    'Number of Affected Users': 'sum',
    'Incident Resolution Time (in Hours)': 'mean',
}
industry_stats = cleaned_df.groupby('Target Industry', as_index=False).agg(industry_aggregations)

# Only the summed financial loss sizes the treemap tiles.
fig = px.treemap(
    data_frame=industry_stats,
    path=['Target Industry'],
    values='Financial Loss (in Million $)',
    title='Financial Loss by Industry',
)
fig.show()



In [17]:
# --- Attack types: number of incidents per type, most frequent first ---
attack_type_stats = (
    cleaned_df
    .groupby('Attack Type')
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)

fig = px.pie(
    data_frame=attack_type_stats,
    names='Attack Type',
    values='Count',
    title='Distribution of Attack Types',
)
fig.show()

# --- Defense mechanisms: mean resolution time, shortest first ---
defense_stats = (
    cleaned_df
    .groupby('Defense Mechanism Used', as_index=False)['Incident Resolution Time (in Hours)']
    .mean()
    .sort_values('Incident Resolution Time (in Hours)')
)

fig = px.bar(
    data_frame=defense_stats,
    x='Defense Mechanism Used',
    y='Incident Resolution Time (in Hours)',
    title='Average Resolution Time by Defense Mechanism',
)
fig.show()

# --- Pairwise correlation between the three numeric measures ---
numeric_cols = [
    'Financial Loss (in Million $)',
    'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
]
corr_matrix = cleaned_df[numeric_cols].corr()

# Colour range is pinned to [-1, 1] so the diverging scale is centred on zero.
fig = go.Figure()
fig.add_trace(
    go.Heatmap(
        x=corr_matrix.columns,
        y=corr_matrix.columns,
        z=corr_matrix.values,
        zmin=-1,
        zmax=1,
        colorscale='RdBu_r',
    )
)
fig.update_layout(title='Correlation Matrix of Numerical Variables')
fig.show()