## **Importing Libraries**

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

## **Loading Data**

In [3]:
# Load the incident records from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer='data.csv')

## **Data Shape**

In [4]:
# Report the size of the dataset: one line for the row count, one for the column count.
print('Number of rows: ', len(df))
print('Number of columns: ', len(df.columns))

Number of rows:  3000
Number of columns:  10


In [None]:
# Preview the first five records to get a feel for the columns and their values.
df.head(n=5)

Unnamed: 0,Country,Year,Attack Type,Target Industry,Financial Loss (in Million $),Number of Affected Users,Attack Source,Security Vulnerability Type,Defense Mechanism Used,Incident Resolution Time (in Hours)
0,China,2019,Phishing,Education,80.53,773169,Hacker Group,Unpatched Software,VPN,63
1,China,2019,Ransomware,Retail,62.19,295961,Hacker Group,Unpatched Software,Firewall,71
2,India,2017,Man-in-the-Middle,IT,38.65,605895,Hacker Group,Weak Passwords,VPN,20
3,UK,2024,Ransomware,Telecommunications,41.44,659320,Nation-state,Social Engineering,AI-based Detection,7
4,Germany,2018,Man-in-the-Middle,IT,74.41,810682,Insider,Social Engineering,VPN,68


## **Summary Statistics**

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Year,Financial Loss (in Million $),Number of Affected Users,Incident Resolution Time (in Hours)
count,3000.0,3000.0,3000.0,3000.0
mean,2019.570333,50.49297,504684.136333,36.476
std,2.857932,28.791415,289944.084972,20.570768
min,2015.0,0.5,424.0,1.0
25%,2017.0,25.7575,255805.25,19.0
50%,2020.0,50.795,504513.0,37.0
75%,2022.0,75.63,758088.5,55.0
max,2024.0,99.99,999635.0,72.0


In [9]:
# Print each column's non-null count and dtype plus the frame's memory usage;
# used here to check for missing values before analysing the data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 10 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Country                              3000 non-null   object 
 1   Year                                 3000 non-null   int64  
 2   Attack Type                          3000 non-null   object 
 3   Target Industry                      3000 non-null   object 
 4   Financial Loss (in Million $)        3000 non-null   float64
 5   Number of Affected Users             3000 non-null   int64  
 6   Attack Source                        3000 non-null   object 
 7   Security Vulnerability Type          3000 non-null   object 
 8   Defense Mechanism Used               3000 non-null   object 
 9   Incident Resolution Time (in Hours)  3000 non-null   int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 234.5+ KB


## **Univariate Analysis**

## **Distribution of Attack Types**

In [17]:
# Count the incidents recorded for each attack type.
# A bare reset_index() yields column names that differ between pandas versions
# ('index'/'Attack Type' vs. 'Attack Type'/'count'), so the axis and the value
# column are named explicitly to give a stable two-column table.
attack_counts = (
    df['Attack Type']
    .value_counts()
    .rename_axis('Attack Type')
    .reset_index(name='Count')
)

# Show the table so this cell produces a visible result.
attack_counts

In [22]:
# Frequency table: one row per attack type with the number of recorded incidents.
attack_counts = (
    df['Attack Type']
    .value_counts()
    .rename_axis('Attack Type')
    .reset_index(name='Count')
)

# One bar per attack type, coloured per category with the reversed Cividis palette
# and annotated with the count on each bar.
fig = px.bar(
    data_frame=attack_counts,
    x='Attack Type',
    y='Count',
    color='Attack Type',
    color_discrete_sequence=px.colors.sequential.Cividis_r,
    text_auto=True,
    title='Distribution of Attack Types',
    width=800,
    height=500,
)

fig.show()

## **Financial Loss Distribution**

In [26]:
# Histogram of the per-incident financial loss, with the bin count capped at 10.
fig = px.histogram(
    data_frame=df,
    x='Financial Loss (in Million $)',
    nbins=10,
    title='Financial Loss Distribution',
    width=800,
    height=500,
)

fig.show()

## **Count of Attacks by Country**

In [36]:
# Frequency table: number of recorded incidents per country, most frequent first.
country_counts = (
    df['Country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Count')
)

# One bar per country, annotated with its incident count.
fig = px.bar(
    data_frame=country_counts,
    x='Country',
    y='Count',
    color='Country',
    text_auto=True,
    title='Number of Attacks by Country',
    width=800,
    height=500,
)

fig.show()

## **Number of Attacks by Year**

In [39]:
# Incidents per year, ordered chronologically so the bars read left to right in time.
year_count = (
    df['Year']
    .value_counts()
    .rename_axis('Year')
    .reset_index(name='Count')
    .sort_values(by='Year')
)

# One bar per year, annotated with its incident count.
fig = px.bar(
    data_frame=year_count,
    x='Year',
    y='Count',
    color='Year',
    text_auto=True,
    title='Number of Attacks by Year',
    width=800,
    height=500,
)

fig.show()

## **Bivariate Analysis**

### **Average Financial Loss by Attack Type**

In [43]:
# Mean financial loss (in million $) per attack type.
attack_loss = df.groupby('Attack Type')['Financial Loss (in Million $)'].mean().reset_index()

# The figure is bound to `fig`, not `plt`: assigning it to `plt` would overwrite
# the `matplotlib.pyplot as plt` alias imported at the top of the notebook and
# break any later matplotlib call.
fig = px.bar(
    attack_loss,
    x='Attack Type',
    y='Financial Loss (in Million $)',
    title='Average Financial Loss by Attack type',
    color='Attack Type',
    text_auto=True,
    width=800,
    height=500,
)

fig.show()

## **Average Number of Affected Users by Attack Type**

In [44]:
# Mean number of affected users for each attack type, returned as a flat table.
df.groupby('Attack Type', as_index=False)['Number of Affected Users'].mean()

Unnamed: 0,Attack Type,Number of Affected Users
0,DDoS,499437.410546
1,Malware,508780.23299
2,Man-in-the-Middle,520064.320261
3,Phishing,487179.536862
4,Ransomware,502825.369168
5,SQL Injection,512469.833002


In [47]:
# Mean financial loss (in million $) for each country.
df.groupby('Country')['Financial Loss (in Million $)'].agg('mean').reset_index()

Unnamed: 0,Country,Financial Loss (in Million $)
0,Australia,51.861953
1,Brazil,50.911677
2,China,48.805943
3,France,49.089443
4,Germany,54.272302
5,India,47.292597
6,Japan,49.827344
7,Russia,49.948237
8,UK,51.411184
9,USA,51.610174


In [48]:
# Mean financial loss (in million $) for each targeted industry.
df.groupby('Target Industry', as_index=False)['Financial Loss (in Million $)'].mean()

Unnamed: 0,Target Industry,Financial Loss (in Million $)
0,Banking,51.17391
1,Education,47.903174
2,Government,52.618685
3,Healthcare,49.047296
4,IT,51.90341
5,Retail,49.928014
6,Telecommunications,50.766973


In [50]:
# Mean incident resolution time (in hours) for each defense mechanism.
df.groupby('Defense Mechanism Used', as_index=False)['Incident Resolution Time (in Hours)'].mean()

Unnamed: 0,Defense Mechanism Used,Incident Resolution Time (in Hours)
0,AI-based Detection,36.61235
1,Antivirus,36.573248
2,Encryption,36.589527
3,Firewall,35.71453
4,VPN,36.864379


In [46]:
# Re-display the first five records as a reference for the column names used below.
df.head(n=5)

Unnamed: 0,Country,Year,Attack Type,Target Industry,Financial Loss (in Million $),Number of Affected Users,Attack Source,Security Vulnerability Type,Defense Mechanism Used,Incident Resolution Time (in Hours)
0,China,2019,Phishing,Education,80.53,773169,Hacker Group,Unpatched Software,VPN,63
1,China,2019,Ransomware,Retail,62.19,295961,Hacker Group,Unpatched Software,Firewall,71
2,India,2017,Man-in-the-Middle,IT,38.65,605895,Hacker Group,Weak Passwords,VPN,20
3,UK,2024,Ransomware,Telecommunications,41.44,659320,Nation-state,Social Engineering,AI-based Detection,7
4,Germany,2018,Man-in-the-Middle,IT,74.41,810682,Insider,Social Engineering,VPN,68


# **Multivariate Analysis**

## **Financial Loss by Vulnerability Type and Attack Source**

In [53]:
# Mean financial loss (in million $) for every combination of vulnerability
# type and attack source.
voulner_Loss = df.groupby(
    ['Security Vulnerability Type', 'Attack Source'],
    as_index=False,
)['Financial Loss (in Million $)'].mean()

# Grouped bars: vulnerability types along the x-axis, one bar per attack source.
plot = px.bar(
    data_frame=voulner_Loss,
    x='Security Vulnerability Type',
    y='Financial Loss (in Million $)',
    color='Attack Source',
    barmode='group',
    text_auto=True,
    title='Financial Loss by Vulnerability Type and Attack Source',
    width=900,
    height=500,
)

plot.show()

## **Trend of Cyber Attacks over the Years**

In [58]:

# Number of recorded incidents per year, in chronological order.
timeline = (
    df.groupby('Year')
    .size()
    .rename('Number of Attacks')
    .reset_index()
)

# Line chart with a marker on every year so individual values are easy to read.
plot = px.line(
    data_frame=timeline,
    x='Year',
    y='Number of Attacks',
    markers=True,
    line_shape='linear',
    title='Trend of Cyber Attacks Over Years',
)

plot.show()

In [64]:
# Derive the loss per affected user for every incident. The loss column is
# expressed in millions of dollars, so it is scaled to dollars before dividing.
# NOTE: this adds a column to `df` in place; a later cell displays that column.
df['Cost Per User'] = (df['Financial Loss (in Million $)'] * 1000000) / df['Number of Affected Users']

# Average of the per-incident ratios for each attack type (not total loss
# divided by total users).
Cost_Per_User = df.groupby('Attack Type')['Cost Per User'].mean().reset_index()

# The chart previously carried the placeholder title 'No Title'; give it a
# descriptive one so the figure stands on its own when skimmed.
plot = px.bar(
    Cost_Per_User,
    x='Attack Type',
    y='Cost Per User',
    color='Attack Type',
    title='Average Cost Per User ($) by Attack Type',
    text_auto=True,
    width=800,
    height=500,
)

plot.show()

## **Incident Response and Resolution**

In [76]:
# Mean / min / max resolution time per defense mechanism, kept as a summary table.
defence_resolution = (
    df.groupby('Defense Mechanism Used')['Incident Resolution Time (in Hours)']
    .agg(['mean', 'min', 'max'])
    .reset_index()
)

# Single-panel figure that collects one box trace per defense mechanism.
fig = make_subplots(rows=1, cols=1)

for defence in df['Defense Mechanism Used'].unique():
    # Resolution times of the incidents handled with this mechanism.
    defence_data = df.loc[
        df['Defense Mechanism Used'] == defence,
        'Incident Resolution Time (in Hours)',
    ]
    # Draw every observation as a jittered point, offset to the side of the box.
    box_trace = go.Box(
        y=defence_data,
        name=defence,
        boxpoints='all',
        jitter=0.3,
        pointpos=1.8,
    )
    fig.add_trace(box_trace)

fig.update_layout(
    title_text='Resolution Time distribution by Defence Mechanism',
    yaxis_title='Incident Resolution Time (hours)',
    showlegend=True,
)

fig.show()

In [66]:
# Preview the first five records, now including the derived 'Cost Per User' column.
df.head(n=5)

Unnamed: 0,Country,Year,Attack Type,Target Industry,Financial Loss (in Million $),Number of Affected Users,Attack Source,Security Vulnerability Type,Defense Mechanism Used,Incident Resolution Time (in Hours),Cost Per User
0,China,2019,Phishing,Education,80.53,773169,Hacker Group,Unpatched Software,VPN,63,104.155754
1,China,2019,Ransomware,Retail,62.19,295961,Hacker Group,Unpatched Software,Firewall,71,210.129037
2,India,2017,Man-in-the-Middle,IT,38.65,605895,Hacker Group,Weak Passwords,VPN,20,63.789931
3,UK,2024,Ransomware,Telecommunications,41.44,659320,Nation-state,Social Engineering,AI-based Detection,7,62.852636
4,Germany,2018,Man-in-the-Middle,IT,74.41,810682,Insider,Social Engineering,VPN,68,91.786915
