In [1]:
#command installs the pandas, sqlalchemy, and mysql-connector-python libraries
!pip install pandas sqlalchemy mysql-connector-python

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels


In [2]:
import pandas as pd
#establish a connection between Python and a database
from sqlalchemy import create_engine
import mysql.connector
import numpy as np
import random
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

In [3]:
import os

In [4]:
file_path=r"Global_Cybersecurity_Threats_2015-2024.csv"

In [5]:
if os.path.exists(file_path):
    try:
        df = pd.read_csv(file_path)
        print("File loaded successfully!")
        print(df.head())
    except FileNotFoundError:
        print("Error: File not found.")

    except PermissionError:
        print("Error: You don’t have permission to access this file.")

    except pd.errors.EmptyDataError:
        print("Error: The file is empty.")

    except pd.errors.ParserError:
        print("Error: Could not parse the file (might be corrupted or not CSV).")

    except ValueError as ve:
        print("ValueError:", ve)

    except Exception as e:
        print("An unexpected error occurred:", e)

else:
    print("Error: File path does not exist.")   

File loaded successfully!
   Country  Year        Attack Type     Target Industry  \
0    China  2019           Phishing           Education   
1    China  2019         Ransomware              Retail   
2    India  2017  Man-in-the-Middle                  IT   
3       UK  2024         Ransomware  Telecommunications   
4  Germany  2018  Man-in-the-Middle                  IT   

   Financial Loss (in Million $)  Number of Affected Users Attack Source  \
0                          80.53                    773169  Hacker Group   
1                          62.19                    295961  Hacker Group   
2                          38.65                    605895  Hacker Group   
3                          41.44                    659320  Nation-state   
4                          74.41                    810682       Insider   

  Security Vulnerability Type Defense Mechanism Used  \
0          Unpatched Software                    VPN   
1          Unpatched Software               Firewa

In [6]:
#Removing duplicates
df=df.drop_duplicates()

In [7]:
#Removes rows with NaN values.
df=df.dropna()

In [8]:
df = df.reset_index(drop=True)    #Cleans up the index after any rows or columns have been removed or re-ordered
df.to_csv('CleanedGlobalCyberSecutity.csv', index=False)     #Saves the cleaned dataset as a CSV file
print("Cleaned dataset saved as 'CleanedGlobalCyberSecutity.csv'")

Cleaned dataset saved as 'CleanedGlobalCyberSecutity.csv'


In [9]:
#replaces all missing (NaN) values in your DataFrame with the string "Not Mentioned"
df=df.fillna("Not Mentioned")

In [10]:
df["Country"]=df["Country"].str.title()

In [11]:
df["Attack Type"]=df["Attack Type"].str.lower().str.strip()
df["Target Industry"]=df["Target Industry"].str.title()
df["Attack Source"]=df["Attack Source"].str.title()
df["Defense Mechanism Used"]=df["Defense Mechanism Used"].str.title()

In [12]:
df=df.rename(columns={
    "Financial Loss (in Million $)":"Financial_Loss",
    "Number of Affected Users":"Affected_Users",
    "Incident Resolution Time (in Hours)":"Resolution_Hours",
    "Attack Type":"Attack_Type",
    "Target Industry":"Target_Industry",
    "Attack Source":"Attack_Source",
    "Security Vulnerability Type":"Vulnerability_Type",
    "Defense Mechanism Used":"Defense Mechanism"
})

In [13]:
print(df.head())
print(df.info())

   Country  Year        Attack_Type     Target_Industry  Financial_Loss  \
0    China  2019           phishing           Education           80.53   
1    China  2019         ransomware              Retail           62.19   
2    India  2017  man-in-the-middle                  It           38.65   
3       Uk  2024         ransomware  Telecommunications           41.44   
4  Germany  2018  man-in-the-middle                  It           74.41   

   Affected_Users Attack_Source  Vulnerability_Type   Defense Mechanism  \
0          773169  Hacker Group  Unpatched Software                 Vpn   
1          295961  Hacker Group  Unpatched Software            Firewall   
2          605895  Hacker Group      Weak Passwords                 Vpn   
3          659320  Nation-State  Social Engineering  Ai-Based Detection   
4          810682       Insider  Social Engineering                 Vpn   

   Resolution_Hours  
0                63  
1                71  
2                20  
3         

In [14]:
import mysql.connector
from mysql.connector import Error
def create_connection():
    try:
        connection = mysql.connector.connect(
            host='localhost',  # Use your MySQL host (e.g., localhost, or an IP address)
            user='root',
            password='siri%40123',
            database='CyberSecurity_DB'  # Database name to use (create it if not present)
        )

        if connection.is_connected():
            print("Successfully connected to the MySQL database.")
        return connection
    except Error as e:
        print(f"Error: {e}")
        return None

In [15]:
def create_tables(connection):
    cursor = connection.cursor()
    try:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS countries (
    country_id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(100) UNIQUE NOT NULL
);
        """)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS attack_types (
    attack_type_id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(100) UNIQUE NOT NULL
);
        """)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS industries (
    industry_id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(100) UNIQUE NOT NULL
);
        """)
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS cyber_threats (
    attack_id INT AUTO_INCREMENT PRIMARY KEY,
    country_id INT,
    attack_type_id INT,
    industry_id INT,
    year INT,
    financial_loss DECIMAL(15, 2),
    affected_users INT,
    resolution_time_hours INT,
    description TEXT,
    FOREIGN KEY (country_id) REFERENCES countries(country_id),
    FOREIGN KEY (attack_type_id) REFERENCES attack_types(attack_type_id),
    FOREIGN KEY (industry_id) REFERENCES industries(industry_id)
);
        """)
        connection.commit()
        print("Tables created successfully.")
    except Error as e:
        print(f"Error creating tables: {e}")


In [16]:
df = pd.read_csv("CleanedGlobalCyberSecutity.csv")  # Replace with your actual cleaned CSV path
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 10 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Country                              3000 non-null   object 
 1   Year                                 3000 non-null   int64  
 2   Attack Type                          3000 non-null   object 
 3   Target Industry                      3000 non-null   object 
 4   Financial Loss (in Million $)        3000 non-null   float64
 5   Number of Affected Users             3000 non-null   int64  
 6   Attack Source                        3000 non-null   object 
 7   Security Vulnerability Type          3000 non-null   object 
 8   Defense Mechanism Used               3000 non-null   object 
 9   Incident Resolution Time (in Hours)  3000 non-null   int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 234.5+ KB
None


Unnamed: 0,Country,Year,Attack Type,Target Industry,Financial Loss (in Million $),Number of Affected Users,Attack Source,Security Vulnerability Type,Defense Mechanism Used,Incident Resolution Time (in Hours)
0,China,2019,Phishing,Education,80.53,773169,Hacker Group,Unpatched Software,VPN,63
1,China,2019,Ransomware,Retail,62.19,295961,Hacker Group,Unpatched Software,Firewall,71
2,India,2017,Man-in-the-Middle,IT,38.65,605895,Hacker Group,Weak Passwords,VPN,20
3,UK,2024,Ransomware,Telecommunications,41.44,659320,Nation-state,Social Engineering,AI-based Detection,7
4,Germany,2018,Man-in-the-Middle,IT,74.41,810682,Insider,Social Engineering,VPN,68


In [17]:
#Top countries affected by cyber attacks
top_countries = df['Country'].value_counts().head(10)
print(f"Top 10 Countries Affected:\n {top_countries}")

Top 10 Countries Affected:
 Country
UK           321
Brazil       310
India        308
France       305
Japan        305
Australia    297
Russia       295
Germany      291
USA          287
China        281
Name: count, dtype: int64


In [18]:
#Frequency of different types of threats (e.g., phishing, malware, ransomware)
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_").str.replace(r"\(.*?\)", "", regex=True)
print(df.columns)
top_threats = df['attack_type'].value_counts()
print(f"Threat Frequency:\n{top_threats}")

Index(['country', 'year', 'attack_type', 'target_industry', 'financial_loss_',
       'number_of_affected_users', 'attack_source',
       'security_vulnerability_type', 'defense_mechanism_used',
       'incident_resolution_time_'],
      dtype='object')
Threat Frequency:
attack_type
DDoS                 531
Phishing             529
SQL Injection        503
Ransomware           493
Malware              485
Man-in-the-Middle    459
Name: count, dtype: int64


In [19]:
#Year-over-year trends in global cybersecurity incidents
yearly_trends = df['year'].value_counts().sort_index()
print(f"Year-wise Attack Count:\n {yearly_trends}")

Year-wise Attack Count:
 year
2015    277
2016    285
2017    319
2018    310
2019    263
2020    315
2021    299
2022    318
2023    315
2024    299
Name: count, dtype: int64


In [20]:
#Severity levels and their impact by region
severity_by_country = df.groupby('country')['financial_loss_'].sum().sort_values(ascending=False)
print(f"Total Financial Loss by Country:\n {severity_by_country.head(10)}")

Total Financial Loss by Country:
 country
UK           16502.99
Germany      15793.24
Brazil       15782.62
Australia    15403.00
Japan        15197.34
France       14972.28
USA          14812.12
Russia       14734.73
India        14566.12
China        13714.47
Name: financial_loss_, dtype: float64


In [21]:
#Correlation between attack type and sector targeted
attack_vs_sector = df.groupby(['attack_type', 'target_industry']).size().sort_values(ascending=False)
print(f"Attack Type vs Target Industry:\n {attack_vs_sector.head(10)}")

Attack Type vs Target Industry:
 attack_type        target_industry   
Phishing           Banking               96
DDoS               IT                    91
Phishing           Retail                89
                   IT                    89
DDoS               Telecommunications    85
Malware            Healthcare            81
Man-in-the-Middle  IT                    80
SQL Injection      Telecommunications    78
DDoS               Healthcare            78
SQL Injection      IT                    77
dtype: int64


In [22]:
#Summary statistics
from tabulate import tabulate
summary = df.describe()
print("Summary Statistics:")
print(tabulate(summary, headers='keys', tablefmt='grid'))

Summary Statistics:
+-------+------------+-------------------+----------------------------+-----------------------------+
|       |       year |   financial_loss_ |   number_of_affected_users |   incident_resolution_time_ |
| count | 3000       |         3000      |                       3000 |                   3000      |
+-------+------------+-------------------+----------------------------+-----------------------------+
| mean  | 2019.57    |           50.493  |                     504684 |                     36.476  |
+-------+------------+-------------------+----------------------------+-----------------------------+
| std   |    2.85793 |           28.7914 |                     289944 |                     20.5708 |
+-------+------------+-------------------+----------------------------+-----------------------------+
| min   | 2015       |            0.5    |                        424 |                      1      |
+-------+------------+-------------------+--------------------

In [23]:
#Most frequent threat types
from tabulate import tabulate
attack_counts = df['attack_type'].value_counts().head(10)
attack_table = list(zip(attack_counts.index, attack_counts.values))   # Convert to a list of tuples
print("Most Frequent Attack Types:")
print(tabulate(attack_table, headers=["Attack Type", "Count"], tablefmt="fancy_grid"))

Most Frequent Attack Types:
╒═══════════════════╤═════════╕
│ Attack Type       │   Count │
╞═══════════════════╪═════════╡
│ DDoS              │     531 │
├───────────────────┼─────────┤
│ Phishing          │     529 │
├───────────────────┼─────────┤
│ SQL Injection     │     503 │
├───────────────────┼─────────┤
│ Ransomware        │     493 │
├───────────────────┼─────────┤
│ Malware           │     485 │
├───────────────────┼─────────┤
│ Man-in-the-Middle │     459 │
╘═══════════════════╧═════════╛


In [24]:
#Region-wise breakdowns
from tabulate import tabulate
country_counts = df['country'].value_counts().head(10)
country_table = list(zip(country_counts.index, country_counts.values))
print("Region-Wise Attack Count:")
print(tabulate(country_table, headers=["Country", "Attack Count"], tablefmt="fancy_grid"))

Region-Wise Attack Count:
╒═══════════╤════════════════╕
│ Country   │   Attack Count │
╞═══════════╪════════════════╡
│ UK        │            321 │
├───────────┼────────────────┤
│ Brazil    │            310 │
├───────────┼────────────────┤
│ India     │            308 │
├───────────┼────────────────┤
│ France    │            305 │
├───────────┼────────────────┤
│ Japan     │            305 │
├───────────┼────────────────┤
│ Australia │            297 │
├───────────┼────────────────┤
│ Russia    │            295 │
├───────────┼────────────────┤
│ Germany   │            291 │
├───────────┼────────────────┤
│ USA       │            287 │
├───────────┼────────────────┤
│ China     │            281 │
╘═══════════╧════════════════╛


In [25]:
#year-wise breakdowns
from tabulate import tabulate
year_counts = df['year'].value_counts().sort_index()
year_table = list(zip(year_counts.index, year_counts.values))
print("Year-wise Breakdown of Incidents:")
print(tabulate(year_table, headers=["Year", "Incident Count"], tablefmt="fancy_grid"))

Year-wise Breakdown of Incidents:
╒════════╤══════════════════╕
│   Year │   Incident Count │
╞════════╪══════════════════╡
│   2015 │              277 │
├────────┼──────────────────┤
│   2016 │              285 │
├────────┼──────────────────┤
│   2017 │              319 │
├────────┼──────────────────┤
│   2018 │              310 │
├────────┼──────────────────┤
│   2019 │              263 │
├────────┼──────────────────┤
│   2020 │              315 │
├────────┼──────────────────┤
│   2021 │              299 │
├────────┼──────────────────┤
│   2022 │              318 │
├────────┼──────────────────┤
│   2023 │              315 │
├────────┼──────────────────┤
│   2024 │              299 │
╘════════╧══════════════════╛


In [26]:
print(df.columns.tolist())

['country', 'year', 'attack_type', 'target_industry', 'financial_loss_', 'number_of_affected_users', 'attack_source', 'security_vulnerability_type', 'defense_mechanism_used', 'incident_resolution_time_']


In [27]:
#Longest Resolution Time (Top 3)
import pandas as pd
from tabulate import tabulate
columns_to_display = ['incident_id', 'country', 'year', 'financial_loss_']
top_resolution_time = df.sort_values(by='incident_resolution_time_', ascending=False).head(3)[['country', 'year', 'attack_type', 'target_industry', 'financial_loss_', 'number_of_affected_users', 'attack_source', 'security_vulnerability_type', 'defense_mechanism_used', 'incident_resolution_time_']]
print("\nTop 3 Incidents by Longest Resolution Time:")
print(tabulate(top_resolution_time, headers='keys', tablefmt='fancy_grid', showindex=False))


Top 3 Incidents by Longest Resolution Time:
╒═══════════╤════════╤═══════════════╤═══════════════════╤═══════════════════╤════════════════════════════╤═════════════════╤═══════════════════════════════╤══════════════════════════╤═════════════════════════════╕
│ country   │   year │ attack_type   │ target_industry   │   financial_loss_ │   number_of_affected_users │ attack_source   │ security_vulnerability_type   │ defense_mechanism_used   │   incident_resolution_time_ │
╞═══════════╪════════╪═══════════════╪═══════════════════╪═══════════════════╪════════════════════════════╪═════════════════╪═══════════════════════════════╪══════════════════════════╪═════════════════════════════╡
│ Germany   │   2018 │ Phishing      │ Government        │             46.92 │                     610201 │ Nation-state    │ Weak Passwords                │ VPN                      │                          72 │
├───────────┼────────┼───────────────┼───────────────────┼───────────────────┼─────────────────