In [8]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [9]:
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

In [11]:
df = pd.read_csv("cybersecurity_attacks.csv")

In [12]:
df.head().T

Unnamed: 0,0,1,2,3,4
Timestamp,2023-05-30 06:33:58,2020-08-26 07:08:30,2022-11-13 08:23:25,2023-07-02 10:38:46,2023-07-16 13:11:07
Source IP Address,103.216.15.12,78.199.217.198,63.79.210.48,163.42.196.10,71.166.185.76
Destination IP Address,84.9.164.252,66.191.137.154,198.219.82.17,101.228.192.255,189.243.174.238
Source Port,31225,17245,16811,20018,6131
Destination Port,17616,48166,53600,32534,26646
Protocol,ICMP,ICMP,UDP,UDP,TCP
Packet Length,503,1174,306,385,1462
Packet Type,Data,Data,Control,Data,Data
Traffic Type,HTTP,HTTP,HTTP,HTTP,DNS
Payload Data,Qui natus odio asperiores nam. Optio nobis ius...,Aperiam quos modi officiis veritatis rem. Omni...,Perferendis sapiente vitae soluta. Hic delectu...,Totam maxime beatae expedita explicabo porro l...,Odit nesciunt dolorem nisi iste iusto. Animi v...


In [13]:
# List Columns
df.columns

Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
       'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
       'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
       'Action Taken', 'Severity Level', 'User Information',
       'Device Information', 'Network Segment', 'Geo-location Data',
       'Proxy Information', 'Firewall Logs', 'IDS/IPS Alerts', 'Log Source'],
      dtype='object')

In [14]:
# Shape of data
print(f"There are {df.shape[0]}, row and {df.shape[1]} columns in the dataset")

There are 30536, row and 25 columns in the dataset


In [15]:
# Dataset Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30536 entries, 0 to 30535
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Timestamp               30536 non-null  object 
 1   Source IP Address       30536 non-null  object 
 2   Destination IP Address  30536 non-null  object 
 3   Source Port             30536 non-null  int64  
 4   Destination Port        30536 non-null  int64  
 5   Protocol                30536 non-null  object 
 6   Packet Length           30536 non-null  int64  
 7   Packet Type             30536 non-null  object 
 8   Traffic Type            30536 non-null  object 
 9   Payload Data            30536 non-null  object 
 10  Malware Indicators      15253 non-null  object 
 11  Anomaly Scores          30535 non-null  float64
 13  Attack Type             30535 non-null  object 
 14  Attack Signature        30535 non-null  object 
 15  Action Taken            30535 non-null

In [16]:
df.isnull().sum().sort_values(ascending=False)

IDS/IPS Alerts            15293
Firewall Logs             15291
Malware Indicators        15283
Proxy Information         15124
Attack Type                   1
Geo-location Data             1
Network Segment               1
Device Information            1
User Information              1
Severity Level                1
Action Taken                  1
Attack Signature              1
Log Source                    1
Anomaly Scores                1
Source IP Address             0
Payload Data                  0
Traffic Type                  0
Packet Type                   0
Packet Length                 0
Protocol                      0
Destination Port              0
Source Port                   0
Destination IP Address        0
Timestamp                     0
dtype: int64

In [17]:
# Missing Value by Percentage
df.isnull().sum() / len(df) * 100

Timestamp                  0.000000
Source IP Address          0.000000
Destination IP Address     0.000000
Source Port                0.000000
Destination Port           0.000000
Protocol                   0.000000
Packet Length              0.000000
Packet Type                0.000000
Traffic Type               0.000000
Payload Data               0.000000
Malware Indicators        50.049122
Anomaly Scores             0.003275
Attack Type                0.003275
Attack Signature           0.003275
Action Taken               0.003275
Severity Level             0.003275
User Information           0.003275
Device Information         0.003275
Network Segment            0.003275
Geo-location Data          0.003275
Proxy Information         49.528425
Firewall Logs             50.075321
IDS/IPS Alerts            50.081871
Log Source                 0.003275
dtype: float64

In [18]:
# Determine recent activity
df['Alerts/Warnings'] = df['Alerts/Warnings'].apply(lambda x: 'yes' if x == 'Alert Triggered' else 'no')

In [19]:
df['Malware Indicators'] = df['Malware Indicators'].apply(lambda x: 'No Detection' if pd.isna(x) else x)

In [20]:
df['Proxy Information'] = df['Proxy Information'].apply(lambda x: 'No proxy' if pd.isna(x) else x)

In [21]:
df['Firewall Logs'] = df['Firewall Logs'].apply(lambda x: 'No Data' if pd.isna(x) else x)

In [22]:
df['IDS/IPS Alerts'] = df['IDS/IPS Alerts'].apply(lambda x: 'No Data' if pd.isna(x) else x)

In [23]:
df.isnull().sum().sort_values(ascending=False)

Log Source                1
Attack Type               1
Geo-location Data         1
Network Segment           1
Device Information        1
User Information          1
Severity Level            1
Action Taken              1
Anomaly Scores            1
Attack Signature          1
IDS/IPS Alerts            0
Firewall Logs             0
Proxy Information         0
Timestamp                 0
Source IP Address         0
Malware Indicators        0
Payload Data              0
Traffic Type              0
Packet Type               0
Packet Length             0
Protocol                  0
Destination Port          0
Source Port               0
Destination IP Address    0
dtype: int64

In [24]:
df['Device Information'].value_counts()

Device Information
Mozilla/5.0 (compatible; MSIE 5.0; Windows 98; Trident/4.1)                                                                                   28
Mozilla/5.0 (compatible; MSIE 6.0; Windows CE; Trident/4.0)                                                                                   28
Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0; Trident/3.0)                                                                               26
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 5.01; Trident/4.1)                                                                              25
Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 6.2; Trident/3.0)                                                                               25
                                                                                                                                              ..
Mozilla/5.0 (iPad; CPU iPad OS 12_4_8 like Mac OS X) AppleWebKit/531.0 (KHTML, like Gecko) FxiOS/13.4c0878.0 Mo

In [25]:
# Extract 'Device'
df['Browser'] = df['Device Information'].str.split('/').str[0]

In [26]:
df['Browser']

0        Mozilla
1        Mozilla
2        Mozilla
3        Mozilla
4        Mozilla
          ...   
30531      Opera
30532    Mozilla
30533      Opera
30534    Mozilla
30535        NaN
Name: Browser, Length: 30536, dtype: object

In [57]:
import re
import pandas as pd

# Sample DataFrame for demonstration
data = {
    'Device Information': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Linux; Android 10)',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2)',
        None,
        'Mozilla/5.0 (iPad; CPU OS 13_2_3 like Mac OS X)',
    ]
}

df = pd.DataFrame(data)

# OS and device patterns to search for
patterns = [
    r'Windows',
    r'Linux',
    r'Android',
    r'iPad',
    r'iPod',
    r'iPhone',
    r'Macintosh',
]

def extract_device_or_os(user_agent):
    if not isinstance(user_agent, str):
        return 'Unknown'
    for pattern in patterns:
        match = re.search(pattern, user_agent, re.I)  # re.I makes the search case-insensitive
        if match:
            return match.group()
    return 'Unknown'  # Return 'Unknown' if no patterns match

# Extract device or OS
df['Device/OS'] = df['Device Information'].apply(extract_device_or_os)

# Display the DataFrame
print(df)


                                  Device Information  Device/OS
0          Mozilla/5.0 (Windows NT 10.0; Win64; x64)    Windows
1                    Mozilla/5.0 (Linux; Android 10)      Linux
2  Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...     iPhone
3    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2)  Macintosh
4                                               None    Unknown
5    Mozilla/5.0 (iPad; CPU OS 13_2_3 like Mac OS X)       iPad


In [29]:
df['Browser'].value_counts()

Browser
Mozilla    24352
Opera       6183
Name: count, dtype: int64

In [58]:
import re
import pandas as pd

# Sample DataFrame for demonstration
data = {
    'Device Information': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Linux; Android 10)',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2)',
        None,
        'Mozilla/5.0 (iPad; CPU OS 13_2_3 like Mac OS X)',
    ]
}

df = pd.DataFrame(data)

# OS and device patterns to search for
patterns = [
    r'Windows',
    r'Linux',
    r'Android',
    r'iPad',
    r'iPod',
    r'iPhone',
    r'Macintosh',
]

def extract_device_or_os(user_agent):
    if not isinstance(user_agent, str):
        return 'Unknown'
    for pattern in patterns:
        match = re.search(pattern, user_agent, re.I)  # re.I makes the search case-insensitive
        if match:
            return match.group()
    return 'Unknown'  # Return 'Unknown' if no patterns match

# Extract device or OS
df['Device/OS'] = df['Device Information'].apply(extract_device_or_os)

# Check the value counts for the 'Device/OS' column
device_os_counts = df['Device/OS'].value_counts()

# Display the value counts
print(device_os_counts)


Device/OS
Windows      1
Linux        1
iPhone       1
Macintosh    1
Unknown      1
iPad         1
Name: count, dtype: int64


In [31]:
df = df.drop('Device Information', axis = 1)

In [32]:
def extract_time_features(df, Timestamp):
    # Convert timestamp column to datetime if it's not already
    df[Timestamp] = pd.to_datetime(df[Timestamp])

    # Extract time features
    df['Year'] = df[Timestamp].dt.year
    df['Month'] = df[Timestamp].dt.month
    df['Day'] = df[Timestamp].dt.day
    df['Hour'] = df[Timestamp].dt.hour
    df['Minute'] = df[Timestamp].dt.minute
    df['Second'] = df[Timestamp].dt.second
    df['DayOfWeek'] = df[Timestamp].dt.dayofweek

    return df

In [33]:
# Assuming df is your DataFrame
# Call the function and store the result in a new DataFrame
new_df = extract_time_features(df, 'Timestamp')

# Check if new columns are created
print(new_df.head())

            Timestamp Source IP Address Destination IP Address  Source Port  \
0 2023-05-30 06:33:58     103.216.15.12           84.9.164.252        31225   
1 2020-08-26 07:08:30    78.199.217.198         66.191.137.154        17245   
2 2022-11-13 08:23:25      63.79.210.48          198.219.82.17        16811   
3 2023-07-02 10:38:46     163.42.196.10        101.228.192.255        20018   
4 2023-07-16 13:11:07     71.166.185.76        189.243.174.238         6131   

   Destination Port Protocol  Packet Length Packet Type Traffic Type  \
0             17616     ICMP            503        Data         HTTP   
1             48166     ICMP           1174        Data         HTTP   
2             53600      UDP            306     Control         HTTP   
3             32534      UDP            385        Data         HTTP   
4             26646      TCP           1462        Data          DNS   

                                        Payload Data  ... IDS/IPS Alerts  \
0  Qui natus odi

In [34]:
df.head().T

Unnamed: 0,0,1,2,3,4
Timestamp,2023-05-30 06:33:58,2020-08-26 07:08:30,2022-11-13 08:23:25,2023-07-02 10:38:46,2023-07-16 13:11:07
Source IP Address,103.216.15.12,78.199.217.198,63.79.210.48,163.42.196.10,71.166.185.76
Destination IP Address,84.9.164.252,66.191.137.154,198.219.82.17,101.228.192.255,189.243.174.238
Source Port,31225,17245,16811,20018,6131
Destination Port,17616,48166,53600,32534,26646
Protocol,ICMP,ICMP,UDP,UDP,TCP
Packet Length,503,1174,306,385,1462
Packet Type,Data,Data,Control,Data,Data
Traffic Type,HTTP,HTTP,HTTP,HTTP,DNS
Payload Data,Qui natus odio asperiores nam. Optio nobis ius...,Aperiam quos modi officiis veritatis rem. Omni...,Perferendis sapiente vitae soluta. Hic delectu...,Totam maxime beatae expedita explicabo porro l...,Odit nesciunt dolorem nisi iste iusto. Animi v...


In [35]:
df.describe(include = 'object').T

Unnamed: 0,count,unique,top,freq
Source IP Address,30536,30536,103.216.15.12,1
Destination IP Address,30536,30536,84.9.164.252,1
Protocol,30536,3,ICMP,10228
Packet Type,30536,2,Control,15426
Traffic Type,30536,3,DNS,10257
Payload Data,30536,30536,Qui natus odio asperiores nam. Optio nobis ius...,1
Malware Indicators,30536,2,No Detection,15283
Alerts/Warnings,30536,2,no,15364
Attack Type,30535,3,DDoS,10256
Attack Signature,30535,2,Known Pattern A,15314


In [36]:
df.columns

Index(['Timestamp', 'Source IP Address', 'Destination IP Address',
       'Source Port', 'Destination Port', 'Protocol', 'Packet Length',
       'Packet Type', 'Traffic Type', 'Payload Data', 'Malware Indicators',
       'Action Taken', 'Severity Level', 'User Information', 'Network Segment',
       'Geo-location Data', 'Proxy Information', 'Firewall Logs',
       'IDS/IPS Alerts', 'Log Source', 'Browser', 'Year', 'Month', 'Day',
       'Hour', 'Minute', 'Second', 'DayOfWeek'],
      dtype='object')

In [37]:
# Checking the Day Column ploting with plotly
plt = px.histogram(df, x = 'Day', color = 'Malware Indicators', title = 'Number of Malware Attacks by Day')
plt.show()

In [38]:
# month Distribution
plt = px.histogram(df, x = 'Month', title = 'Month')
plt.show()

In [39]:
# Checking the Month Column ploting with plotly
plt = px.histogram(df, x = 'Month', color = 'Malware Indicators', title = 'Number of Malware Attacks by Month')
plt.show()

In [40]:
# Year Distrition
plt = px.histogram(df, x='Year', title = 'Year')
plt.show()

In [41]:
# Checking the Day Column ploting with plotly
plt = px.histogram(df, x = 'Year', color = 'Malware Indicators', title = 'Number of Malware Attacks by Year')
plt.show()

In [42]:
# Checking the Protocol distribution with Bar Chart Using Plotly
plt = px.histogram(df, x = 'Protocol', color = 'Malware Indicators', title = 'Number of Malware Attacks by Protocol')
plt.show()

In [43]:
# Traffic Distribution
plt = px.pie(df, names = 'Traffic Type', title = 'Traffic Distribution')
plt.show()

In [44]:
# Ploting the Traffic Type distribution with Bar Chart Using Plotly
plt = px.histogram(df, x = 'Traffic Type', color = 'Malware Indicators', title = 'Number of Malware Attacks by Traffic Type')
plt.show()

In [45]:
# Attack Type Distribution
plt = px.pie(df, names = 'Attack Type', title = 'Attack Type Distribution')
plt.show()

In [46]:
# Checking the attack types distribution with Bar Chart Using Plotly
plt = px.histogram(df, x='Attack Type', color='Traffic Type', title='Number of Malware Attacks by Attack Type')
plt.show()

In [47]:
# Browsers Distribution
plt = px.pie(df, names = 'Browser', title = 'Browser Distribution')
plt.show()

In [59]:
import re
import pandas as pd
import plotly.express as px

# Sample DataFrame for demonstration
data = {
    'Device Information': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Linux; Android 10)',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2)',
        None,
        'Mozilla/5.0 (iPad; CPU OS 13_2_3 like Mac OS X)',
    ]
}

df = pd.DataFrame(data)

# OS and device patterns to search for
patterns = [
    r'Windows',
    r'Linux',
    r'Android',
    r'iPad',
    r'iPod',
    r'iPhone',
    r'Macintosh',
]

def extract_device_or_os(user_agent):
    if not isinstance(user_agent, str):
        return 'Unknown'
    for pattern in patterns:
        match = re.search(pattern, user_agent, re.I)  # re.I makes the search case-insensitive
        if match:
            return match.group()
    return 'Unknown'  # Return 'Unknown' if no patterns match

# Extract device or OS
df['Device/OS'] = df['Device Information'].apply(extract_device_or_os)

# Check if the column was successfully created
print(df[['Device Information', 'Device/OS']])

# Platform Distribution
plt = px.pie(df, names='Device/OS', title='Platform Distribution')
plt.show()


                                  Device Information  Device/OS
0          Mozilla/5.0 (Windows NT 10.0; Win64; x64)    Windows
1                    Mozilla/5.0 (Linux; Android 10)      Linux
2  Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...     iPhone
3    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2)  Macintosh
4                                               None    Unknown
5    Mozilla/5.0 (iPad; CPU OS 13_2_3 like Mac OS X)       iPad


In [60]:
import re
import pandas as pd
import plotly.express as px

# Sample DataFrame for demonstration
data = {
    'Device Information': [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Linux; Android 10)',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2)',
        None,
        'Mozilla/5.0 (iPad; CPU OS 13_2_3 like Mac OS X)',
    ],
    'Browser': [
        'Chrome',
        'Firefox',
        'Safari',
        'Chrome',
        'Edge',
        'Safari'
    ]
}

df = pd.DataFrame(data)

# OS and device patterns to search for
patterns = [
    r'Windows',
    r'Linux',
    r'Android',
    r'iPad',
    r'iPod',
    r'iPhone',
    r'Macintosh',
]

def extract_device_or_os(user_agent):
    if not isinstance(user_agent, str):
        return 'Unknown'
    for pattern in patterns:
        match = re.search(pattern, user_agent, re.I)  # re.I makes the search case-insensitive
        if match:
            return match.group()
    return 'Unknown'  # Return 'Unknown' if no patterns match

# Extract device or OS
df['Device/OS'] = df['Device Information'].apply(extract_device_or_os)

# Verify the DataFrame
print(df[['Device Information', 'Device/OS', 'Browser']])

# Platform Distribution with Bar Chart
plt = px.histogram(df, x='Device/OS', color='Browser', title='Platform Distribution')
plt.show()


                                  Device Information  Device/OS  Browser
0          Mozilla/5.0 (Windows NT 10.0; Win64; x64)    Windows   Chrome
1                    Mozilla/5.0 (Linux; Android 10)      Linux  Firefox
2  Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like M...     iPhone   Safari
3    Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2)  Macintosh   Chrome
4                                               None    Unknown     Edge
5    Mozilla/5.0 (iPad; CPU OS 13_2_3 like Mac OS X)       iPad   Safari


In [51]:
# checking the browser against the attack type
plt = px.histogram(df, x= 'Browser', color='Attack Type', title= 'Number of Attacks by Browser')
plt.show()

In [52]:
# Log Source Distribution
plt = px.histogram(df, x='Log Source', title='Log Source')
plt.show()

In [53]:
# Log Source Distribution
plt = px.histogram(df, x='Action Taken', title='Action Taken')
plt.show()

In [54]:
# Log Source Distribution
plt = px.histogram(df, x='Action Taken', color='Attack Type', title='Log Source')
plt.show()

In [55]:
# Log Source Distribution
plt = px.histogram(df, x='Log Source', color='Attack Type', title='Log Source')
plt.show()