In [2]:
import pandas as pd
import ipaddress
import dns.resolver
import dns.reversename
import pygeoip
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
    
# Load the two flow captures compared throughout this notebook.
# NOTE(review): going by the variable names, data0 is presumably the baseline
# capture and test0 the capture under investigation — confirm with the data owner.
data_normal = pd.read_parquet('data0.parquet')
data_attack = pd.read_parquet('test0.parquet')

# Open the GeoIP databases.
# geo1 is the one queried by get_countryname() to resolve an IP to a country name.
# geo2 is opened from the GeoIPASNum file; it is not used in the cells visible here.
geo1=pygeoip.GeoIP('./GeoIP_DBs/GeoIP.dat')
geo2=pygeoip.GeoIP('./GeoIP_DBs/GeoIPASNum.dat')

def get_countryname(ip):
    """Return the country name that the ``geo1`` GeoIP database reports for ``ip``.

    Args:
        ip: IP address, passed unchanged to ``geo1.country_name_by_addr``.

    Returns:
        Whatever ``geo1.country_name_by_addr`` returns for that address.
    """
    country_name = geo1.country_name_by_addr(ip)
    return country_name

# Verify that data_normal and data_attack use the same set of ports.
# The distinct values are compared as sets: an element-wise `!=` between the two
# `.unique()` arrays depends on the order of first appearance, only triggers when
# *every* position differs, and fails outright when the arrays differ in length.
if set(data_normal['port'].unique()) != set(data_attack['port'].unique()):
    print("Different ports in data_normal vs data_attack")

# Verify that data_normal and data_attack use the same set of protocols
# (same set-based comparison as for the ports above).
if set(data_normal['proto'].unique()) != set(data_attack['proto'].unique()):
    print("Different protocols in data_normal vs data_attack")

# Label every flow with the country name of its dst_ip.
# Each distinct address is resolved once and the result is mapped back onto the
# column, instead of running one GeoIP lookup per row.
for flows in (data_normal, data_attack):
    country_by_ip = {ip: get_countryname(ip) for ip in flows['dst_ip'].unique()}
    flows['dst_country'] = flows['dst_ip'].map(country_by_ip)

# Show the destination countries seen in data_attack that never appear in data_normal.
# The set of known countries is built once so the loop does not recompute and
# scan `.unique()` on every iteration; output order is still the order of first
# appearance in data_attack.
known_countries = set(data_normal['dst_country'].unique())
print("Countries that not exists in data_normal:")
for country in data_attack['dst_country'].unique():
    if country not in known_countries:
        print(country)



Countries that not exists in data_normal:
Russian Federation
Lebanon
Czech Republic
Myanmar
Nepal
Kazakhstan
Kyrgyzstan
Ukraine
Bangladesh
Uzbekistan
Taiwan
Seychelles
Antigua and Barbuda
Austria
Vietnam
Cyprus
Armenia
Bulgaria
Luxembourg
Iran, Islamic Republic of
Maldives
Denmark
Panama
Gibraltar


In [18]:
# Per-source-IP statistics for both captures.
#
# Things to look for: a high number of connections, excessive data transfer,
# or connections to known vulnerable ports.
#
# Requires the 'dst_country' column added by the country-labelling cell.


def count_flows_by_src_ip(data, proto, all_src_ips):
    """Count the flows of one protocol for every source IP.

    Args:
        data: flow DataFrame with 'src_ip', 'proto' and 'up_bytes' columns.
        proto: protocol value to keep (e.g. 'udp' or 'tcp').
        all_src_ips: index of every source IP in ``data``; source IPs with no
            flow of this protocol are reported with a count of 0.

    Returns:
        Series indexed like ``all_src_ips`` with the number of non-null
        'up_bytes' rows of the given protocol.
    """
    counts = data[data['proto'] == proto].groupby(['src_ip'])['up_bytes'].count()
    return counts.reindex(all_src_ips, fill_value=0)


def _protocol_statistics(flows, proto):
    """Summarise the flows of one protocol within a single source IP's flows."""
    proto_flows = flows[flows['proto'] == proto]
    dest_countries = proto_flows['dst_country'].unique()
    return {
        f'{proto}_flows': len(proto_flows),
        f'{proto}_up_bytes': proto_flows['up_bytes'].sum(),
        f'{proto}_down_bytes': proto_flows['down_bytes'].sum(),
        f'{proto}_ports': proto_flows['port'].unique(),
        f'{proto}_dest_ips': proto_flows['dst_ip'].unique(),
        f'{proto}_dest_countries': dest_countries,
        f'{proto}_dest_countries_count': len(dest_countries),
    }


def compute_src_ip_statistics(data):
    """Build a ``{src_ip: {statistic: value}}`` dictionary for one capture.

    For each source IP the dictionary holds the UDP and TCP flow counts, byte
    totals, distinct ports, destination IPs and destination countries, plus the
    first and last timestamp over all of that IP's flows.

    Args:
        data: flow DataFrame with 'src_ip', 'proto', 'up_bytes', 'down_bytes',
            'port', 'dst_ip', 'dst_country' and 'timestamp' columns.

    Returns:
        dict keyed by source IP, in order of first appearance in ``data``.
        Rows whose 'src_ip' is missing are skipped (groupby drops NaN keys).
    """
    statistics = {}
    # One pass over the groups instead of a full-frame boolean mask per statistic.
    for src_ip, flows in data.groupby('src_ip', sort=False):
        stats = {}
        stats.update(_protocol_statistics(flows, 'udp'))
        stats.update(_protocol_statistics(flows, 'tcp'))
        stats['started_at'] = flows['timestamp'].min()
        stats['ended_at'] = flows['timestamp'].max()
        statistics[src_ip] = stats
    return statistics


# Group logs by src_ip
grouped_by_src_ip_normal = data_normal.groupby(['src_ip'])
grouped_by_src_ip_attack = data_attack.groupby(['src_ip'])

# Total number of flows for each source IP (all protocols)
src_ip_flows_normal = grouped_by_src_ip_normal['up_bytes'].count()
src_ip_flows_attack = grouped_by_src_ip_attack['up_bytes'].count()

# Number of UDP flows for each source IP.
# Previously these counted every flow regardless of protocol, so the "udp" and
# "tcp" series were identical to the totals above.
n_flows_udp_normal = count_flows_by_src_ip(data_normal, 'udp', src_ip_flows_normal.index)
n_flows_udp_attack = count_flows_by_src_ip(data_attack, 'udp', src_ip_flows_attack.index)

# Number of TCP flows for each source IP
n_flows_tcp_normal = count_flows_by_src_ip(data_normal, 'tcp', src_ip_flows_normal.index)
n_flows_tcp_attack = count_flows_by_src_ip(data_attack, 'tcp', src_ip_flows_attack.index)

# src_ips statistics normal
statistics_normal = compute_src_ip_statistics(data_normal)

# src_ips statistics attack.
# Computed from data_attack: the earlier copy-pasted loop read data_normal for
# every value, so the attack statistics never described the attack capture.
statistics_attack = compute_src_ip_statistics(data_attack)
    

In [4]:
# Flag src_ips in data_attack whose row count is unusually high compared with data_normal.

# Rows per src_ip in data_normal, largest first
data_normal_src_ip = (
    data_normal.groupby('src_ip')['src_ip']
    .count()
    .sort_values(ascending=False)
)

# Rows per src_ip in data_attack, largest first
data_attack_src_ip = (
    data_attack.groupby('src_ip')['src_ip']
    .count()
    .sort_values(ascending=False)
)

# Keep only the data_attack src_ips whose row count exceeds the 99th percentile
# of the per-src_ip row counts observed in data_normal.
normal_p99 = data_normal_src_ip.quantile(0.99)
data_attack_src_ip = (
    data_attack_src_ip[data_attack_src_ip > normal_p99]
    .sort_values(ascending=False)
)

print("Ips that are generating more traffic than 99% othersl:")
for ip, n_rows in data_attack_src_ip.items():
    print(f"IP: {ip} - {n_rows} packets")

Ips that are generating more traffic than 99% othersl:
IP: 192.168.100.176 - 79884 packets
IP: 192.168.100.188 - 45942 packets
IP: 192.168.100.49 - 12103 packets
IP: 192.168.100.75 - 10846 packets


In [5]:
# Compare the traffic volume per destination country between the two captures.
# Assumes 'up_bytes' and 'down_bytes' are the byte-count columns of the dataset.

# Total bytes per row = upstream + downstream
for capture in (data_normal, data_attack):
    capture['bytes'] = capture['up_bytes'] + capture['down_bytes']

# Total bytes per destination country
data_normal_bytes = data_normal.groupby('dst_country')['bytes'].sum()
data_attack_bytes = data_attack.groupby('dst_country')['bytes'].sum()

# Absolute difference (attack - normal), largest first; countries missing from
# either capture produce NaN and are dropped here.
data_diff_bytes = (
    (data_attack_bytes - data_normal_bytes)
    .sort_values(ascending=False)
    .dropna()
)

# Express the difference as a percentage of the data_normal volume.
# NOTE(review): the division re-aligns on data_normal_bytes' index; the recorded
# output is alphabetical, which suggests the sort above does not survive — confirm.
data_diff_bytes = data_diff_bytes / data_normal_bytes * 100

# Show the countries whose volume grew by more than 50% in data_attack
print("Countries that have a difference of more than 50% in data_attack:")
over_threshold = data_diff_bytes[data_diff_bytes > 50]
for country, percent in over_threshold.items():
    # Pad with tabs so the second column lines up (assuming 8-column tab stops):
    # names shorter than 8 chars get 3 tabs, shorter than 16 get 2, otherwise 1.
    tabs = "\t" * max(1, 3 - len(country) // 8)
    print(country + tabs + str(int(percent)) + "%")



Countries that have a difference of more than 50% in data_attack:
Chile			204%
China			529%
Indonesia		99%
Israel			64%
Korea, Republic of	61%


In [9]:
# Row counts of both captures
print("Number of rows in data_normal: " + str(len(data_normal)))
print("Number of rows in data_attack: " + str(len(data_attack)))

DNS_PORT = 53


def summarize_port_by_dst_ip(data, port=DNS_PORT):
    """Summarise the rows sent to ``port``, grouped by destination IP.

    Args:
        data: flow DataFrame with 'port', 'dst_ip' and 'proto' columns.
        port: destination port to keep (defaults to 53).

    Returns:
        Tuple ``(counts, protocols)`` of Series indexed by dst_ip: the number
        of rows to that port, and the distinct protocols seen on those rows.
    """
    by_dst_ip = data[data['port'] == port].groupby('dst_ip')
    return by_dst_ip['dst_ip'].count(), by_dst_ip['proto'].unique()


# Rows to port 53 per dst_ip, together with the protocols used on those rows.
# The protocols are taken from the port-53 rows only; looking them up in the
# full frame would also report protocols of unrelated traffic to the same
# dst_ip, and rescans the whole frame once per address.
data_normal_dns, normal_dns_protocols = summarize_port_by_dst_ip(data_normal)
data_attack_dns, attack_dns_protocols = summarize_port_by_dst_ip(data_attack)

# Show
print("Number of packets to port 53 by dst_ip in data_normal:")
for ip in data_normal_dns.index:
    print("IP: " + ip + " - " + str(data_normal_dns[ip]) + " packets" + ", protocol: " + str(normal_dns_protocols[ip]))

print("\nNumber of packets to port 53 by dst_ip in data_attack:")
for ip in data_attack_dns.index:
    print("IP: " + ip + " - " + str(data_attack_dns[ip]) + " packets" + ", protocol: " + str(attack_dns_protocols[ip]))

# TODO: verify when dst_ip is 192.168.100.224 or 192.168.100.225

Number of rows in data_normal: 980027
Number of rows in data_attack: 1061078
Number of packets to port 53 by dst_ip in data_normal:
IP: 192.168.100.224 - 58048 packets, protocol: ['udp']
IP: 192.168.100.225 - 57922 packets, protocol: ['udp']

Number of packets to port 53 by dst_ip in data_attack:
IP: 192.168.100.224 - 112900 packets, protocol: ['udp']
IP: 192.168.100.225 - 115214 packets, protocol: ['udp']
