In [1]:
import pandas as pd
import ipaddress
import dns.resolver
import dns.reversename
import pygeoip
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
    
#data 
data_normal = pd.read_parquet('data0.parquet')
data_attack = pd.read_parquet('test0.parquet')

# Get the organization name for an IP address
geo1=pygeoip.GeoIP('./GeoIP_DBs/GeoIP.dat')
geo2=pygeoip.GeoIP('./GeoIP_DBs/GeoIPASNum.dat')

def get_countryname(ip):
    return geo1.country_name_by_addr(ip)

# Verify if all ports are equal in data_normal vs data_attack
if (data_normal['port'].unique() != data_attack['port'].unique()).all():
    print("Different ports in data_normal vs data_attack")

# Verify if all protocols are equal in data_normal vs data_attack
if (data_normal['proto'].unique() != data_attack['proto'].unique()).all():
    print("Different protocols in data_normal vs data_attack")

# Put a label for each dst_ip with the country name
data_normal['dst_country'] = data_normal['dst_ip'].apply(get_countryname)
data_attack['dst_country'] = data_attack['dst_ip'].apply(get_countryname)

# Show the countries that not exists in data_normal
print("Countries that not exists in data_normal:")
for country in data_attack['dst_country'].unique():
    if country not in data_normal['dst_country'].unique():
        print(country)



Countries that not exists in data_normal:
Russian Federation
Lebanon
Czech Republic
Myanmar
Nepal
Kazakhstan
Kyrgyzstan
Ukraine
Bangladesh
Uzbekistan
Taiwan
Seychelles
Antigua and Barbuda
Austria
Vietnam
Cyprus
Armenia
Bulgaria
Luxembourg
Iran, Islamic Republic of
Maldives
Denmark
Panama
Gibraltar


In [2]:
# Percentage of packets with each protocol in data_normal
data_normal_proto = data_normal.groupby('proto')['proto'].count()
data_normal_proto = data_normal_proto / data_normal_proto.sum() * 100

# Percentage of packets with each protocol in data_attack
data_attack_proto = data_attack.groupby('proto')['proto'].count()
data_attack_proto = data_attack_proto / data_attack_proto.sum() * 100

# Show 
print("Percentage of packets with each protocol in data_normal:")
for proto in data_normal_proto.index:
    print(proto + "\t" + str(round(data_normal_proto[proto], 2)) + "%")


print("\nPercentage of packets with each protocol in data_attack:")
for proto in data_attack_proto.index:
    print(proto + "\t" + str(round(data_attack_proto[proto], 2)) + "%")

Percentage of packets with each protocol in data_normal:
tcp	87.93%
udp	12.07%

Percentage of packets with each protocol in data_attack:
tcp	78.28%
udp	21.72%


In [3]:
# Check possilbe dangerous src_ip in data_attack

# Get the frequency of each src_ip in data_normal
data_normal_src_ip = data_normal.groupby('src_ip')['src_ip'].count()
data_normal_src_ip = data_normal_src_ip.sort_values(ascending=False)

# Get the frequency of each src_ip in data_attack
data_attack_src_ip = data_attack.groupby('src_ip')['src_ip'].count()
data_attack_src_ip = data_attack_src_ip.sort_values(ascending=False)

# Get the src_ip that is generating more traffic than 99% of the src_ip in data_normal
data_attack_src_ip = data_attack_src_ip[data_attack_src_ip > data_normal_src_ip.quantile(0.99)]
data_attack_src_ip = data_attack_src_ip.sort_values(ascending=False)

print("Ips that are generating more traffic than 99% othersl:")
for ip in data_attack_src_ip.index:
    print("IP: " + ip + " - " + str(data_attack_src_ip[ip]) + " packets")

Ips that are generating more traffic than 99% othersl:
IP: 192.168.100.176 - 79884 packets
IP: 192.168.100.188 - 45942 packets
IP: 192.168.100.49 - 12103 packets
IP: 192.168.100.75 - 10846 packets


In [4]:
# Assuming 'up_bytes' and 'down_bytes' are the columns representing the bytes in the dataset

#  Sum 'up_bytes' and 'down_bytes'
data_normal['bytes'] = data_normal['up_bytes'] + data_normal['down_bytes']
data_attack['bytes'] = data_attack['up_bytes'] + data_attack['down_bytes']

# Sum all bytes by dst_country
data_normal_bytes = data_normal.groupby('dst_country')['bytes'].sum()
data_attack_bytes = data_attack.groupby('dst_country')['bytes'].sum()

# verify the difference between data_normal_bytes and data_attack_bytes
data_diff_bytes = data_attack_bytes - data_normal_bytes
data_diff_bytes = data_diff_bytes.sort_values(ascending=False)
data_diff_bytes = data_diff_bytes.dropna()

# Calculate the percentage of difference
data_diff_bytes = data_diff_bytes / data_normal_bytes * 100

# Show the countries that have a difference of more than 100% in data_attack
print("Countries that have a difference of more than 50% in data_attack:")
for country in data_diff_bytes.index:
    if data_diff_bytes[country] > 50:
        # check if i need '\t'
        # i want that the second column start in the same place
        if len(country) < 8:
            print(country + "\t\t\t" + str(int(data_diff_bytes[country])) + "%")
        elif len(country) < 16:
            print(country + "\t\t" + str(int(data_diff_bytes[country])) + "%")
        else:
            print(country + "\t" + str(int(data_diff_bytes[country])) + "%")



Countries that have a difference of more than 50% in data_attack:
Chile			204%
China			529%
Indonesia		99%
Israel			64%
Korea, Republic of	61%
