In [1]:
import os
import sys
import json
import ipaddress
import numpy as np
from torch import nn
from sklearn.model_selection import train_test_split

class Resource:
  def __init__(self, RequestID, Site, LoadURL, LoadDomain, 
      Type, MimeType, RemoteIPAddr, ModTime):
    self.request_id = RequestID
    self.site = Site
    self.load_url = LoadURL
    self.load_domain = LoadDomain
    self.resource_type = Type
    self.mimetype = MimeType
    self.ip_addr = RemoteIPAddr
    self.mod_time = ModTime

# Function to parse and create Resources from JSON
def extract_from_JSON(filename):
    ip_to_occurrences = dict()
    ip_to_sites = dict()
    site_date_ipset = dict()
    with open(filename) as f:
        for line in f:
            data = json.loads(line)
            resource = Resource(data["RequestID"], data["Site"], data["LoadURL"], 
                                data["LoadDomain"], data["Type"], data["MimeType"], 
                                data["RemoteIPAddr"], data["ModTime"])
            
            # For ip occurrence filtering in entire dataset
            ip_to_occurrences.setdefault(resource.ip_addr, set())
            occurrence_event = resource.mod_time + " " + resource.site
            ip_to_occurrences[resource.ip_addr].add(occurrence_event)
    
            # For ips unique to a website
            ip_to_sites.setdefault(resource.ip_addr, set())
            ip_to_sites[resource.ip_addr].add(resource.site)
            
            # extracting samples
            site_date_ipset.setdefault(resource.site, dict())
            site_date_ipset[resource.site].setdefault(resource.mod_time, set())
            site_date_ipset[resource.site][resource.mod_time].add(resource.ip_addr)
    return ip_to_occurrences, ip_to_sites, site_date_ipset

In [2]:
filename = "../output/02-06-2020_1000-recurring_output.json"
ip_to_occurrences, ip_to_sites, site_date_ipset = extract_from_JSON(filename)

In [3]:
print(len(site_date_ipset))

974


In [4]:
# filtering passes: 
ips_to_remove = set()
for ip in ip_to_occurrences:
    occurrences = ip_to_occurrences[ip]
    if len(occurrences) < 20:
       ips_to_remove.add(ip)

for ip in ips_to_remove:
    try:
        del ip_to_occurrences[ip]
        del ip_to_sites[ip]
    except KeyError:
        print(ip, " Not Present")
     
unique_ip_to_site = dict()
# Does it make sense to filter the site out?
for ip in ip_to_sites:
    if len(ip_to_sites[ip]) == 1:
        site = ip_to_sites[ip].pop()
        unique_ip_to_site[ip] = site
        try:
            del site_date_ipset[site]
        except KeyError:
            print(site, " Not Present in map")
        try:
            del ip_to_occurrences[ip]
        except KeyError:
            print(ip, " Not Present")

# ip_to_occurrences holds all ips after filtering

104.com.tw  Not Present in map
104.com.tw  Not Present in map
104.com.tw  Not Present in map
104.com.tw  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
11st.co.kr  Not Present in map
123rf.com  Not Present in map
123rf.com  Not Present in map
123rf.com  Not Present in map
123rf.com  Not Present in map
123rf.com  Not Present in map
126.com  Not Present in map
126.com  Not Present in map
126.com  Not Present in map
126.com  Not Present in map
126.com  Not Present in map
126.com  Not Present in map
126.com  Not Present in map
126.com  Not Present in map
1337x.to  Not Present in map
1688.com  Not Present in map
1688.com  Not Present in map
1688.com  Not Present in map
1688.com  Not Present in map
1688.com  Not Present in map
1688.com  Not Pres

daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
daum.net  Not Present in map
dcard.tw  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dcinside.com  Not Present in map
dell.com  Not Present in map
dell.com  Not Present in map
dell.com  Not Present in map
dell.com  Not Present in map
dell.com  Not Present in map
dell.com  Not Present in map
dell.com  Not Present in map
dell.com  Not Present in map

hdlava.me  Not Present in map
hdlava.me  Not Present in map
hdlava.me  Not Present in map
hdlava.me  Not Present in map
hdlava.me  Not Present in map
hdlava.me  Not Present in map
hdlava.me  Not Present in map
hdlava.me  Not Present in map
hdzog.com  Not Present in map
hdzog.com  Not Present in map
hdzog.com  Not Present in map
healthline.com  Not Present in map
healthline.com  Not Present in map
hellomagazine.com  Not Present in map
hepsiburada.com  Not Present in map
hepsiburada.com  Not Present in map
hepsiburada.com  Not Present in map
hepsiburada.com  Not Present in map
hepsiburada.com  Not Present in map
hepsiburada.com  Not Present in map
hepsiburada.com  Not Present in map
hespress.com  Not Present in map
hespress.com  Not Present in map
hh.ru  Not Present in map
hh.ru  Not Present in map
hh.ru  Not Present in map
hh.ru  Not Present in map
hm.com  Not Present in map
hola.com  Not Present in map
homedepot.com  Not Present in map
homedepot.com  Not Present in map
homedepot.com  N

mathworks.com  Not Present in map
mathworks.com  Not Present in map
mathworks.com  Not Present in map
mathworks.com  Not Present in map
mawdoo3.com  Not Present in map
mawdoo3.com  Not Present in map
mawdoo3.com  Not Present in map
mawdoo3.com  Not Present in map
mayoclinic.org  Not Present in map
mayoclinic.org  Not Present in map
mayoclinic.org  Not Present in map
mediafire.com  Not Present in map
mediafire.com  Not Present in map
medium.com  Not Present in map
medium.com  Not Present in map
medium.com  Not Present in map
medium.com  Not Present in map
medium.com  Not Present in map
medium.com  Not Present in map
mega.nz  Not Present in map
mega.nz  Not Present in map
mega.nz  Not Present in map
mega.nz  Not Present in map
mega.nz  Not Present in map
mega.nz  Not Present in map
mega.nz  Not Present in map
mega.nz  Not Present in map
memurlar.net  Not Present in map
memurlar.net  Not Present in map
memurlar.net  Not Present in map
memurlar.net  Not Present in map
memurlar.net  Not Pre

provincial.com  Not Present in map
provincial.com  Not Present in map
provincial.com  Not Present in map
provincial.com  Not Present in map
psychologytoday.com  Not Present in map
ptt.cc  Not Present in map
python.org  Not Present in map
python.org  Not Present in map
python.org  Not Present in map
python.org  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qq.com  Not Present in map
qualtrics.com  Not Present in map
quizlet.com  Not Present in map
quizlet.com  Not Present in map
quizlet.com  Not Present in map
quora.com  Not Present in map
rakuten.co.jp  Not Present in map
rakuten.co.jp  N

target.com  Not Present in map
td.com  Not Present in map
td.com  Not Present in map
td.com  Not Present in map
td.com  Not Present in map
td.com  Not Present in map
td.com  Not Present in map
td.com  Not Present in map
td.com  Not Present in map
teacherspayteachers.com  Not Present in map
teamviewer.com  Not Present in map
teamviewer.com  Not Present in map
teamviewer.com  Not Present in map
teamviewer.com  Not Present in map
teamviewer.com  Not Present in map
teamviewer.com  Not Present in map
teamviewer.com  Not Present in map
techradar.com  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in map
telegraph.co.uk  Not Present in ma

yy.com  Not Present in map
zcool.com.cn  Not Present in map
zcool.com.cn  Not Present in map
zcool.com.cn  Not Present in map
zendesk.com  Not Present in map
zendesk.com  Not Present in map
zendesk.com  Not Present in map
zendesk.com  Not Present in map
zendesk.com  Not Present in map
zendesk.com  Not Present in map
zendesk.com  Not Present in map
zhanqi.tv  Not Present in map
zhanqi.tv  Not Present in map
zhanqi.tv  Not Present in map
zhaopin.com  Not Present in map
zhaopin.com  Not Present in map
zhaopin.com  Not Present in map
zhaopin.com  Not Present in map
zhibo8.cc  Not Present in map
zhibo8.cc  Not Present in map
zhibo8.cc  Not Present in map
zhibo8.cc  Not Present in map
zhihu.com  Not Present in map
zhihu.com  Not Present in map
zhihu.com  Not Present in map
zing.vn  Not Present in map
zing.vn  Not Present in map
zing.vn  Not Present in map
zing.vn  Not Present in map
zing.vn  Not Present in map
zing.vn  Not Present in map
zing.vn  Not Present in map
zing.vn  Not Present in ma

In [5]:
print(len(site_date_ipset), len())

193


In [6]:
# create site to encoded ipset mapping
labels = []
encodings = []
all_ips = list(ip_to_occurrences.keys())
print(len(all_ips))
for site in site_date_ipset:
    date_to_ipset = site_date_ipset[site]
    for date in date_to_ipset:
        ipset = date_to_ipset[date]
        iparray = [0] * len(all_ips)
        for ip in ipset:
            if ip in ip_to_occurrences:
                iparray[all_ips.index(ip)] = 1
        labels.append(site)
        encodings.append(np.asarray(iparray))

print(len(labels), len(encodings))
                

9760
20786 20786


In [7]:
import pyasn
asndb = pyasn.pyasn('pyasn')
asn_set = set()
ip_to_asn = dict()
for ip in ip_to_occurrences:
    try:
        asn,_= asndb.lookup(ip)
        ip_to_asn[ip] = asn
        asn_set.add(asn)
    except:
        print(ip, " not mapped to asn")
        continue

  not mapped to asn


In [8]:
sites_asn = []
asn_enc = []
asn_list = list(asn_set)
for site in site_date_ipset:
    date_to_ipset = site_date_ipset[site]
    for date in date_to_ipset:
        ipset = date_to_ipset[date]
        asn_array = [0]*len(asn_list)
        for ip in ipset:
            if ip in ip_to_occurrences:
                try:
                    asn_array[asn_list.index(ip_to_asn[ip])] = 1
                except Exception as e:
                    continue
        sites_asn.append(site)
        asn_enc.append(np.asarray(asn_array))

In [12]:
print(len(sites_asn[0]), len(asn_enc[0]))
print(len(labels[0]), len(encodings[0]))

1 346
1 9760


In [13]:
class NNet(nn.Module):
    def __init__(self, input_size, embed_size, output_size):
        super(NNet, self).__init__()
        self.fc1 = nn.Linear(input_size, embed_size)
        self.fc2 = nn.Linear(embed_size, output_size)
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        return x

In [14]:
from sklearn import preprocessing
import torch

tensor_encodings = torch.Tensor(encodings)
le_labels = preprocessing.LabelEncoder()
tensor_labels = le_labels.fit_transform(labels)
tensor_labels = torch.as_tensor(tensor_labels)

tensor_asn_enc = torch.Tensor(asn_enc)
le_asn = preprocessing.LabelEncoder()
tensor_sites_asn = le_asn.fit_transform(sites_asn)
tensor_sites_asn = torch.as_tensor(tensor_sites_asn)


In [15]:
import torch.utils.data as data

ip_dataset = data.TensorDataset(tensor_encodings,tensor_labels)
asn_dataset = data.TensorDataset(tensor_asn_enc,tensor_sites_asn)

In [48]:
batch_size = 64
validation_split = .2
shuffle_dataset = True
random_seed= 42

In [49]:
train_size = int(0.8 * len(asn_dataset))
test_size = len(asn_dataset) - train_size
asn_train_dataset, asn_test_dataset = torch.utils.data.random_split(asn_dataset, [train_size, test_size])
asn_train_loader = torch.utils.data.DataLoader(asn_train_dataset, batch_size=batch_size)
asn_test_loader = torch.utils.data.DataLoader(asn_test_dataset, batch_size=batch_size)
print(len(asn_train_loader.dataset), len(asn_test_loader.dataset))

16628 4158


In [50]:
train_size = int(0.8 * len(ip_dataset))
test_size = len(ip_dataset) - train_size
ip_train_dataset, ip_test_dataset = torch.utils.data.random_split(ip_dataset, [train_size, test_size])
ip_train_loader = torch.utils.data.DataLoader(ip_train_dataset, batch_size=batch_size)
ip_test_loader = torch.utils.data.DataLoader(ip_test_dataset, batch_size=batch_size)
print(len(ip_train_loader.dataset), len(ip_test_loader.dataset))

16628 4158


In [51]:
input_size_ipdataset = len(tensor_encodings[0])
output_size_ipdataset = len(site_date_ipset)
embed_size_ipdataset = 1000

input_size_asndataset = len(tensor_asn_enc[0])
output_size_asndataset = len(site_date_ipset)
embed_size_asndataset = 270
print(input_size_ipdataset, output_size_ipdataset)
print(input_size_asndataset, output_size_asndataset)

9760 193
346 193


In [52]:
model_ip = NNet(input_size_ipdataset, embed_size_ipdataset, output_size_ipdataset)
model_asn = NNet(input_size_asndataset, embed_size_asndataset, output_size_asndataset)

In [53]:
criterion = nn.CrossEntropyLoss(reduction='mean')
optimizer_ip = torch.optim.SGD(model_ip.parameters(), lr=0.1)
optimizer_asn = torch.optim.SGD(model_asn.parameters(), lr=0.1)

In [54]:
from torch.autograd import Variable
import time
device = 'cpu'

def train(epoch, train_loader, model, optimizer):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 10 == 0:
            print('Train Epoch: {} | Batch Status: {}/{} ({:.0f}%) | Loss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


def test(test_loader, model):
    model.eval()
    test_loss = 0
    correct = 0
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        # sum up batch loss
        test_loss += criterion(output, target).item()
        # get the index of the max
        pred = output.data.max(1, keepdim=True)[1]
        correct += pred.eq(target.data.view_as(pred)).cpu().sum()

    test_loss /= len(test_loader.dataset)
    print(f'===========================\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '
          f'({100. * correct / len(test_loader.dataset):.0f}%)')

In [55]:
# first ip dataset training and tests
since = time.time()
print(len(ip_train_loader.dataset), len(ip_test_loader.dataset))
for epoch in range(1, 10):
    epoch_start = time.time()
    train(epoch, ip_train_loader, model_ip, optimizer_ip)
    m, s = divmod(time.time() - epoch_start, 60)
    print(f'Training time: {m:.0f}m {s:.0f}s')
    test(ip_test_loader, model_ip)
    m, s = divmod(time.time() - epoch_start, 60)
    print(f'Testing time: {m:.0f}m {s:.0f}s')

m, s = divmod(time.time() - since, 60)
print(f'Total Time: {m:.0f}m {s:.0f}s\nModel was trained on {device}!')


16628 4158
Train Epoch: 1 | Batch Status: 0/16628 (0%) | Loss: 5.263751
Train Epoch: 1 | Batch Status: 640/16628 (4%) | Loss: 5.246414
Train Epoch: 1 | Batch Status: 1280/16628 (8%) | Loss: 5.241113
Train Epoch: 1 | Batch Status: 1920/16628 (12%) | Loss: 5.241874
Train Epoch: 1 | Batch Status: 2560/16628 (15%) | Loss: 5.226511
Train Epoch: 1 | Batch Status: 3200/16628 (19%) | Loss: 5.207438
Train Epoch: 1 | Batch Status: 3840/16628 (23%) | Loss: 5.189809
Train Epoch: 1 | Batch Status: 4480/16628 (27%) | Loss: 5.201111
Train Epoch: 1 | Batch Status: 5120/16628 (31%) | Loss: 5.173931
Train Epoch: 1 | Batch Status: 5760/16628 (35%) | Loss: 5.196321
Train Epoch: 1 | Batch Status: 6400/16628 (38%) | Loss: 5.109958
Train Epoch: 1 | Batch Status: 7040/16628 (42%) | Loss: 5.113437
Train Epoch: 1 | Batch Status: 7680/16628 (46%) | Loss: 5.125197
Train Epoch: 1 | Batch Status: 8320/16628 (50%) | Loss: 5.141915
Train Epoch: 1 | Batch Status: 8960/16628 (54%) | Loss: 5.094893
Train Epoch: 1 | Batc

Train Epoch: 5 | Batch Status: 8960/16628 (54%) | Loss: 2.396067
Train Epoch: 5 | Batch Status: 9600/16628 (58%) | Loss: 2.257113
Train Epoch: 5 | Batch Status: 10240/16628 (62%) | Loss: 2.255646
Train Epoch: 5 | Batch Status: 10880/16628 (65%) | Loss: 2.816883
Train Epoch: 5 | Batch Status: 11520/16628 (69%) | Loss: 2.340191
Train Epoch: 5 | Batch Status: 12160/16628 (73%) | Loss: 2.586683
Train Epoch: 5 | Batch Status: 12800/16628 (77%) | Loss: 2.611402
Train Epoch: 5 | Batch Status: 13440/16628 (81%) | Loss: 2.396200
Train Epoch: 5 | Batch Status: 14080/16628 (85%) | Loss: 2.107030
Train Epoch: 5 | Batch Status: 14720/16628 (88%) | Loss: 2.425694
Train Epoch: 5 | Batch Status: 15360/16628 (92%) | Loss: 2.811232
Train Epoch: 5 | Batch Status: 16000/16628 (96%) | Loss: 2.760034
Training time: 0m 19s
Test set: Average loss: 0.0372, Accuracy: 2030/4158 (49%)
Testing time: 0m 21s
Train Epoch: 6 | Batch Status: 0/16628 (0%) | Loss: 2.180136
Train Epoch: 6 | Batch Status: 640/16628 (4%) | 

In [56]:
# Now asn dataset training and tests
since = time.time()
criterion = nn.CrossEntropyLoss(reduction='mean')
print(len(asn_train_loader.dataset), len(asn_test_loader.dataset))
for epoch in range(1, 10):
    epoch_start = time.time()
    train(epoch, asn_train_loader, model_asn, optimizer_asn)
    m, s = divmod(time.time() - epoch_start, 60)
    print(f'Training time: {m:.0f}m {s:.0f}s')
    test(asn_test_loader, model_asn)
    m, s = divmod(time.time() - epoch_start, 60)
    print(f'Testing time: {m:.0f}m {s:.0f}s')

m, s = divmod(time.time() - since, 60)
print(f'Total Time: {m:.0f}m {s:.0f}s\nModel was trained on {device}!')

16628 4158
Train Epoch: 1 | Batch Status: 0/16628 (0%) | Loss: 5.265094
Train Epoch: 1 | Batch Status: 640/16628 (4%) | Loss: 5.241331
Train Epoch: 1 | Batch Status: 1280/16628 (8%) | Loss: 5.237516
Train Epoch: 1 | Batch Status: 1920/16628 (12%) | Loss: 5.215890
Train Epoch: 1 | Batch Status: 2560/16628 (15%) | Loss: 5.201464
Train Epoch: 1 | Batch Status: 3200/16628 (19%) | Loss: 5.222496
Train Epoch: 1 | Batch Status: 3840/16628 (23%) | Loss: 5.175194
Train Epoch: 1 | Batch Status: 4480/16628 (27%) | Loss: 5.187172
Train Epoch: 1 | Batch Status: 5120/16628 (31%) | Loss: 5.164772
Train Epoch: 1 | Batch Status: 5760/16628 (35%) | Loss: 5.149550
Train Epoch: 1 | Batch Status: 6400/16628 (38%) | Loss: 5.123398
Train Epoch: 1 | Batch Status: 7040/16628 (42%) | Loss: 5.134135
Train Epoch: 1 | Batch Status: 7680/16628 (46%) | Loss: 5.060595
Train Epoch: 1 | Batch Status: 8320/16628 (50%) | Loss: 5.088549
Train Epoch: 1 | Batch Status: 8960/16628 (54%) | Loss: 5.039881
Train Epoch: 1 | Batc

Train Epoch: 5 | Batch Status: 8960/16628 (54%) | Loss: 2.692643
Train Epoch: 5 | Batch Status: 9600/16628 (58%) | Loss: 2.244007
Train Epoch: 5 | Batch Status: 10240/16628 (62%) | Loss: 2.875831
Train Epoch: 5 | Batch Status: 10880/16628 (65%) | Loss: 2.744791
Train Epoch: 5 | Batch Status: 11520/16628 (69%) | Loss: 2.337636
Train Epoch: 5 | Batch Status: 12160/16628 (73%) | Loss: 2.345799
Train Epoch: 5 | Batch Status: 12800/16628 (77%) | Loss: 2.502589
Train Epoch: 5 | Batch Status: 13440/16628 (81%) | Loss: 2.525365
Train Epoch: 5 | Batch Status: 14080/16628 (85%) | Loss: 2.473856
Train Epoch: 5 | Batch Status: 14720/16628 (88%) | Loss: 2.312082
Train Epoch: 5 | Batch Status: 15360/16628 (92%) | Loss: 2.818590
Train Epoch: 5 | Batch Status: 16000/16628 (96%) | Loss: 2.574138
Training time: 0m 1s
Test set: Average loss: 0.0388, Accuracy: 1767/4158 (42%)
Testing time: 0m 1s
Train Epoch: 6 | Batch Status: 0/16628 (0%) | Loss: 2.520869
Train Epoch: 6 | Batch Status: 640/16628 (4%) | Lo