# <b>Datasets Characterization</b>

In [392]:
# Make mltoolbox and utls reachable from this folder
import sys
sys.path.append('../')

import joblib
import numpy as np
import pandas as pd

from mltoolbox.representation import iWord2Vec

# When True, every `if not DEMO:` guard in this notebook skips writing the
# extracted feature tables to disk; set to False to regenerate the CSV files.
DEMO = True

## Task01 - Mobile Applications

In [47]:
# Raw Task01 table; the first CSV column is used as the row index
dataset = pd.read_csv(
    '../data/task01/raw_data/mirage.csv',
    index_col=[0],
)

# (rows, columns) of the raw table
dataset.shape

(44045, 234)

### Quantities

In [48]:
# Each feature family is the set of columns whose name contains a marker
# substring, followed by the 'label' column. The three families are handled
# in the same order as before: statistics, sequences, payload.
feature_specs = (
    ('stats', '../data/task01/features/statistics.csv'),
    ('seq', '../data/task01/features/sequences.csv'),
    ('byte', '../data/task01/features/payload.csv'),
)

tables = []
for marker, csv_path in feature_specs:
    wanted = [col for col in dataset.columns if marker in col]
    wanted.append('label')
    table = dataset[wanted]
    print(table.shape)
    # Only persist the table when not running in demo mode
    if not DEMO:
        table.to_csv(csv_path)
    tables.append(table)

statistics, sequences, payload = tables

(44045, 73)
(44045, 129)
(44045, 33)


### Entities

In [49]:
# Read the corpus: each line becomes the list of its comma-separated fields.
# NOTE(review): splitting on '\n' turns a trailing newline into a final ['']
# entry — confirm that is intended.
with open('../data/task01/raw_data/corpus.txt', 'r') as file:
    raw_text = file.read()
corpus = [line.split(',') for line in raw_text.split('\n')]

# Build the model and fit it once on the whole corpus
word2vec = iWord2Vec(c=25, e=64, epochs=1, seed=15)
word2vec.train(corpus)

# Look up the embedding of every sample's source IP (`s_ip`), then relabel
# the rows with the sample index of `dataset`
embeddings = (
    word2vec.get_embeddings()
    .reindex(dataset['s_ip'])
    .set_index(dataset.index)
)
embeddings['label'] = dataset['label']

# Only persist the table when not running in demo mode
if not DEMO:
    embeddings.to_csv('../data/task01/features/ipaddress.csv')

print(embeddings.shape)  # (number of samples, embedding columns + 'label')
embeddings.head(3)

(44045, 65)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
air.com.hypah.io.slither_00_00,-0.007959,0.155493,0.199439,-0.003498,0.050787,-0.047075,-0.004645,-0.045098,-0.017566,0.046705,...,-0.045538,-0.063104,-0.120783,-0.024931,-0.074216,0.004015,0.047665,0.008133,-0.07153,air.com.hypah.io.slither
air.com.hypah.io.slither_00_01,-0.214694,0.535823,0.770899,-0.120575,-0.326864,-0.061792,-0.296117,0.182108,-0.205998,0.211606,...,-0.418786,-0.241119,-0.648396,0.07223,-0.723647,-0.171786,0.185587,0.026165,0.029002,air.com.hypah.io.slither
air.com.hypah.io.slither_00_02,-0.214694,0.535823,0.770899,-0.120575,-0.326864,-0.061792,-0.296117,0.182108,-0.205998,0.211606,...,-0.418786,-0.241119,-0.648396,0.07223,-0.723647,-0.171786,0.185587,0.026165,0.029002,air.com.hypah.io.slither


### Stratified k-fold

In [50]:
# Load stratified k folds
kfolds = joblib.load(f'../data/task01/skfolds/folds.save')

len(kfolds)

5

## Task02 - Darknet IP Addresses

In [393]:
# Task02 statistics feature table; the first CSV column is used as the row index
statistics = pd.read_csv(
    '../data/task02/features/statistics.csv',
    index_col=[0],
)

# (rows, columns) of the table
statistics.shape

(10460, 46)

In [394]:
import json
from tqdm.notebook import tqdm

# The JSON file maps each key to a newline-separated corpus whose lines are
# comma-separated fields; keys are processed in sorted order.
with open('../data/task02/raw_data/corpus_ips.json', 'r') as file:
    _corpus = json.load(file)

keys = sorted(_corpus.keys())
if not keys:
    # Fail with a clear message instead of an IndexError on keys[0]
    raise ValueError('corpus_ips.json contains no entries to train on')

# Initialize the model
word2vec = iWord2Vec(c=5, e=200, epochs=1, seed=15)

# One progress-bar tick per key. The total follows the data rather than a
# hard-coded count, and the context manager closes the bar even if training fails.
with tqdm(total=len(keys)) as pbar:
    pbar.set_description(f'Training iWord2Vec on {len(keys)} days')
    for position, key in enumerate(keys):
        corpus = [x.split(',') for x in _corpus[key].split('\n')]
        if position == 0:
            # The first key trains the freshly initialized model
            word2vec.train(corpus)
        else:
            # Every later key updates the already-trained model
            word2vec.update(corpus)
        pbar.update(1)

# Retrieve the final updated embeddings, aligned to the rows of `statistics`
embeddings = word2vec.get_embeddings()
embeddings = embeddings.reindex(statistics.index)
embeddings['label'] = statistics.label
# Only persist the table when not running in demo mode
if not DEMO:
    embeddings.to_csv('../data/task02/features/ipaddress.csv')

  0%|          | 0/31 [00:00<?, ?it/s]

In [395]:
import json
from tqdm.notebook import tqdm

# The JSON file maps each key to a newline-separated corpus whose lines are
# comma-separated fields; keys are processed in sorted order.
with open('../data/task02/raw_data/corpus_ports.json', 'r') as file:
    _corpus = json.load(file)

keys = sorted(_corpus.keys())
if not keys:
    # Fail with a clear message instead of an IndexError on keys[0]
    raise ValueError('corpus_ports.json contains no entries to train on')

# Initialize the model
word2vec = iWord2Vec(c=5, e=128, epochs=1, seed=15)

# One progress-bar tick per key. The total follows the data rather than a
# hard-coded count, and the context manager closes the bar even if training fails.
with tqdm(total=len(keys)) as pbar:
    pbar.set_description(f'Training iWord2Vec on {len(keys)} days')
    for position, key in enumerate(keys):
        corpus = [x.split(',') for x in _corpus[key].split('\n')]
        if position == 0:
            # The first key trains the freshly initialized model
            word2vec.train(corpus)
        else:
            # Every later key updates the already-trained model
            word2vec.update(corpus)
        pbar.update(1)

# Retrieve the final updated embeddings
p_embeddings = word2vec.get_embeddings()

# Only persist the table when not running in demo mode
if not DEMO:
    p_embeddings.to_csv('../data/task02/features/ports_w2v.csv')

  0%|          | 0/31 [00:00<?, ?it/s]

In [403]:
# For every source IP, collect its destination ports and the matching
# frequencies as two lists in the same row order
lookup = pd.read_csv('../data/task02/raw_data/ip_port_lookup.csv', index_col=[0])
grouped = lookup.groupby('src_ip').agg({'dst_port': list, 'freq': list})

ports_embeddings = []
# Walk the aggregated columns directly instead of calling grouped.loc[ip]
# twice per IP
for ip, entry, freqs in zip(grouped.index, grouped['dst_port'], grouped['freq']):
    # Column vector so each port-embedding row is scaled by its own frequency
    p_weights = np.asarray(freqs).reshape(-1, 1)
    # Port embeddings are looked up by the string form of the port
    p_emb = p_embeddings.loc[[str(x) for x in entry]]
    # NOTE(review): the frequency-weighted sum is divided by the number of
    # ports, not by the sum of the frequencies — confirm this is intended.
    avg_embedding = np.ravel((p_emb.values * p_weights).sum(0) / len(entry))
    ports_embeddings.append([ip] + list(avg_embedding))

# Column 0 holds the IP: use it as the index, then align to the rows of `statistics`
ports_embeddings = pd.DataFrame(ports_embeddings).rename(columns={0:'index'}).set_index('index').reindex(statistics.index)
ports_embeddings['label'] = statistics.label

# Only persist the table when not running in demo mode
if not DEMO:
    ports_embeddings.to_csv('../data/task02/features/ports.csv')

### Stratified k-fold **REDO**

In [399]:
# Load stratified k folds
kfolds = joblib.load(f'../data/task02/skfolds/folds.save')

len(kfolds)

5

## Task03 - Traffic Categories

In [51]:
# Raw Task03 table; the first CSV column is used as the row index
dataset = pd.read_csv(
    '../data/task03/raw_data/iscxvpn2016.csv',
    index_col=[0],
)

# (rows, columns) of the raw table
dataset.shape

(609, 234)

### Quantities

In [53]:
# Each feature family is the set of columns whose name contains a marker
# substring, followed by the 'label' column. The three families are handled
# in the same order as before: statistics, sequences, payload.
feature_specs = (
    ('stats', '../data/task03/features/statistics.csv'),
    ('seq', '../data/task03/features/sequences.csv'),
    ('byte', '../data/task03/features/payload.csv'),
)

tables = []
for marker, csv_path in feature_specs:
    wanted = [col for col in dataset.columns if marker in col]
    wanted.append('label')
    table = dataset[wanted]
    print(table.shape)
    # Only persist the table when not running in demo mode
    if not DEMO:
        table.to_csv(csv_path)
    tables.append(table)

statistics, sequences, payload = tables

(609, 73)
(609, 129)
(609, 33)


### Entities

In [54]:
# Read the corpus: each line becomes the list of its comma-separated fields.
# NOTE(review): splitting on '\n' turns a trailing newline into a final ['']
# entry — confirm that is intended.
with open('../data/task03/raw_data/corpus.txt', 'r') as file:
    raw_text = file.read()
corpus = [line.split(',') for line in raw_text.split('\n')]

# Build the model and fit it once on the whole corpus
word2vec = iWord2Vec(c=25, e=64, epochs=1, seed=15)
word2vec.train(corpus)

# Look up the embedding of every sample's source IP (`s_ip`), then relabel
# the rows with the sample index of `dataset`
embeddings = (
    word2vec.get_embeddings()
    .reindex(dataset['s_ip'])
    .set_index(dataset.index)
)
embeddings['label'] = dataset['label']

# Only persist the table when not running in demo mode
if not DEMO:
    embeddings.to_csv('../data/task03/features/ipaddress.csv')

print(embeddings.shape)  # (number of samples, embedding columns + 'label')
embeddings.head(3)

(609, 65)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
facebook_video1a_00,0.011131,-0.003477,0.016922,0.000917,0.00242,-0.001959,0.008336,-0.01457,-0.016698,0.002706,...,0.005312,-0.000364,0.012002,-0.013135,0.009491,0.006722,-0.001974,-0.001108,-0.006213,voip
facebook_video1a_01,0.011131,-0.003477,0.016922,0.000917,0.00242,-0.001959,0.008336,-0.01457,-0.016698,0.002706,...,0.005312,-0.000364,0.012002,-0.013135,0.009491,0.006722,-0.001974,-0.001108,-0.006213,voip
facebook_video1b_02,0.00912,-0.013391,-0.007605,-0.008153,0.00916,-0.004614,0.004,-0.01052,-0.009713,-0.007819,...,0.007337,0.007162,0.010249,-0.01133,0.015718,0.000573,-0.001427,-0.005463,-0.009623,voip


### Stratified k-fold

In [55]:
# Load stratified k folds
kfolds = joblib.load(f'../data/task03/skfolds/folds.save')

len(kfolds)

5