# <b>Datasets Characterization</b>

In [90]:
import json

import joblib
import pandas as pd

from mltoolbox.representation import iWord2Vec

# Demo switch: while True, the `if not DEMO:` guards in the cells below skip
# every `to_csv` export, so the notebook only computes and displays results.
DEMO = True

## Task01 - Mobile Applications

In [92]:
# Read the raw Task01 table; the first CSV column is used as the row index.
dataset = pd.read_csv(
    '../data/task01/raw_data/mirage.csv',
    index_col=[0],
)

# Cell output: (number of rows, number of columns).
dataset.shape

(44045, 234)

### Quantities

In [94]:
def _label_with(frame, keyword):
    """Return the columns of `frame` whose name contains `keyword`, plus 'label'."""
    matching = [name for name in frame.columns if keyword in name]
    return frame[matching + ['label']]


# Three views of the Task01 table, each selected by a substring of the column
# name and paired with the 'label' column.
statistics = _label_with(dataset, 'stats')
sequences = _label_with(dataset, 'seq')
payload = _label_with(dataset, 'byte')

# Report each view's shape and, outside demo mode, export it to CSV.
for features, target in (
    (statistics, '../data/task01/features/statistics.csv'),
    (sequences, '../data/task01/features/sequences.csv'),
    (payload, '../data/task01/features/payload.csv'),
):
    print(features.shape)
    if not DEMO:
        features.to_csv(target)

(44045, 73)
(44045, 129)
(44045, 33)


### Entities

In [95]:
# Every line of the corpus file is one sentence of comma-separated tokens.
with open('../data/task01/raw_data/corpus.txt', 'r') as file:
    corpus = [sentence.split(',') for sentence in file.read().split('\n')]

# Build the embedding model and fit it on the whole corpus in one call.
# NOTE(review): `e` looks like the embedding size (64 numeric columns in the
# output below) — confirm against the iWord2Vec documentation.
word2vec = iWord2Vec(c=25, e=64, epochs=1, seed=15)
word2vec.train(corpus)

# One embedding row per dataset sample: look the row up through the sample's
# 's_ip' value, re-key it by the dataset's own index, then attach the label.
embeddings = (
    word2vec.get_embeddings()
    .reindex(dataset['s_ip'])
    .set_index(dataset.index)
    .assign(label=dataset.label)
)

if not DEMO:
    embeddings.to_csv('../data/task01/features/ipaddress.csv')

# Shape is (dataset rows, embedding columns + 'label'), not the vocabulary size.
print(embeddings.shape)
embeddings.head(3)

(44045, 65)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
air.com.hypah.io.slither_00_00,-0.002527,0.13287,0.174684,0.002305,0.043427,-0.034232,0.008436,-0.036266,0.006017,0.043936,...,-0.028833,-0.05562,-0.133201,-0.012244,-0.063352,0.019017,0.054084,0.020851,-0.069401,air.com.hypah.io.slither
air.com.hypah.io.slither_00_01,-0.164186,0.568958,0.493986,-0.276781,-0.193574,0.094991,-0.146859,0.035338,-0.061602,0.155917,...,-0.549236,-0.292464,-0.704331,0.215376,-0.673366,-0.28165,0.1709,0.088226,-0.105547,air.com.hypah.io.slither
air.com.hypah.io.slither_00_02,-0.164186,0.568958,0.493986,-0.276781,-0.193574,0.094991,-0.146859,0.035338,-0.061602,0.155917,...,-0.549236,-0.292464,-0.704331,0.215376,-0.673366,-0.28165,0.1709,0.088226,-0.105547,air.com.hypah.io.slither


### Stratified k-fold

In [96]:
# Load the precomputed stratified k-fold splits for Task01.
# NOTE(security): joblib.load unpickles the file and can therefore execute
# arbitrary code; only load fold files that come from a trusted source.
kfolds = joblib.load('../data/task01/skfolds/folds.save')

# Cell output: number of entries (folds) in the loaded object.
len(kfolds)

5

## Task02 - Darknet IP Addresses

### Quantities

In [100]:
# Read the raw Task02 table; the first CSV column is used as the row index.
# From here on `dataset` refers to the Task02 frame, not the Task01 one.
dataset = pd.read_csv(
    '../data/task02/raw_data/darknet.csv',
    index_col=[0],
)

# Print the (rows, columns) shape, then preview the first rows as cell output.
print(dataset.shape)
dataset.head()

(14086, 69)


Unnamed: 0_level_0,dst_cnt,dst_max,dst_avg,dst_std,ttl_min,ttl_max,ttl_avg,ttl_std,serv_cnt,serv_max,...,t_ts_std,t_sack_avg,t_sack_min,t_sack_max,t_sack_std,t_sackp_avg,t_sackp_min,t_sackp_max,t_sackp_std,label
src_ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0.244.253,5.0,15.0,4.6,5.23832,47.0,47.0,47.0,0.0,1.0,23.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,mirai
1.116.131.247,21.0,1.0,1.0,0.0,34.0,37.0,35.095238,0.971242,10.0,4.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,unknown
1.116.164.86,10.0,1.0,1.0,0.0,33.0,35.0,33.9,0.7,1.0,10.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,mirai
1.116.203.71,253.0,1.0,1.0,0.0,34.0,37.0,35.237154,1.040239,1.0,253.0,...,20.520559,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,0.0,unknown
1.116.217.186,30.0,2.0,1.033333,0.179505,34.0,37.0,35.064516,0.981621,14.0,5.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,unknown


### Entities

In [None]:
# The JSON file maps keys to raw corpus text; each text holds newline-separated
# sentences of comma-separated tokens. Keys are consumed in sorted order.
with open('../data/task02/raw_data/corpus_ip.json', 'r') as file:
    _corpus = json.load(file)

keys = sorted(_corpus.keys())


def _parse_corpus(raw_text):
    """Split raw corpus text into a list of token lists, one per line."""
    return [x.split(',') for x in raw_text.split('\n')]


# Initialize the model
word2vec = iWord2Vec(c=25, e=200, epochs=1, seed=15)
# Train the initialized model on the corpus of the first key
corpus = _parse_corpus(_corpus[keys[0]])
word2vec.train(corpus)
# Update the trained model with the corpus of every remaining key, in order
for key in keys[1:]:
    corpus = _parse_corpus(_corpus[key])
    word2vec.update(corpus)
# Retrieve the embeddings after the last incremental update
embeddings = word2vec.get_embeddings()

In [122]:
# Align the embeddings with the dataset rows (index: src_ip) for display only;
# the result is not assigned. Index values that have no embedding show up as
# all-NaN rows.
embeddings.reindex(dataset.index)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
src_ip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0.244.253,0.118960,-0.102538,0.153177,0.048123,0.027851,0.083237,-0.161051,0.150832,-0.161052,0.052105,...,-0.040157,-0.062942,-0.038205,0.016031,0.174177,-0.108588,-0.142663,0.163852,0.123707,0.130229
1.116.131.247,0.032446,-0.373912,-0.146583,0.305255,0.320188,0.208286,-0.076223,-0.052885,-0.270263,0.150392,...,-0.193825,-0.515693,0.170949,0.098432,-0.001409,-0.086065,-0.116484,0.088673,-0.044186,-0.044247
1.116.164.86,0.051123,-0.152088,0.244004,0.106238,0.011761,0.126936,-0.207957,0.184849,-0.170466,0.131702,...,-0.015703,-0.105154,-0.007704,-0.001784,0.214938,-0.085586,-0.140290,0.170382,0.189659,0.169698
1.116.203.71,0.011009,-0.029017,0.028181,0.011351,0.005562,0.005034,-0.009070,0.001905,-0.053785,0.017328,...,-0.027822,-0.064755,0.007605,0.014162,0.016971,-0.030901,-0.042000,0.029804,0.016189,0.014959
1.116.217.186,0.066883,-0.253448,-0.078829,0.210696,0.292306,0.057724,0.016968,-0.112577,-0.267563,0.243974,...,-0.096316,-0.429244,0.230744,0.070382,0.149575,0.022356,-0.238733,-0.167264,0.119482,-0.162003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99.33.67.175,,,,,,,,,,,...,,,,,,,,,,
99.56.117.152,0.016340,-0.089347,0.308378,0.113517,0.038953,-0.006461,-0.061006,0.001627,-0.085339,0.089262,...,-0.035471,-0.027251,0.070337,0.038632,0.068306,-0.125370,-0.065075,0.300907,0.086903,0.225926
99.64.62.168,0.083043,-0.151879,0.259837,0.093275,0.047666,0.092633,-0.166278,0.068377,-0.148245,0.112918,...,-0.086935,-0.085360,-0.043734,0.072640,0.208129,-0.118990,-0.133582,0.230133,0.100200,0.166968
99.8.85.178,0.008030,-0.158740,0.308997,0.118572,0.056532,-0.002407,-0.088884,-0.035197,-0.086778,0.094151,...,-0.032698,-0.044586,0.099849,0.046040,0.055413,-0.117757,-0.067943,0.270574,0.077880,0.191888


In [None]:
# Every line of the ports corpus is one sentence of comma-separated tokens.
with open('../data/task02/raw_data/corpus_ports.txt', 'r') as file:
    corpus = [x.split(',') for x in file.read().split('\n')]

# Initialize the model
word2vec = iWord2Vec(c=25, e=64, epochs=1, seed=15)
# Train the initialized model
word2vec.train(corpus)
# Retrieve the embeddings learned from the ports corpus
embeddings = word2vec.get_embeddings()

# Align the embeddings with the Task02 rows through the frame's own index
# (src_ip). The previous lookup used dataset['s_ip'], a Task01 column name that
# does not appear in the Task02 table preview.
# NOTE(review): assumes the corpus tokens are source IPs — confirm.
embeddings = embeddings.reindex(dataset.index)
embeddings['label'] = dataset.label

if not DEMO:
    # Export under the Task02 feature folder (was written into task01/).
    embeddings.to_csv('../data/task02/features/ports_embeddings.csv')

# Shape is (dataset rows, embedding columns + 'label'), not the vocabulary size.
print(embeddings.shape)
embeddings.head(3)

### Stratified k-fold

## Task03 - Traffic Categories

### Quantities

### Entities

### Stratified k-fold