In [2]:
import pandas as pd
import numpy as np
import glob
import os
from matplotlib import pyplot as plt

%matplotlib inline
plt.rcParams["figure.figsize"] = (20,3)

# Rassembler les données

Les données founies sont beaucoup trop volumineuses.

Ainsi, nous allons regrouper un enssemble de données permettant une detection d'anomalies efficace en se basant uniquement sur un plus petit set de données.

Notre méthodologie est la suivante:

    1) Récuperer un ensemble de requettes anormales
    2) Récuperer un ensemble de requettes non anormales proportionnel à ce qui à été pris précedemment

Nous souhaitons 100 Mo max de données afin que les différentes platformes puissent les utiliser simplement.
En prenant un fichier au hasard, nous savons que 5.100.000 requettes utilisent environ 438 Mo.

In [3]:
# valeure à changer si nous voulons plus ou moins de données
mo_voulues = 100

In [4]:
nb_tot_requettes = 5100000 * mo_voulues / 438
nb_tot_requettes

1164383.5616438356

Il nous faudrait ainsi environ 1.164.000 requettes.

Nous souhaitons 30% d'anomalie

In [5]:
nb_tot_requettes_anomalies = nb_tot_requettes * 0.3
nb_tot_requettes_anomalies

349315.0684931507

Ainsi, il nous faut environ 350.000 anomalies

In [6]:
nb_requettes_par_anomalies = nb_tot_requettes_anomalies / 7
nb_requettes_par_anomalies

49902.152641878674

Vu qu il y a 7 types d'anomalies, nous essayons d'en avoir 50.000 de chaque max.

In [7]:
# header des données
data_header = ['date_time', 'duration', 'ip_s', 'ip_d', 'port_s', 'port_d', 'prot', 'flag', 'fwd', 'tos', 'nb_packets', 'nb_bytes', 'data_type']

## Récuperer un ensemble de requettes anormales

In [8]:
path = r'data/anomalies/'

files_cat = {}

for file in os.listdir(path):
    cat = file.split('_')[0]
    if cat not in files_cat:
        files_cat[cat] = []
    files_cat[cat].append(file)
files_cat

{'botnet': ['botnet_august_week1.csv',
  'botnet_august_week2.csv',
  'botnet_july_week5.csv'],
 'dos': ['dos_august_week1_small.csv',
  'dos_august_week2.csv',
  'dos_july_week5_small.csv'],
 'scan11': ['scan11_august_week1.csv',
  'scan11_august_week2.csv',
  'scan11_july_week5.csv'],
 'scan44': ['scan44_august_week1_small.csv',
  'scan44_august_week2.csv',
  'scan44_july_week5_small.csv'],
 'spam': ['spam_april_week2.csv',
  'spam_april_week3_small.csv',
  'spam_april_week4.csv',
  'spam_april_week5.csv',
  'spam_august_week1_small.csv',
  'spam_august_week2_small.csv',
  'spam_august_week3_small.csv',
  'spam_august_week4_small.csv',
  'spam_july_week5.csv',
  'spam_june_week1.csv',
  'spam_june_week2.csv',
  'spam_june_week3.csv',
  'spam_june_week4_small.csv',
  'spam_may_week2.csv',
  'spam_may_week3.csv',
  'spam_may_week4_small.csv',
  'spam_may_week5.csv',
  'spam_may_week6.csv'],
 'sshscan': ['sshscan_april_week2_small.csv',
  'sshscan_april_week3_small.csv',
  'sshscan_apri

In [9]:
len(files_cat)
# il y a bien 7 categories

7

Nous voulons avoir des données les plus variées en limitant au mieux les trop grande redondance d'informations

In [10]:
res_ano = None

for cat, cat_files in files_cat.items():
    nb_requettes = nb_requettes_par_anomalies
    nb_files = len(cat_files)
    
    cat_df = []
    cat_df_to_min = []
    
    for file in cat_files:
        df = pd.read_csv(path + file, parse_dates=[0], index_col=[0], names=data_header, nrows=1000000)
        
        if len(df) <= nb_requettes / nb_files:
            cat_df.append(df)
            nb_requettes -= len(df)
            nb_files -= 1
        else:
            cat_df_to_min.append(df.sample(n = int(nb_requettes // nb_files)))
    
    change = True
    while change and nb_files:
        change = False
        for df in cat_df_to_min:
            if len(df) <= nb_requettes / nb_files:
                cat_df.append(df)
                nb_requettes -= len(df)
                nb_files -= 1
                change = True
    if nb_files:
        for df in cat_df_to_min:
            cat_df.append(df.sample(n = int(nb_requettes // nb_files)))
            
    cat_df.append(res_ano)
    res_ano = pd.concat(cat_df)

In [11]:
res_ano

Unnamed: 0_level_0,duration,ip_s,ip_d,port_s,port_d,prot,flag,fwd,tos,nb_packets,nb_bytes,data_type
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-08-01 04:14:09,0.000,217.156.59.213,42.219.147.185,5068,5057,UDP,.A....,0,40.0,1,432,anomaly-udpscan
2016-08-01 04:10:48,0.000,217.156.59.213,42.219.148.78,5062,6003,UDP,.A....,0,40.0,1,435,anomaly-udpscan
2016-08-01 04:14:10,0.000,217.156.59.213,42.219.146.56,5068,5038,UDP,.A....,0,40.0,1,435,anomaly-udpscan
2016-08-01 04:12:52,0.000,217.156.59.213,42.219.155.112,5066,7055,UDP,.A....,0,40.0,1,432,anomaly-udpscan
2016-08-01 04:12:50,0.000,217.156.59.213,42.219.152.28,5066,7035,UDP,.A....,0,40.0,1,435,anomaly-udpscan
...,...,...,...,...,...,...,...,...,...,...,...,...
2016-07-31 07:09:31,9.013,42.219.152.21,205.188.59.194,4103,25,TCP,....S.,0,0.0,6,288,nerisbotnet
2016-07-28 01:55:37,0.000,143.72.8.137,42.219.158.17,53,44923,UDP,.A....,0,0.0,1,86,nerisbotnet
2016-07-31 07:36:07,9.013,42.219.156.29,72.14.213.27,2541,25,TCP,....S.,0,0.0,6,288,nerisbotnet
2016-07-28 01:20:06,0.000,143.72.8.137,42.219.156.27,53,59297,UDP,.A....,0,0.0,1,86,nerisbotnet


In [12]:
# on enregistre le résultat
#res_ano.reset_index().to_csv("data/anomalies.csv", index=False, header=None)

## Récuperer un ensemble de requettes non anormales

In [13]:
nb_tot_requettes_normales = nb_tot_requettes_anomalies * 0.7 / 0.3
nb_tot_requettes_normales

815068.493150685

Si nous voulons 70% de données normales, nous prenons environ 815000 requettes.

Nous prenons les données de la semaine 1 de aout.

D'apres nos calculs:  
~840 millions de lignes dans la week 1  
~120 millions de lignes par jours  
~5 millions de lignes par heures

Afin de diversifier nos données, nous allons prélever des parties de 100 000 requettes à travers notre fichier.

Pour executer la suite, il faut au préalable avoir découper notre fichier en plusieures plus petit.
Nous avons utilisé 7zip pour découper notre fichier de 80Go en fichier de 100Mo

In [20]:
path = r'D:/to_delete/week_1/' # use your path
all_files = glob.glob(path + "/august.week1.zip*")

subdatas = []

for file in all_files:
    print(file)
    df = pd.read_csv(file, skiprows=2, parse_dates=[0], index_col=[0], names=data_header, nrows=100000, encoding_errors = 'ignore')
    subdatas.append(df[df.data_type == "background"])
    
subdata = pd.concat(subdatas)

D:/to_delete/week_1\august.week1.zip.001
D:/to_delete/week_1\august.week1.zip.002
D:/to_delete/week_1\august.week1.zip.003
D:/to_delete/week_1\august.week1.zip.004
D:/to_delete/week_1\august.week1.zip.005
D:/to_delete/week_1\august.week1.zip.006
D:/to_delete/week_1\august.week1.zip.007
D:/to_delete/week_1\august.week1.zip.008
D:/to_delete/week_1\august.week1.zip.009
D:/to_delete/week_1\august.week1.zip.010
D:/to_delete/week_1\august.week1.zip.011
D:/to_delete/week_1\august.week1.zip.012
D:/to_delete/week_1\august.week1.zip.013
D:/to_delete/week_1\august.week1.zip.014
D:/to_delete/week_1\august.week1.zip.015
D:/to_delete/week_1\august.week1.zip.016
D:/to_delete/week_1\august.week1.zip.017
D:/to_delete/week_1\august.week1.zip.018
D:/to_delete/week_1\august.week1.zip.019
D:/to_delete/week_1\august.week1.zip.020
D:/to_delete/week_1\august.week1.zip.021
D:/to_delete/week_1\august.week1.zip.022
D:/to_delete/week_1\august.week1.zip.023
D:/to_delete/week_1\august.week1.zip.024
D:/to_delete/wee

D:/to_delete/week_1\august.week1.zip.201
D:/to_delete/week_1\august.week1.zip.202
D:/to_delete/week_1\august.week1.zip.203
D:/to_delete/week_1\august.week1.zip.204
D:/to_delete/week_1\august.week1.zip.205
D:/to_delete/week_1\august.week1.zip.206
D:/to_delete/week_1\august.week1.zip.207
D:/to_delete/week_1\august.week1.zip.208
D:/to_delete/week_1\august.week1.zip.209
D:/to_delete/week_1\august.week1.zip.210
D:/to_delete/week_1\august.week1.zip.211
D:/to_delete/week_1\august.week1.zip.212
D:/to_delete/week_1\august.week1.zip.213
D:/to_delete/week_1\august.week1.zip.214
D:/to_delete/week_1\august.week1.zip.215
D:/to_delete/week_1\august.week1.zip.216
D:/to_delete/week_1\august.week1.zip.217
D:/to_delete/week_1\august.week1.zip.218
D:/to_delete/week_1\august.week1.zip.219
D:/to_delete/week_1\august.week1.zip.220
D:/to_delete/week_1\august.week1.zip.221
D:/to_delete/week_1\august.week1.zip.222
D:/to_delete/week_1\august.week1.zip.223
D:/to_delete/week_1\august.week1.zip.224
D:/to_delete/wee

D:/to_delete/week_1\august.week1.zip.401
D:/to_delete/week_1\august.week1.zip.402
D:/to_delete/week_1\august.week1.zip.403
D:/to_delete/week_1\august.week1.zip.404
D:/to_delete/week_1\august.week1.zip.405
D:/to_delete/week_1\august.week1.zip.406
D:/to_delete/week_1\august.week1.zip.407
D:/to_delete/week_1\august.week1.zip.408
D:/to_delete/week_1\august.week1.zip.409
D:/to_delete/week_1\august.week1.zip.410
D:/to_delete/week_1\august.week1.zip.411
D:/to_delete/week_1\august.week1.zip.412
D:/to_delete/week_1\august.week1.zip.413
D:/to_delete/week_1\august.week1.zip.414
D:/to_delete/week_1\august.week1.zip.415
D:/to_delete/week_1\august.week1.zip.416
D:/to_delete/week_1\august.week1.zip.417
D:/to_delete/week_1\august.week1.zip.418
D:/to_delete/week_1\august.week1.zip.419
D:/to_delete/week_1\august.week1.zip.420
D:/to_delete/week_1\august.week1.zip.421
D:/to_delete/week_1\august.week1.zip.422
D:/to_delete/week_1\august.week1.zip.423
D:/to_delete/week_1\august.week1.zip.424
D:/to_delete/wee

D:/to_delete/week_1\august.week1.zip.601
D:/to_delete/week_1\august.week1.zip.602
D:/to_delete/week_1\august.week1.zip.603
D:/to_delete/week_1\august.week1.zip.604
D:/to_delete/week_1\august.week1.zip.605
D:/to_delete/week_1\august.week1.zip.606
D:/to_delete/week_1\august.week1.zip.607
D:/to_delete/week_1\august.week1.zip.608
D:/to_delete/week_1\august.week1.zip.609
D:/to_delete/week_1\august.week1.zip.610
D:/to_delete/week_1\august.week1.zip.611
D:/to_delete/week_1\august.week1.zip.612
D:/to_delete/week_1\august.week1.zip.613
D:/to_delete/week_1\august.week1.zip.614
D:/to_delete/week_1\august.week1.zip.615
D:/to_delete/week_1\august.week1.zip.616
D:/to_delete/week_1\august.week1.zip.617
D:/to_delete/week_1\august.week1.zip.618
D:/to_delete/week_1\august.week1.zip.619
D:/to_delete/week_1\august.week1.zip.620
D:/to_delete/week_1\august.week1.zip.621
D:/to_delete/week_1\august.week1.zip.622
D:/to_delete/week_1\august.week1.zip.623
D:/to_delete/week_1\august.week1.zip.624
D:/to_delete/wee

In [21]:
subdata

Unnamed: 0_level_0,duration,ip_s,ip_d,port_s,port_d,prot,flag,fwd,tos,nb_packets,nb_bytes,data_type
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-08-01 00:10:09,0.052,42.219.157.8,65.247.77.241,56097,443,TCP,.AP.S.,0,0,3,190,background
2016-08-01 00:10:09,0.052,42.219.157.8,65.247.77.242,56096,443,TCP,.AP.S.,0,0,3,190,background
2016-08-01 00:10:09,0.256,42.219.156.211,133.133.140.122,443,58676,TCP,.AP.S.,0,0,6,5298,background
2016-08-01 00:10:09,0.280,42.219.159.92,187.41.229.167,55674,80,TCP,.AP.S.,0,0,5,439,background
2016-08-01 00:10:09,0.828,42.219.159.85,43.164.44.49,42068,443,TCP,.AP.S.,0,0,12,1829,background
...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08-07 23:45:27,0.000,70.175.188.109,42.219.154.107,1099,80,TCP,.A...F,0,0,2,80,background
2016-08-07 23:45:27,0.000,74.170.141.211,42.219.153.128,59693,80,TCP,.A.R..,0,72,1,40,background
2016-08-07 23:45:27,0.000,74.172.110.134,42.219.153.128,59682,80,TCP,.A.R..,0,72,1,40,background
2016-08-07 23:45:27,0.000,74.175.30.240,42.219.153.128,59678,80,TCP,.A.R..,0,72,1,40,background


In [22]:
# on enregistre le résultat
subdata.reset_index().to_csv("D:/to_delete/week_1/august_week1_small.csv", index=False, header=None)

In [23]:
subdata.prot.value_counts()

TCP      54033493
UDP      19432985
ICMP       909408
GRE         63197
ESP         27043
IPIP         6753
IPv6         2823
246             4
RVD             1
ISIS4           1
160             1
156             1
Name: prot, dtype: int64

Nous pouvons voir des protocols peux communs. Ce ne sont pas des erreurs.

In [24]:
subdata.data_type.value_counts()

background    74475710
Name: data_type, dtype: int64

In [39]:
#basic_protocols = ['TCP', 'UDP', 'ICMP', 'GRE', 'ESP', 'IPIP', 'IPv6']

In [25]:
prot_infos = subdata.prot.value_counts()
prot_infos = list(zip(prot_infos, prot_infos.index))
prot_infos.reverse()
prot_infos

[(1, '156'),
 (1, '160'),
 (1, 'ISIS4'),
 (1, 'RVD'),
 (4, '246'),
 (2823, 'IPv6'),
 (6753, 'IPIP'),
 (27043, 'ESP'),
 (63197, 'GRE'),
 (909408, 'ICMP'),
 (19432985, 'UDP'),
 (54033493, 'TCP')]

In [26]:
sub_res_normal = []

nb_prot = len(prot_infos)
nb_requests = nb_tot_requettes_normales

for nb, prot in prot_infos:
    print(prot)
    if nb <= nb_requests / nb_prot:
        sub_res_normal.append(subdata[subdata.prot == prot])
    else:
        sub_res_normal.append(subdata[subdata.prot == prot].sample(n = int(nb_requests // nb_prot)))
    nb_prot -= 1
    nb_requests -= len(sub_res_normal[-1])

res_normal = pd.concat(sub_res_normal)

156
160
ISIS4
RVD
246
IPv6
IPIP
ESP
GRE
ICMP
UDP
TCP


In [27]:
res_normal

Unnamed: 0_level_0,duration,ip_s,ip_d,port_s,port_d,prot,flag,fwd,tos,nb_packets,nb_bytes,data_type
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-08-07 10:05:50,0.000,87.168.122.66,42.219.144.247,0,0,156,.A....,0,1,1,1297,background
2016-08-06 18:15:11,0.000,87.168.122.66,42.219.148.67,0,0,160,.A....,0,0,1,269,background
2016-08-05 23:18:53,0.000,87.168.122.66,42.219.144.188,0,0,ISIS4,.A....,0,1,1,1119,background
2016-08-04 03:33:57,0.000,250.51.24.126,42.219.146.141,0,0,RVD,.A....,0,2,1,26,background
2016-08-05 16:00:55,0.000,87.168.122.66,42.219.147.28,0,0,246,.A....,0,1,1,82,background
...,...,...,...,...,...,...,...,...,...,...,...,...
2016-08-02 02:22:30,0.084,201.171.238.25,42.219.156.211,63852,80,TCP,.AP.SF,0,0,5,634,background
2016-08-04 16:07:58,2.584,194.233.94.148,42.219.158.186,63758,80,TCP,.AP.SF,0,0,33,2693,background
2016-08-04 02:12:15,0.032,42.219.156.211,165.131.216.197,80,53905,TCP,.AP.SF,0,0,4,888,background
2016-08-02 15:13:58,0.316,145.186.9.79,42.219.159.92,443,49479,TCP,.AP.S.,0,0,10,4664,background


In [31]:
res_normal.data_type.value_counts()

background    815068
Name: data_type, dtype: int64

In [28]:
# on enregistre le résultat
res_normal.reset_index().to_csv("data/normal.csv", index=False, header=None)

In [29]:
res = pd.concat([pd.read_csv("data/anomalies.csv", parse_dates=[0], index_col=[0], names=data_header), res_normal])

In [30]:
res.reset_index().to_csv("data/data_base.csv", index=False, header=None)