In [9]:
import pandas as pd
import random
import numpy as np
from datetime import datetime, timedelta
import math
import gensim
from gensim.models import Word2Vec 
import time

In [10]:
def sample(mode, files=None, p=0.01, seed=None):
    """Draw a random row sample from the daily label CSVs and clean it.

    Steps, in order: read every file keeping the header and roughly a
    fraction ``p`` of the data rows; drop the address/list columns; discard
    rows whose ``ts`` or ``label`` is not numeric; cast ``label`` to int;
    for ``mode == 'train'`` balance label 0 against label 1; shuffle.

    Parameters
    ----------
    mode : str
        ``'train'`` selects the 2019-01-10..19 files and enables class
        balancing. Any other value selects the 2019-01-20..23 files and
        skips balancing.
    files : list of str, optional
        CSV paths to read instead of the built-in list for ``mode``.
    p : float, optional
        Probability of keeping each data row (default 0.01, i.e. ~1%).
    seed : int, optional
        When given, row skipping, balancing and shuffling are seeded with it
        so the call is repeatable. When omitted the global ``random`` and
        numpy states are used, as before.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the columns to drop, ``ts`` or ``label`` is missing.
    ValueError
        If ``files`` is empty (raised by ``pd.concat``).
    """
    if files is None:
        if mode == 'train':
            files = [
                '/scratch/by8jj/labels/merged_2019-01-10.csv',
                '/scratch/by8jj/labels/merged_2019-01-11.csv',
                '/scratch/by8jj/labels/merged_2019-01-12.csv',
                '/scratch/by8jj/labels/merged_2019-01-13.csv',
                '/scratch/by8jj/labels/merged_2019-01-14.csv',
                '/scratch/by8jj/labels/merged_2019-01-15.csv',
                '/scratch/by8jj/labels/merged_2019-01-16.csv',
                '/scratch/by8jj/labels/merged_2019-01-17.csv',
                '/scratch/by8jj/labels/merged_2019-01-18.csv',
                '/scratch/by8jj/labels/merged_2019-01-19.csv',
            ]
        else:
            files = [
                '/scratch/by8jj/labels/merged_2019-01-20.csv',
                '/scratch/by8jj/labels/merged_2019-01-21.csv',
                '/scratch/by8jj/labels/merged_2019-01-22.csv',
                '/scratch/by8jj/labels/merged_2019-01-23.csv',
            ]

    # Without a seed keep using the module-level generator so that a caller's
    # own random.seed(...) still has an effect.
    rand = random.random if seed is None else random.Random(seed).random

    # Row 0 is the header and is always kept; every other row survives with
    # probability p.
    frames = [
        pd.read_csv(path, header=0,
                    skiprows=lambda row: row > 0 and rand() > p)
        for path in files
    ]
    # ignore_index: each file restarts its index at 0, and duplicate labels
    # would make the boolean filtering below ambiguous.
    df = pd.concat(frames, sort=False, ignore_index=True)

    # drop some columns
    drop_columns = ['src_ip', 'dest_ip', 'blacklist', 'honeypot', 'whitelist', 'src_ip_ext']
    df = df.drop(drop_columns, axis=1)

    # keep only rows whose ts and label both parse as numbers
    label_numeric = pd.to_numeric(df['label'], errors='coerce')
    keep = pd.to_numeric(df['ts'], errors='coerce').notnull() & label_numeric.notnull()
    df = df.loc[keep].copy()

    # convert label to an integer column; going through to_numeric also
    # accepts strings such as "1.0", which int() would reject
    df['label'] = label_numeric[keep].astype(int)

    # balance: down-sample whichever class is larger so both end up the same
    # size (rows with any other label are discarded, as before)
    if mode == "train":
        benign = df.loc[df.label == 0]
        malicious = df.loc[df.label == 1]
        per_class = min(len(benign), len(malicious))
        df = pd.concat([
            malicious.sample(n=per_class, random_state=seed),
            benign.sample(n=per_class, random_state=seed),
        ])

    # shuffle dataset
    df = df.sample(frac=1, random_state=seed)

    return df.reset_index(drop=True)
    

In [11]:
def _port_class(ports):
    """Bucket port numbers: 0 for 0-1023, 1 for 1024-49151, 2 for anything else."""
    # Coerce first: ports read from CSV may arrive as strings, and a string
    # never compares equal to an int, which would push every row into class 2.
    numeric = pd.to_numeric(ports, errors='coerce')
    codes = np.select(
        [numeric.between(0, 1023), numeric.between(1024, 49151)],
        [0, 1],
        default=2,
    )
    # Fixed categories make get_dummies emit the same columns for every
    # sample, even when one class happens to be absent.
    return pd.Categorical(codes, categories=[0, 1, 2])


def _fit_char_vectors(sentences, dim):
    """Train a Word2Vec model on character sequences with ``dim``-sized vectors."""
    try:
        return Word2Vec(sentences, vector_size=dim)  # keyword used by gensim >= 4
    except TypeError:
        return Word2Vec(sentences, size=dim)  # keyword used by gensim 3.x


def transfer(df):
    """Turn a sampled connection frame into a numeric feature frame.

    Transformations, in order:

    1. one-hot encode ``conn_state`` (first level dropped);
    2. replace the placeholder ``'-'`` with 0 everywhere;
    3. derive the hour of day from ``ts`` and encode it on the unit circle as
       ``xhr`` (sine) and ``yhr`` (cosine); rows whose ``ts`` cannot be
       converted are printed and dropped;
    4. bucket ``dest_port`` and ``src_port`` into three classes and one-hot
       encode them (first class dropped, so ``*_1`` and ``*_2`` remain);
    5. apply ``log(0.1 + x)`` to ``duration``, ``src_bytes``, ``dest_bytes``,
       ``src_pkts`` and ``dest_pkts``;
    6. embed ``history`` as the sum of 5-dimensional Word2Vec vectors of its
       characters, appended as columns ``0``..``4``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``conn_state``, ``ts``, ``dest_port``, ``src_port``,
        ``history`` and the five columns listed in step 5. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The feature frame without ``ts`` and ``history``.

    Notes
    -----
    The hour is computed with ``datetime.fromtimestamp`` and therefore
    depends on the time zone of the machine running the code. The Word2Vec
    model is trained on the frame passed in, so the embedding columns of two
    separate calls (e.g. train and test) are not in the same vector space.
    """
    # get dummy conn state
    df = pd.get_dummies(df, columns=['conn_state'], drop_first=True)

    # assumption 1: replace missing value by 0
    df = df.replace('-', 0)

    # assumption 2: convert ts into hour
    # NOTE(review): the original wrote `- timedelta(hours=-5)`, i.e. it ADDS
    # five hours to the machine-local time. That is kept unchanged here;
    # confirm whether a shift to UTC-5 (subtracting) was intended.
    hour_shift = timedelta(hours=5)
    hours, parsed = [], []
    for raw_ts in df['ts']:
        try:
            shifted = datetime.fromtimestamp(int(float(raw_ts))) + hour_shift
        except (TypeError, ValueError, OverflowError, OSError):
            print(raw_ts)
            parsed.append(False)
            continue
        hours.append(shifted.hour)
        parsed.append(True)

    # Drop unparsable rows by position. Comparing against the bad value
    # (as before) cannot remove NaN, because NaN != NaN.
    df = df.loc[np.asarray(parsed, dtype=bool)].copy()
    angle = 2 * math.pi * np.asarray(hours, dtype=float) / 24
    df['xhr'] = np.sin(angle)
    df['yhr'] = np.cos(angle)

    # assumption 3: convert port number into 3 categories
    # (well-known 0-1023, registered 1024-49151, everything else; the upper
    # bound used to be 4915, which does not match any IANA boundary)
    for port_col in ('dest_port', 'src_port'):
        df[port_col] = _port_class(df[port_col])
        df = pd.get_dummies(df, columns=[port_col], drop_first=True)

    # assumption 4: log transformation (0.1 offset keeps zeros finite)
    for col in ('duration', 'src_bytes', 'dest_bytes', 'src_pkts', 'dest_pkts'):
        df[col] = np.log(0.1 + pd.to_numeric(df[col]))

    # Word2Vec: every history string is a "sentence" of single characters
    dim = 5
    sentences = [list(str(value)) for value in df['history']]
    model = _fit_char_vectors(sentences, dim)

    # Look each distinct character up once. Characters the model did not keep
    # in its vocabulary (e.g. dropped by gensim's min_count filter) are
    # skipped rather than raising KeyError.
    vectors = {}
    for chars in sentences:
        for ch in chars:
            if ch not in vectors and ch in model.wv:
                vectors[ch] = model.wv[ch]

    # Sum the vectors of the characters of each history; a history with no
    # known character stays at zero.
    embedded = np.zeros((len(sentences), dim), dtype=np.float32)
    for row, chars in enumerate(sentences):
        for ch in chars:
            if ch in vectors:
                embedded[row] += vectors[ch]

    # Reuse df's index so the embedding lines up row by row even after rows
    # were dropped above.
    df = pd.concat([df, pd.DataFrame(data=embedded, index=df.index)], axis=1)

    return df.drop(['ts', 'history'], axis=1)

In [12]:
# Build one dataset per split. For every split two CSVs are written to the
# working directory: the cleaned sample ("<split><run>original.csv") and the
# engineered features ("<split><run>.csv"). The wall-clock time per split is
# printed afterwards.
modes = ['test', 'train']

for j in range(6, 7):  # run number used in the output file names
    run_tag = str(j)
    for i in modes:
        start_time = time.time()

        # raw sample
        df = sample(i)
        df.to_csv(i + run_tag + 'original.csv', index=False)

        # engineered features
        df = transfer(df)
        df.to_csv(i + run_tag + '.csv', index=False)

        elapsed = time.time() - start_time
        print("--- %s seconds ---" % elapsed)



--- 394.0628750324249 seconds ---


  exec(code_obj, self.user_global_ns, self.user_ns)


--- 873.6961815357208 seconds ---
