In [44]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile
import sys
import time

import pandas as pd
import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector


###### USER PARAMETERS ########
# input data file
input_filename = "./corpus_example.csv"
split_sign = ","
# path to store embeddings and models
save_embeddings = "corpus_embeddings"
min_oc = 5
embedding_size = 100 # Size of the embedding
num_epochs = float(1000)
###### END USER PARMETERS #####


###### Parameters for the Embedding ####
batch_size = 256     # Consider for each learning step X flows
num_sampled = 32
    # Number of training epochs
valid_size = 16     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
###### END PARAMETERS FOR THE EMBEDDING #####


####### FUNCTIONS ###############

# Reads the file with the preprocessed flow-based data and returns a list with all "words"
# The file must have the following structure:
#   srcip, context_attribute1, context_attribute_2, ...
def read_file_and_convert_to_list(input_filename):
  # Stores the number of attributes (target and context) for each flow
  global num_elems
  # Read the file line by line
  with open(input_filename, 'r') as f:
    data = tf.compat.as_str(f.read()).splitlines()
    print(data)
    num_elems = len(data[0].split(","))
  # Convert to list
  res = []
  for line in data:
      for word in line.split(","):
          res.append(word.strip())
  return res, len(data)


# This functions builds the data set (all values are converted to ids)
def build_dataset(words):
    tmp = []
    tmp.extend(collections.Counter(words).most_common())
    
    #tmp에 단어별 빈도 리스트 저장
    
    #size = tmp의 요소개수
    size = len(tmp)
    
    for until in range(len(tmp)):
        #d = tmp[n]
        #unique word
        d = tmp[until]
        
        #c = d[1] //단어의 빈도
        c = d[1]
        
        if c < min_oc:
            size = until+1+2
            break

     # Count the frequency of all values
    count = [['UNKOWN_IP',-1],['UNKOWN_dPt',-1]]
    count.extend(collections.Counter(words).most_common(size-2))

    # Build the dictionary
    # e.g. ("129.3.3.3" , 2)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    # Build the reverse dictionary
    reverse_dictionary = dict(zip(dictionary.values(),dictionary.keys()))

    # Transfer the list of words to a list of IDs
    data = list()
    unk_count_ip = 0
    unk_count_dpt = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            if len(word.split('.')) == 4:
                index = 0
                unk_count_ip = unk_count_ip + 1
            else:
                index = 1
                unk_count_dpt = unk_count_dpt + 1
        data.append(index)

    count[0][1] = unk_count_ip
    count[1][1] = unk_count_dpt

    return data, count, dictionary, reverse_dictionary, size



# Generates the next batch
def generate_batch():
    # use the global data_index variable (recent pointer)
    global data_index
    # create variables for the training sets
    batch = np.ndarray(shape=(training_pairs),dtype=np.int32)
    labels= np.ndarray(shape=(training_pairs,1), dtype=np.int32)

    # Read batch_size flows and create training sets
    for i in range(batch_size):
        for j in range(1,num_elems):
            batch[i*pairs+j-1] = data[data_index]
            labels[i*pairs+j-1,0] = data[data_index+j]

        batch[i*pairs+(num_elems)-1] = data[data_index+2]
        labels[i*pairs+(num_elems)-1,0] = data[data_index+1]

        batch[i*pairs+(num_elems)] = data[data_index+3]
        labels[i*pairs+(num_elems),0] = data[data_index+1]
        # Check if end of training list is reached
        data_index = (data_index + num_elems) % len_value

    return batch, labels

######## END FUNCTIONS #######

#### Global variales
# Recent pointer for data
data_index = 0
# Number of attributes per line
num_elems = -1
#### End Global variales
loss_list = []

#### STEP 1: Read the file
print("Step 1: Read the file")
input_values, num_lines = read_file_and_convert_to_list(input_filename)
len_value = len(input_values)
# define the number of of extractable pairs per flow
pairs = num_elems - 1 + 2
# The number of training pers created per flow
training_pairs = pairs * batch_size


#### Step 2: Build the data set
print("Step 2: Build the data set")
data, count, dictionary, reverse_dictionary, vocabluary_size = build_dataset(input_values)
print("Voc: --- ", len(dictionary))
print("VOC: ", vocabluary_size)
generate_batch()


#### Step 3: Build the model
print("Step 3: Build the model")

graph = tf.Graph()

with graph.as_default():
    # Input data
    train_inputs = tf.placeholder(tf.int32,shape=[training_pairs])
    train_labels = tf.placeholder(tf.int32,shape=[training_pairs,1])
    valid_dataset= tf.constant(valid_examples, dtype=tf.int32)

    with tf.device('/gpu:0'):
        # Look up embedding for inputs
        embeddings = tf.Variable(tf.random_uniform([vocabluary_size,embedding_size],-1.0,1.0))
        embed = tf.nn.embedding_lookup(embeddings,train_inputs)

        # Construct the variables for NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabluary_size,embedding_size],stddev=1.0 / math.sqrt(embedding_size)) )
        nce_biases = tf.Variable(tf.zeros([vocabluary_size]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabluary_size))

        # Construct the SGD optimizer using a learning rate of 1.0
        optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

        # compute
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm

        # init
        init = tf.global_variables_initializer()


# Step 4: Train the model
print("Step 4: Train the model")

# calculate the number of training steps
num_steps = int(num_lines / batch_size * num_epochs)
config = tf.ConfigProto(allow_soft_placement = True)

sess = tf.Session(graph=graph, config=config)
with tf.Session(graph=graph, config=config) as session:
    # Init
    init.run()
    print("Step 4.1: Initialized")

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch()
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print("Average loss at step", step, ": ", average_loss, " from ", num_steps ," steps.")
            average_loss = 0
        percentage = (step*100)/num_steps
        print("Percentage: ",percentage," %")

        loss_list.append(loss_val)
    # Save the embedding
    to_save = session.run(normalized_embeddings)
    target = save_embeddings
    dataframe = pd.DataFrame(data=to_save[0:,0:])
    vals = []
    for u in range(0,len(to_save)):
        vals.append(reverse_dictionary.get(u))
    dataframe['values'] = vals
    dataframe.to_csv(target,sep=",")
    fobi = pd.DataFrame(loss_list)
    fobi.to_csv(save_embeddings)
    
    
    








Step 1: Read the file
['Proto,SrcAddr,DstAddr,Dport', 'tcp,93.45.239.29,147.32.84.118,6881', 'tcp,62.240.166.118,147.32.84.229,13363', 'tcp,147.32.86.148,66.235.132.232,80', 'tcp,147.32.3.51,147.32.84.46,10010', 'tcp,88.212.37.169,147.32.84.118,6881', 'tcp,94.44.60.103,147.32.84.118,6881', 'tcp,2.159.127.100,147.32.84.118,6881', 'tcp,213.233.154.219,147.32.84.229,13363', 'tcp,88.212.37.169,147.32.84.118,6881', 'tcp,95.210.161.212,147.32.84.118,6881', 'tcp,94.44.60.103,147.32.84.118,6881', 'tcp,85.132.162.9,147.32.84.118,6881', 'tcp,88.212.37.169,147.32.84.118,6881', 'tcp,140.115.25.74,147.32.84.229,13363', 'tcp,94.44.60.103,147.32.84.118,6881', 'tcp,140.115.25.74,147.32.84.229,443', 'tcp,140.115.25.74,147.32.84.229,80', 'tcp,140.115.25.74,147.32.84.229,13363', 'tcp,140.115.25.74,147.32.84.229,443', 'tcp,140.115.25.74,147.32.84.229,80', 'tcp,147.32.84.118,203.23.120.136,6881', 'tcp,122.174.15.39,147.32.84.2,80', 'tcp,85.132.162.9,147.32.84.118,6881', 'tcp,122.174.15.39,147.32.84.2,80', 

Step 4: Train the model
Step 4.1: Initialized
Average loss at step 0 :  24.55733871459961  from  1917  steps.
Percentage:  0.0  %
Percentage:  0.05216484089723526  %
Percentage:  0.10432968179447052  %
Percentage:  0.1564945226917058  %
Percentage:  0.20865936358894105  %
Percentage:  0.2608242044861763  %
Percentage:  0.3129890453834116  %
Percentage:  0.3651538862806468  %
Percentage:  0.4173187271778821  %
Percentage:  0.4694835680751174  %
Percentage:  0.5216484089723527  %
Percentage:  0.5738132498695879  %
Percentage:  0.6259780907668232  %
Percentage:  0.6781429316640585  %
Percentage:  0.7303077725612936  %
Percentage:  0.7824726134585289  %
Percentage:  0.8346374543557642  %
Percentage:  0.8868022952529995  %
Percentage:  0.9389671361502347  %
Percentage:  0.99113197704747  %
Percentage:  1.0432968179447053  %
Percentage:  1.0954616588419406  %
Percentage:  1.1476264997391759  %
Percentage:  1.1997913406364111  %
Percentage:  1.2519561815336464  %
Percentage:  1.30412102243088

Percentage:  13.041210224308816  %
Percentage:  13.09337506520605  %
Percentage:  13.145539906103286  %
Percentage:  13.197704747000522  %
Percentage:  13.249869587897757  %
Percentage:  13.302034428794991  %
Percentage:  13.354199269692227  %
Percentage:  13.406364110589463  %
Percentage:  13.458528951486699  %
Percentage:  13.510693792383933  %
Percentage:  13.562858633281168  %
Percentage:  13.615023474178404  %
Percentage:  13.66718831507564  %
Percentage:  13.719353155972874  %
Percentage:  13.77151799687011  %
Percentage:  13.823682837767345  %
Percentage:  13.87584767866458  %
Percentage:  13.928012519561815  %
Percentage:  13.98017736045905  %
Percentage:  14.032342201356286  %
Percentage:  14.084507042253522  %
Percentage:  14.136671883150756  %
Percentage:  14.188836724047992  %
Percentage:  14.241001564945227  %
Percentage:  14.293166405842463  %
Percentage:  14.345331246739697  %
Percentage:  14.397496087636933  %
Percentage:  14.449660928534168  %
Percentage:  14.501825769

Percentage:  25.66510172143975  %
Percentage:  25.717266562336984  %
Percentage:  25.76943140323422  %
Percentage:  25.821596244131456  %
Percentage:  25.87376108502869  %
Percentage:  25.925925925925927  %
Percentage:  25.97809076682316  %
Percentage:  26.030255607720395  %
Percentage:  26.082420448617633  %
Percentage:  26.134585289514867  %
Percentage:  26.1867501304121  %
Percentage:  26.238914971309338  %
Percentage:  26.291079812206572  %
Percentage:  26.34324465310381  %
Percentage:  26.395409494001044  %
Percentage:  26.447574334898277  %
Percentage:  26.499739175795515  %
Percentage:  26.55190401669275  %
Percentage:  26.604068857589983  %
Percentage:  26.65623369848722  %
Percentage:  26.708398539384454  %
Percentage:  26.760563380281692  %
Percentage:  26.812728221178926  %
Percentage:  26.86489306207616  %
Percentage:  26.917057902973397  %
Percentage:  26.96922274387063  %
Percentage:  27.021387584767865  %
Percentage:  27.073552425665103  %
Percentage:  27.125717266562337

Percentage:  38.60198226395409  %
Percentage:  38.65414710485133  %
Percentage:  38.70631194574857  %
Percentage:  38.7584767866458  %
Percentage:  38.810641627543035  %
Percentage:  38.86280646844027  %
Percentage:  38.91497130933751  %
Percentage:  38.967136150234744  %
Percentage:  39.01930099113198  %
Percentage:  39.07146583202921  %
Percentage:  39.123630672926446  %
Percentage:  39.17579551382368  %
Percentage:  39.22796035472092  %
Percentage:  39.280125195618155  %
Percentage:  39.33229003651539  %
Percentage:  39.38445487741262  %
Percentage:  39.436619718309856  %
Percentage:  39.4887845592071  %
Percentage:  39.54094940010433  %
Percentage:  39.593114241001565  %
Percentage:  39.6452790818988  %
Percentage:  39.69744392279603  %
Percentage:  39.749608763693274  %
Percentage:  39.80177360459051  %
Percentage:  39.85393844548774  %
Percentage:  39.906103286384976  %
Percentage:  39.95826812728221  %
Percentage:  40.010432968179444  %
Percentage:  40.062597809076685  %
Percent

Percentage:  51.17370892018779  %
Percentage:  51.225873761085026  %
Percentage:  51.27803860198227  %
Percentage:  51.3302034428795  %
Percentage:  51.382368283776735  %
Percentage:  51.43453312467397  %
Percentage:  51.4866979655712  %
Percentage:  51.53886280646844  %
Percentage:  51.59102764736568  %
Percentage:  51.64319248826291  %
Percentage:  51.695357329160146  %
Percentage:  51.74752217005738  %
Percentage:  51.799687010954614  %
Percentage:  51.851851851851855  %
Percentage:  51.90401669274909  %
Percentage:  51.95618153364632  %
Percentage:  52.00834637454356  %
Percentage:  52.06051121544079  %
Percentage:  52.11267605633803  %
Percentage:  52.164840897235266  %
Percentage:  52.2170057381325  %
Percentage:  52.26917057902973  %
Percentage:  52.32133541992697  %
Percentage:  52.3735002608242  %
Percentage:  52.42566510172144  %
Percentage:  52.477829942618676  %
Percentage:  52.52999478351591  %
Percentage:  52.582159624413144  %
Percentage:  52.63432446531038  %
Percentage

Percentage:  64.58007303077726  %
Percentage:  64.63223787167449  %
Percentage:  64.68440271257172  %
Percentage:  64.73656755346896  %
Percentage:  64.78873239436619  %
Percentage:  64.84089723526343  %
Percentage:  64.89306207616067  %
Percentage:  64.94522691705791  %
Percentage:  64.99739175795514  %
Percentage:  65.04955659885238  %
Percentage:  65.10172143974961  %
Percentage:  65.15388628064684  %
Percentage:  65.20605112154408  %
Percentage:  65.25821596244131  %
Percentage:  65.31038080333855  %
Percentage:  65.36254564423578  %
Percentage:  65.41471048513301  %
Percentage:  65.46687532603026  %
Percentage:  65.5190401669275  %
Percentage:  65.57120500782473  %
Percentage:  65.62336984872196  %
Percentage:  65.6755346896192  %
Percentage:  65.72769953051643  %
Percentage:  65.77986437141367  %
Percentage:  65.8320292123109  %
Percentage:  65.88419405320813  %
Percentage:  65.93635889410537  %
Percentage:  65.9885237350026  %
Percentage:  66.04068857589985  %
Percentage:  66.09

Percentage:  77.41262389149713  %
Percentage:  77.46478873239437  %
Percentage:  77.5169535732916  %
Percentage:  77.56911841418884  %
Percentage:  77.62128325508607  %
Percentage:  77.6734480959833  %
Percentage:  77.72561293688054  %
Percentage:  77.77777777777777  %
Percentage:  77.82994261867502  %
Percentage:  77.88210745957225  %
Percentage:  77.93427230046949  %
Percentage:  77.98643714136672  %
Percentage:  78.03860198226396  %
Percentage:  78.09076682316119  %
Percentage:  78.14293166405842  %
Percentage:  78.19509650495566  %
Percentage:  78.24726134585289  %
Percentage:  78.29942618675013  %
Percentage:  78.35159102764736  %
Percentage:  78.40375586854461  %
Percentage:  78.45592070944184  %
Percentage:  78.50808555033908  %
Percentage:  78.56025039123631  %
Percentage:  78.61241523213354  %
Percentage:  78.66458007303078  %
Percentage:  78.71674491392801  %
Percentage:  78.76890975482524  %
Percentage:  78.82107459572248  %
Percentage:  78.87323943661971  %
Percentage:  78.

Percentage:  90.0886802295253  %
Percentage:  90.14084507042253  %
Percentage:  90.19300991131978  %
Percentage:  90.24517475221701  %
Percentage:  90.29733959311424  %
Percentage:  90.34950443401148  %
Percentage:  90.40166927490871  %
Percentage:  90.45383411580595  %
Percentage:  90.50599895670318  %
Percentage:  90.55816379760041  %
Percentage:  90.61032863849765  %
Percentage:  90.66249347939488  %
Percentage:  90.71465832029212  %
Percentage:  90.76682316118936  %
Percentage:  90.8189880020866  %
Percentage:  90.87115284298383  %
Percentage:  90.92331768388107  %
Percentage:  90.9754825247783  %
Percentage:  91.02764736567553  %
Percentage:  91.07981220657277  %
Percentage:  91.13197704747  %
Percentage:  91.18414188836724  %
Percentage:  91.23630672926447  %
Percentage:  91.2884715701617  %
Percentage:  91.34063641105895  %
Percentage:  91.39280125195619  %
Percentage:  91.44496609285342  %
Percentage:  91.49713093375065  %
Percentage:  91.54929577464789  %
Percentage:  91.60146

In [43]:

W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, EMBEDDING_DIM]))
b1 = tf.Variable(tf.random_normal([1])) #bias
hidden_layer = tf.add(tf.matmul(x,W1), b1)


vectors = sess.run(nce_weights + nce_biases)
ONE_HOT_DIM = len(data)

# word embedding will be 2 dimension for 2d visualization
EMBEDDING_DIM = 2

# hidden layer: which represents word vector eventually
#W1 = tf.Variable(tf.random_normal([ONE_HOT_DIM, embedding_size]))
#b1 = tf.Variable(tf.random_normal([1])) #bias

# Now the hidden layer (W1 + b1) is actually the word look up table
#vectors = session.run(W1 + b1)
#print(vectors)

w2v_df = pd.DataFrame(to_save, columns = ['x1', 'x2'])
w2v_df['word'] = fobi
w2v_df = w2v_df[['word', 'x1', 'x2']]
w2v_df


import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Fixing random state for reproducibility
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

for words, x1, x2 in zip(w2v_df['word'], w2v_df['x1'], w2v_df['x2']):
    ax.scatter(x1,x2,marker='o',c='blue')
    ax.annotate(words, (x1,x2 ))
    
    
PADDING = 1.0
x_axis_min = np.amin(vectors, axis=0)[0] - PADDING
y_axis_min = np.amin(vectors, axis=0)[1] - PADDING
x_axis_max = np.amax(vectors, axis=0)[0] + PADDING
y_axis_max = np.amax(vectors, axis=0)[1] + PADDING
 
plt.xlim(x_axis_min,x_axis_max)
plt.ylim(y_axis_min,y_axis_max)
plt.rcParams["figure.figsize"] = (10,10)
plt.rcParams["figure.dpi"] = (2)
plt.rcParams['axes.unicode_minus'] = False

plt.show()


ValueError: Dimensions must be equal, but are 2 and 32 for 'add' (op: 'Add') with input shapes: [32,2], [32].