In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('pool-out.txt', sep='\t')

In [3]:
df.head()

Unnamed: 0,IP,pages
0,1.0.180.125,/components
1,1.1.49.175,"/effective,/community,/treatment,/professional..."
2,1.10.61.147,"/especially,/application,/especially,/associat..."
3,1.10.9.113,"/enterprise,/something"
4,1.103.207.46,"/authority,/discussion,/associates,/membership..."


In [4]:
data = df.pages.apply(lambda x: x.split(','))

In [5]:
MAX_LEN = max([len(x) for x in data])

In [7]:
MAX_LEN = 10

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
all_data = ','.join(df.pages)
le = LabelEncoder()
le.fit(all_data.split(','))

LabelEncoder()

In [10]:
len(le.classes_)

200

In [11]:
for i, item in enumerate(data):
    data[i] = le.transform(item)

In [12]:
data = data + 1

In [13]:
X = []
y = []
for item in data:
    if len(item) > 1:
        X.append(item[:-1])
        y.append(item[-1])

In [14]:
l_d = []
for i, item in enumerate(X):
    l_d.append(len(X[i]))
    tmp = list(X[i])
    while len(tmp) < MAX_LEN:
        tmp.append(np.int64(0))
    tmp = tmp[:MAX_LEN]
    X[i] = tmp

In [16]:
X = np.array(X)
y = np.array(y)
l_d = np.array(l_d)

In [17]:
np.random.seed(42)
ind = np.arange(len(X))
np.random.shuffle(ind)

X = X[ind]
y = y[ind]
l_d = l_d[ind]

train_data = X[:int(len(X)*0.8)]
train_y = y[:int(len(y)*0.8)]
train_l_d = l_d[:int(len(l_d)*0.8)]

test_data = X[int(len(X)*0.8):]
test_y = y[int(len(y)*0.8):]
test_l_d = l_d[int(len(l_d)*0.8):]

In [18]:
train_y = train_y.reshape(-1, 1)
test_y = test_y.reshape(-1, 1)

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
oh = OneHotEncoder(n_values='auto', sparse=False)
oh.fit(list(train_y))  

OneHotEncoder(categorical_features='all', dtype=<class 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=False)

In [21]:
train_y = oh.transform(train_y)
test_y = oh.transform(test_y)

# Model

In [22]:
import tensorflow as tf

In [23]:
def get_batch(X, y, l_d, batch_size, rnd=True):
    b_X = [] 
    b_y = []
    b_X_sizes = [] 
    
    if rnd:
        ind = np.arange(len(X))
        np.random.shuffle(ind)
        X = X[ind]
        y = y[ind]
        l_d = l_d[ind]
    
    i = 0
    for x, l, s in zip(X, y, l_d): 
        if len(b_X) == batch_size:
            b_X = [] 
            b_y = []
            b_X_sizes = [] 
            
        b_X.append(x)
        b_y.append(l)
        b_X_sizes.append(s)
        i += 1
        
        if i+1 > len(X):
            yield np.array(b_X), np.array(b_y), np.array(b_X_sizes)
        
        if len(b_X) < batch_size:
            continue
                
        yield np.array(b_X), np.array(b_y), np.array(b_X_sizes)

# LSTM

In [26]:
tf.reset_default_graph()

In [27]:
dim = 50
embedding_matrix = np.random.uniform(-np.sqrt(3/dim), np.sqrt(3/dim), (len(le.classes_)+1, dim))

In [28]:
pages = tf.placeholder(tf.int32, shape=[None, MAX_LEN]) # Input data
pages_lengths = tf.placeholder(tf.int32, shape=[None]) # Real langths for inpur data - for padding

dropout = tf.placeholder(dtype=tf.float32, shape=[]) # Placeholder to set dropout amount

W = tf.Variable(embedding_matrix, dtype=tf.float32)
embeddings = tf.nn.embedding_lookup(W, pages) # Use embeddings for pages - it allow to use additional info

embeddings = tf.nn.dropout(embeddings, dropout) # Add dropout

# Create LSTM layer with masking padds and set it to return only last out
lstm_cell = tf.nn.rnn_cell.LSTMCell(200, state_is_tuple=True)
_, (_, out) = tf.nn.dynamic_rnn(lstm_cell, embeddings, sequence_length=pages_lengths, dtype=tf.float32)

out = tf.nn.dropout(out, dropout) # Add dropout

# Addinionaly layer for deeper representation
out = tf.layers.dense(out, 100, activation=tf.nn.relu, kernel_initializer=tf.keras.initializers.he_normal())
out = tf.nn.dropout(out, dropout) # Add dropout

# Output layer with number of units equal to number of classes
# Use no activation because of softmax_cross_entropy_with_logits
out = tf.layers.dense(out, len(le.classes_)) 
ans = tf.nn.softmax(out) # Add softmax activation for used output of NN

labels = tf.placeholder(tf.int32, shape=[None, len(le.classes_)]) # Real next pages for train
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=out))

# Compute accuracy
correct_predictions = tf.equal(tf.argmax(labels, 1), tf.argmax(ans, 1))
accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'))

optimizer = tf.train.AdamOptimizer(0.001)
gvs = optimizer.compute_gradients(loss)
capped_gvs = [(tf.clip_by_norm(grad, 1.0), var) for grad, var in gvs] # Because our NN is Recurrent
train_op = optimizer.apply_gradients(capped_gvs)

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See tf.nn.softmax_cross_entropy_with_logits_v2.

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [29]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

In [30]:
sess.run(tf.global_variables_initializer())

In [31]:
batch_size = 128
for ep in range(200):
    all_loss = []
    all_ac = []
    all_len = len(train_data)
    i = 0
    for b_data, b_labels, b_data_sizes in get_batch(train_data, train_y, train_l_d, batch_size):
        _, batch_loss, batch_ac = sess.run([train_op, loss, accuracy], feed_dict={pages:b_data, labels:b_labels,
                                                              pages_lengths:b_data_sizes,
                                                              dropout:0.5})
        all_loss.append(batch_loss)
        all_ac.append(batch_ac)
        i += 1
        print('\r%d/%d' % (i*batch_size, all_len), end='')
        
        
    ls, ac = sess.run([loss, accuracy], feed_dict={labels:test_y, pages:test_data, pages_lengths:test_l_d, dropout:1.0})      
    
    print(' Epoch: %d, loss: %f, test_loss: %f' % (ep+1, np.mean(all_loss), ls), end='')
    print(' acc: %f, test_acc: %f' % (np.mean(all_ac), ac))

20864/20858 Epoch: 1, loss: 5.185360, test_loss: 5.111279 acc: 0.010746, test_acc: 0.011122
20864/20858 Epoch: 2, loss: 5.105743, test_loss: 5.101874 acc: 0.009878, test_acc: 0.012272
20864/20858 Epoch: 3, loss: 5.095975, test_loss: 5.100788 acc: 0.011266, test_acc: 0.011505
20864/20858 Epoch: 4, loss: 5.091166, test_loss: 5.099844 acc: 0.011798, test_acc: 0.010930
20864/20858 Epoch: 5, loss: 5.083949, test_loss: 5.098670 acc: 0.011611, test_acc: 0.009779
20864/20858 Epoch: 6, loss: 5.081219, test_loss: 5.099067 acc: 0.012272, test_acc: 0.010355
20864/20858 Epoch: 7, loss: 5.077614, test_loss: 5.100949 acc: 0.013518, test_acc: 0.010930
20864/20858 Epoch: 8, loss: 5.075349, test_loss: 5.102275 acc: 0.012229, test_acc: 0.009779
20864/20858 Epoch: 9, loss: 5.069973, test_loss: 5.104515 acc: 0.012510, test_acc: 0.008821
20864/20858 Epoch: 10, loss: 5.062792, test_loss: 5.105771 acc: 0.014338, test_acc: 0.011314
20864/20858 Epoch: 11, loss: 5.059787, test_loss: 5.107564 acc: 0.013382, test_

20864/20858 Epoch: 90, loss: 4.779900, test_loss: 5.267778 acc: 0.034996, test_acc: 0.010930
20864/20858 Epoch: 91, loss: 4.786793, test_loss: 5.263425 acc: 0.037061, test_acc: 0.009588
20864/20858 Epoch: 92, loss: 4.776484, test_loss: 5.270391 acc: 0.037550, test_acc: 0.009971
20864/20858 Epoch: 93, loss: 4.770185, test_loss: 5.268651 acc: 0.037732, test_acc: 0.009012
20864/20858 Epoch: 94, loss: 4.777714, test_loss: 5.270579 acc: 0.034413, test_acc: 0.011314
20864/20858 Epoch: 95, loss: 4.768670, test_loss: 5.268251 acc: 0.038842, test_acc: 0.009204
20864/20858 Epoch: 96, loss: 4.780897, test_loss: 5.266306 acc: 0.038542, test_acc: 0.010163
20864/20858 Epoch: 97, loss: 4.774036, test_loss: 5.269674 acc: 0.037493, test_acc: 0.009204
20864/20858 Epoch: 98, loss: 4.755531, test_loss: 5.278616 acc: 0.038116, test_acc: 0.009779
20864/20858 Epoch: 99, loss: 4.760998, test_loss: 5.274628 acc: 0.040284, test_acc: 0.009779
20864/20858 Epoch: 100, loss: 4.759270, test_loss: 5.279488 acc: 0.038

20864/20858 Epoch: 178, loss: 4.599121, test_loss: 5.416360 acc: 0.056389, test_acc: 0.008821
20864/20858 Epoch: 179, loss: 4.584983, test_loss: 5.431433 acc: 0.057625, test_acc: 0.011314
20864/20858 Epoch: 180, loss: 4.595981, test_loss: 5.433715 acc: 0.058589, test_acc: 0.009971
20864/20858 Epoch: 181, loss: 4.578475, test_loss: 5.442799 acc: 0.057760, test_acc: 0.008437
20864/20858 Epoch: 182, loss: 4.582582, test_loss: 5.430377 acc: 0.057043, test_acc: 0.010355
20864/20858 Epoch: 183, loss: 4.587552, test_loss: 5.426773 acc: 0.055222, test_acc: 0.009779
20864/20858 Epoch: 184, loss: 4.590492, test_loss: 5.416551 acc: 0.054793, test_acc: 0.008821
20864/20858 Epoch: 185, loss: 4.576300, test_loss: 5.435025 acc: 0.057676, test_acc: 0.009204
20864/20858 Epoch: 186, loss: 4.580768, test_loss: 5.436173 acc: 0.055080, test_acc: 0.009396
20864/20858 Epoch: 187, loss: 4.584419, test_loss: 5.446425 acc: 0.058874, test_acc: 0.008821
20864/20858 Epoch: 188, loss: 4.569860, test_loss: 5.440412 

# CNN

In [32]:
tf.reset_default_graph()

In [33]:
dim = 50
embedding_matrix = np.random.uniform(-np.sqrt(3/dim), np.sqrt(3/dim), (len(le.classes_)+1, dim))

In [34]:
pages = tf.placeholder(tf.int32, shape=[None, MAX_LEN]) # Input data

dropout = tf.placeholder(dtype=tf.float32, shape=[]) # Placeholder to set dropout amount

W = tf.Variable(embedding_matrix, dtype=tf.float32)
embeddings = tf.nn.embedding_lookup(W, pages) # Use embeddings for pages - it allow to use additional info

embeddings = tf.nn.dropout(embeddings, dropout) # Add dropout
embeddings = tf.reshape(embeddings, [-1, 10, 50, 1])

# CNN layers
cnn = tf.layers.Conv2D(512, kernel_size=(3, 50), strides=1, padding='valid', activation=tf.nn.tanh, 
                       kernel_initializer=tf.keras.initializers.glorot_normal())(embeddings)
cnn = tf.layers.batch_normalization(cnn) 
#cnn = tf.nn.dropout(cnn, dropout)
cnn = tf.reshape(cnn, [-1, 8, 512])

cnn = tf.layers.Conv1D(512, kernel_size=3, strides=1, padding='valid', activation=tf.nn.relu, 
                             kernel_initializer=tf.keras.initializers.he_normal())(cnn)
cnn = tf.layers.batch_normalization(cnn) 
#cnn = tf.nn.dropout(cnn, dropout)
cnn = tf.layers.Conv1D(256, kernel_size=3, strides=1, padding='valid', activation=tf.nn.relu, 
                             kernel_initializer=tf.keras.initializers.he_normal())(cnn)
cnn = tf.layers.batch_normalization(cnn) 
#cnn = tf.nn.dropout(cnn, dropout)
cnn = tf.layers.Conv1D(128, kernel_size=3, strides=1, padding='valid', activation=tf.nn.relu, 
                             kernel_initializer=tf.keras.initializers.he_normal())(cnn)
cnn = tf.layers.batch_normalization(cnn) 

cnn = tf.layers.MaxPooling1D(pool_size=2, strides=2)(cnn)

cnn = tf.layers.Flatten()(cnn)

out = tf.layers.dense(cnn, 64, activation=tf.nn.relu, kernel_initializer=tf.keras.initializers.he_normal())
#out = tf.nn.dropout(out, dropout) # Add dropout
#out = tf.layers.dense(out, 128, activation=tf.nn.relu, kernel_initializer=tf.keras.initializers.he_normal())
#out = tf.nn.dropout(out, dropout) # Add dropout

# Output layer with number of units equal to number of classes
# Use no activation because of softmax_cross_entropy_with_logits
out = tf.layers.dense(out, len(le.classes_)) 
ans = tf.nn.softmax(out) # Add softmax activation for use output of NN

labels = tf.placeholder(tf.int32, shape=[None, len(le.classes_)]) # Real next pages for train
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=out))

# Compute accuracy
correct_predictions = tf.equal(tf.argmax(labels, 1), tf.argmax(ans, 1))
accuracy = tf.reduce_mean(tf.cast(correct_predictions, 'float'))

optimizer = tf.train.AdamOptimizer(0.001)
train_op = optimizer.minimize(loss)

In [35]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)

In [36]:
sess.run(tf.global_variables_initializer())

In [37]:
batch_size = 128
for ep in range(200):
    all_loss = []
    all_ac = []
    all_len = len(train_data)
    i = 0
    for b_data, b_labels, b_data_sizes in get_batch(train_data, train_y, train_l_d, batch_size):
        _, batch_loss, batch_ac = sess.run([train_op, loss, accuracy], feed_dict={pages:b_data, labels:b_labels,
                                                                                  dropout:0.5})
        all_loss.append(batch_loss)
        all_ac.append(batch_ac)
        i += 1
        print('\r%d/%d' % (i*batch_size, all_len), end='')
        
        
    ls, ac = sess.run([loss, accuracy], feed_dict={pages:test_data, labels:test_y, dropout:1.0})      
    
    print(' Epoch: %d, loss: %f, test_loss: %f' % (ep+1, np.mean(all_loss), ls), end='')
    print(' acc: %f, test_acc: %f' % (np.mean(all_ac), ac))

20864/20858 Epoch: 1, loss: 5.148835, test_loss: 5.102962 acc: 0.009682, test_acc: 0.011122
20864/20858 Epoch: 2, loss: 5.092948, test_loss: 5.103763 acc: 0.010880, test_acc: 0.011505
20864/20858 Epoch: 3, loss: 5.090062, test_loss: 5.101890 acc: 0.010741, test_acc: 0.009204
20864/20858 Epoch: 4, loss: 5.087626, test_loss: 5.102482 acc: 0.011793, test_acc: 0.011314
20864/20858 Epoch: 5, loss: 5.085680, test_loss: 5.101879 acc: 0.011074, test_acc: 0.009396
20864/20858 Epoch: 6, loss: 5.083214, test_loss: 5.108016 acc: 0.012517, test_acc: 0.011314
20864/20858 Epoch: 7, loss: 5.078872, test_loss: 5.107662 acc: 0.011985, test_acc: 0.011122
20864/20858 Epoch: 8, loss: 5.076642, test_loss: 5.112923 acc: 0.012704, test_acc: 0.011505
20864/20858 Epoch: 9, loss: 5.072339, test_loss: 5.107238 acc: 0.012462, test_acc: 0.012464
20864/20858 Epoch: 10, loss: 5.066676, test_loss: 5.109479 acc: 0.013950, test_acc: 0.013039
20864/20858 Epoch: 11, loss: 5.058431, test_loss: 5.116278 acc: 0.015153, test_

20864/20858 Epoch: 90, loss: 2.044392, test_loss: 10.065938 acc: 0.516823, test_acc: 0.008245
20864/20858 Epoch: 91, loss: 2.007464, test_loss: 10.457519 acc: 0.522824, test_acc: 0.007862
20864/20858 Epoch: 92, loss: 1.987570, test_loss: 10.571976 acc: 0.527658, test_acc: 0.009396
20864/20858 Epoch: 93, loss: 1.929759, test_loss: 10.648362 acc: 0.542355, test_acc: 0.006328
20864/20858 Epoch: 94, loss: 1.918874, test_loss: 10.930301 acc: 0.542626, test_acc: 0.007478
20864/20858 Epoch: 95, loss: 1.884022, test_loss: 10.830804 acc: 0.549132, test_acc: 0.007478
20864/20858 Epoch: 96, loss: 1.834244, test_loss: 11.028135 acc: 0.563576, test_acc: 0.008437
20864/20858 Epoch: 97, loss: 1.814580, test_loss: 11.058257 acc: 0.570243, test_acc: 0.010547
20864/20858 Epoch: 98, loss: 1.791097, test_loss: 11.183058 acc: 0.574583, test_acc: 0.006136
20864/20858 Epoch: 99, loss: 1.775596, test_loss: 11.428587 acc: 0.579529, test_acc: 0.007862
20864/20858 Epoch: 100, loss: 1.737288, test_loss: 11.267939

20864/20858 Epoch: 177, loss: 0.871556, test_loss: 17.288116 acc: 0.800696, test_acc: 0.007478
20864/20858 Epoch: 178, loss: 0.845267, test_loss: 17.602947 acc: 0.804087, test_acc: 0.007478
20864/20858 Epoch: 179, loss: 0.853503, test_loss: 17.592316 acc: 0.802225, test_acc: 0.007095
20864/20858 Epoch: 180, loss: 0.851883, test_loss: 17.587978 acc: 0.802040, test_acc: 0.004986
20864/20858 Epoch: 181, loss: 0.846989, test_loss: 17.785667 acc: 0.803214, test_acc: 0.006136
20864/20858 Epoch: 182, loss: 0.857224, test_loss: 17.377731 acc: 0.802417, test_acc: 0.006520
20864/20858 Epoch: 183, loss: 0.849758, test_loss: 17.310322 acc: 0.805788, test_acc: 0.007287
20864/20858 Epoch: 184, loss: 0.834519, test_loss: 17.532349 acc: 0.809714, test_acc: 0.005561
20864/20858 Epoch: 185, loss: 0.831803, test_loss: 17.934048 acc: 0.807502, test_acc: 0.006711
20864/20858 Epoch: 186, loss: 0.835229, test_loss: 18.109716 acc: 0.806551, test_acc: 0.007862
20864/20858 Epoch: 187, loss: 0.824462, test_loss:

# RandomForest

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import classification_report

In [None]:
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(train_data, train_y)

In [69]:
res = rf.predict(train_data)
acc(train_y, res)

0.801375

In [70]:
res = rf.predict(test_data)
acc(test_y, res)

0.7115

# Boosting

In [71]:
from sklearn.ensemble import GradientBoostingClassifier

In [32]:
gb = RandomForestClassifier(n_estimators=200, random_state=42)
gb.fit(train_data, train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [33]:
res = gb.predict(train_data)
acc(train_y, res)


array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [9]:
print(classification_report(res, train_y, target_names=list(le.classes_)))

NameError: name 'res' is not defined

In [74]:
res = gb.predict(test_data)
acc(test_y, res)


0.7115

In [93]:
print(classification_report(res, test_y, target_names=list(le.classes_)))

                                                                                                  precision    recall  f1-score   support

                                                                                      /ShopSmall       0.00      0.00      0.00         0
                                                                             /account-management       0.00      0.00      0.00         0
                                                                         /account-management/PIN       0.03      0.25      0.05        12
                                                            /account-management/card-replacement       0.00      0.00      0.00         0
                                                           /account-management/card-replacement/       0.00      0.00      0.00         0
                                                        /account-management/check-spending-power       0.00      0.00      0.00         0
                                 

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# HMM

In [308]:
from hmmlearn import hmm

In [322]:
model = hmm.GaussianHMM(len(le.classes_))
model.fit(train_data) 



KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score as acc

In [315]:
pr = model.predict(train_data)
acc(train_y, pr)

  n_samples, n_components, np.log(self.startprob_),


In [None]:
pr = model.predict(test_data)
acc(test_y, pr)