In [1]:
import pandas as pd
import numpy as np
domain_df = pd.read_csv("../../data/initial_data.csv")
expanded_df = pd.read_csv("../../data/expanded_data.csv")
df = pd.concat([domain_df,expanded_df])
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0,domain,domain_type
0,wisuolycossttqrj.com,dga
1,wi-wamss.org,benign
2,qcxfurnkbqidxxcl.biz,dga
3,192-168-1-1-admin.ru,benign
4,dblsiobnkjxomkmh.ru,dga


In [2]:
df['domain_type'] = pd.Categorical(df['domain_type'])

In [3]:
df['domain_type'] = df.domain_type.cat.codes
df.head()

Unnamed: 0,domain,domain_type
0,wisuolycossttqrj.com,1
1,wi-wamss.org,0
2,qcxfurnkbqidxxcl.biz,1
3,192-168-1-1-admin.ru,0
4,dblsiobnkjxomkmh.ru,1


In [4]:
import tldextract
def transform(val):
    return tldextract.extract(val).domain
df['domain']= df['domain'].apply(transform)
df.head()

Unnamed: 0,domain,domain_type
0,wisuolycossttqrj,1
1,wi-wamss,0
2,qcxfurnkbqidxxcl,1
3,192-168-1-1-admin,0
4,dblsiobnkjxomkmh,1


In [5]:
target = df.pop('domain_type')
dataset = df['domain']

In [6]:
from tensorflow.keras.preprocessing import sequence
valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(dataset)))}
max_features = len(valid_chars) + 1
maxlen = np.max([len(x) for x in dataset])

# Convert characters to int and pad
X = [[valid_chars[y] for y in x] for x in dataset]
X = sequence.pad_sequences(X, maxlen=maxlen)
y = target

In [15]:
valid_chars

{'y': 1,
 'g': 2,
 '9': 3,
 '6': 4,
 '-': 5,
 '5': 6,
 'r': 7,
 'l': 8,
 'v': 9,
 'i': 10,
 'p': 11,
 'f': 12,
 '4': 13,
 '1': 14,
 'n': 15,
 '2': 16,
 's': 17,
 'm': 18,
 'j': 19,
 't': 20,
 'z': 21,
 'o': 22,
 'q': 23,
 'a': 24,
 'u': 25,
 '3': 26,
 '8': 27,
 'c': 28,
 'b': 29,
 'w': 30,
 'd': 31,
 'e': 32,
 '0': 33,
 'x': 34,
 'h': 35,
 '7': 36,
 'k': 37}

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
import tensorflow as tf
trainset = tf.data.Dataset.from_tensor_slices((X_train, y_train.values)).shuffle(len(X_train)).batch(128)

In [12]:
def get_compiled_model():
    main_input = tf.keras.Input(shape=(50, ), dtype='int32', name='main_input')
    embedding = tf.keras.layers.Embedding(input_dim=128, output_dim=128,input_length=50)(main_input)
    bi_lstm = tf.keras.layers.Bidirectional(layer=tf.keras.layers.LSTM(64,return_sequences=False),merge_mode='concat')(embedding)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(bi_lstm)
    model = tf.keras.Model(inputs=main_input, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.Recall()])
    return model

earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=1)

model = get_compiled_model()
model.fit(trainset, epochs=15, callbacks=[earlystop_callback], validation_data=testset, validation_freq=1)

Train for 434 steps, validate for 186 steps
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


NotFoundError: Failed to create a directory: ../../data/tf_model\variables; No such file or directory

In [19]:
model.save('tf_model')

INFO:tensorflow:Assets written to: tf_model\assets


In [20]:
!tar czvf model.tar.gz tf_model

tar: tfmodel: Cannot stat: No such file or directory
tar: Exiting with failure status due to previous errors


In [11]:
testset = tf.data.Dataset.from_tensor_slices((X_test, y_test.values)).batch(128)

In [17]:
model.evaluate(testset)



[0.11977213353759819, 0.9623744, 0.9529618]

In [21]:
import pandas as pd
import numpy as np
test_df = pd.read_csv("../../data/feedback.csv")

In [22]:
test_df.columns

Index(['domain', ' submitted', ' correct'], dtype='object')

In [23]:
test_df[' correct'] = pd.Categorical(test_df[' correct'])
test_df[' correct'] = test_df[' correct'].cat.codes
test_df.head()

Unnamed: 0,domain,submitted,correct
0,cfsjdqwdhmwkiv,benign,1
1,uocoqgiusyeciouaimcauykqswsymo,benign,1
2,okdbnxoauhzrawyu,benign,1
3,m8sdetc0u81lgdcpshoxsvy,benign,1
4,qkdccn,benign,1


In [24]:
real_target = test_df.pop(' correct')
real_dataset = test_df['domain']

In [25]:
real_X = [[valid_chars[y] for y in x] for x in real_dataset]
real_X = sequence.pad_sequences(real_X, maxlen=maxlen)
real_y = real_target

In [26]:
realset = tf.data.Dataset.from_tensor_slices((real_X, real_y.values)).batch(128)

In [32]:
model.evaluate(realset)



[0.5734701454639435, 0.78902954, 0.7878788]

In [34]:
import sklearn
probs = model.predict(realset)
sklearn.metrics.confusion_matrix(real_y.values, np.argmax(probs,axis=1))

array([[  6,   0],
       [231,   0]], dtype=int64)