In [1]:
import pandas as pd
import numpy as np
domain_df = pd.read_csv("../../data/initial_data.csv")
expanded_df = pd.read_csv("../../data/new_dateset.csv")
df = pd.concat([domain_df,expanded_df])
df.drop_duplicates(inplace=True)
df.head()

Unnamed: 0,domain,domain_type
0,wisuolycossttqrj.com,dga
1,wi-wamss.org,benign
2,qcxfurnkbqidxxcl.biz,dga
3,192-168-1-1-admin.ru,benign
4,dblsiobnkjxomkmh.ru,dga


In [2]:
df['domain_type'] = pd.Categorical(df['domain_type'])

In [3]:
df['domain_type'] = df.domain_type.cat.codes
df.head()

Unnamed: 0,domain,domain_type
0,wisuolycossttqrj.com,1
1,wi-wamss.org,0
2,qcxfurnkbqidxxcl.biz,1
3,192-168-1-1-admin.ru,0
4,dblsiobnkjxomkmh.ru,1


In [4]:
import tldextract
def transform(val):
    return tldextract.extract(val).domain
df['domain']= df['domain'].apply(transform)
df.head()

Unnamed: 0,domain,domain_type
0,wisuolycossttqrj,1
1,wi-wamss,0
2,qcxfurnkbqidxxcl,1
3,192-168-1-1-admin,0
4,dblsiobnkjxomkmh,1


In [5]:
target = df.pop('domain_type')
dataset = df['domain']

In [6]:
from tensorflow.keras.preprocessing import sequence
# valid_chars = {x:idx+1 for idx, x in enumerate(set(''.join(dataset)))}
# max_features = len(valid_chars) + 1
# maxlen = np.max([len(x) for x in dataset])
valid_chars = {'y': 1, 'g': 2, '9': 3, '6': 4, '-': 5, '5': 6, 'r': 7, 'l': 8, 'v': 9, 'i': 10, 'p': 11, 'f': 12, '4': 13, '1': 14, 'n': 15, '2': 16,
 's': 17, 'm': 18, 'j': 19, 't': 20, 'z': 21, 'o': 22, 'q': 23, 'a': 24, 'u': 25, '3': 26, '8': 27, 'c': 28, 'b': 29, 'w': 30, 'd': 31, 'e': 32,
 '0': 33, 'x': 34, 'h': 35, '7': 36, 'k': 37}
maxlen = 50

# Convert characters to int and pad
X = [[valid_chars[y] for y in x] for x in dataset]
X = sequence.pad_sequences(X, maxlen=maxlen)
y = target

In [15]:
valid_chars

{'y': 1,
 'g': 2,
 '9': 3,
 '6': 4,
 '-': 5,
 '5': 6,
 'r': 7,
 'l': 8,
 'v': 9,
 'i': 10,
 'p': 11,
 'f': 12,
 '4': 13,
 '1': 14,
 'n': 15,
 '2': 16,
 's': 17,
 'm': 18,
 'j': 19,
 't': 20,
 'z': 21,
 'o': 22,
 'q': 23,
 'a': 24,
 'u': 25,
 '3': 26,
 '8': 27,
 'c': 28,
 'b': 29,
 'w': 30,
 'd': 31,
 'e': 32,
 '0': 33,
 'x': 34,
 'h': 35,
 '7': 36,
 'k': 37}

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
X_train.shape

(117985, 50)

In [9]:
import tensorflow as tf
trainset = tf.data.Dataset.from_tensor_slices((X_train, y_train.values)).shuffle(len(X_train)).batch(128)

In [11]:
def get_compiled_model():
    main_input = tf.keras.Input(shape=(50, ), dtype='int32', name='main_input')
    embedding = tf.keras.layers.Embedding(input_dim=128, output_dim=128,input_length=50)(main_input)
    bi_lstm = tf.keras.layers.Bidirectional(layer=tf.keras.layers.LSTM(64,return_sequences=False),merge_mode='concat')(embedding)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(bi_lstm)
    model = tf.keras.Model(inputs=main_input, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.Recall()])
    return model

earlystop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=1)
# callback_parameter = callbacks=[earlystop_callback], validation_data=testset, validation_freq=1
model = get_compiled_model()
model.fit(trainset, epochs=25, callbacks=[earlystop_callback], validation_data=testset, validation_freq=1)

Train for 922 steps, validate for 396 steps
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25


<tensorflow.python.keras.callbacks.History at 0x62120997b8>

In [20]:
model.save('tf_model__')

INFO:tensorflow:Assets written to: tf_model__\assets


In [20]:
!tar czvf model_.tar.gz tf_model__

tf_model_/
tf_model_/1/
tf_model_/1/assets/
tf_model_/1/saved_model.pb
tf_model_/1/variables/
tf_model_/1/variables/variables.data-00000-of-00001
tf_model_/1/variables/variables.index


In [10]:
testset = tf.data.Dataset.from_tensor_slices((X_test, y_test.values)).batch(128)

In [32]:
model.evaluate(testset)



[0.12247108168252674, 0.9622869, 0.96183026]

In [12]:
import pandas as pd
import numpy as np
test_df = pd.read_csv("../../data/feedback.csv")

In [13]:
test_df.columns

Index(['domain', ' submitted', ' correct'], dtype='object')

In [14]:
test_df[' correct'] = pd.Categorical(test_df[' correct'])
test_df[' correct'] = test_df[' correct'].cat.codes
test_df.head()

Unnamed: 0,domain,submitted,correct
0,cfsjdqwdhmwkiv,benign,1
1,uocoqgiusyeciouaimcauykqswsymo,benign,1
2,okdbnxoauhzrawyu,benign,1
3,m8sdetc0u81lgdcpshoxsvy,benign,1
4,qkdccn,benign,1


In [15]:
real_target = test_df.pop(' correct')
real_dataset = test_df['domain']

In [16]:
real_X = [[valid_chars[y] for y in x] for x in real_dataset]
real_X = sequence.pad_sequences(real_X, maxlen=maxlen)
real_y = real_target

In [17]:
realset = tf.data.Dataset.from_tensor_slices((real_X, real_y.values)).batch(128)

In [18]:
model.evaluate(realset)



[0.5535051822662354, 0.83966243, 0.83549786]

In [34]:
import sklearn
probs = model.predict(realset)
sklearn.metrics.confusion_matrix(real_y.values, np.argmax(probs,axis=1))

array([[  6,   0],
       [231,   0]], dtype=int64)