In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import tldextract
import math
from collections import Counter
from itertools import groupby

### Loading the data and concatenating additional sources

In [2]:
data = pd.read_csv('final_data.csv.gz', index_col=0)

  mask |= (ar1 == a)


In [3]:
additional_1 = pd.read_csv('additional_1.csv')
additional_1['domain'] = additional_1['domain'].apply(lambda x: tldextract.extract(x).domain)
additional_2 = pd.read_csv('additional_2.csv')
additional_2['domain'] = additional_2['domain'].apply(lambda x: tldextract.extract(x).domain)
additional_3 = pd.read_csv('additional_3.csv')
additional_3['domain'] = additional_3['domain'].apply(lambda x: tldextract.extract(x).domain)

In [4]:
data = pd.concat([data, additional_1, additional_2, additional_3])

### Dropping duplicates and NAs

In [5]:
data = data.drop_duplicates(subset = 'domain', keep = 'first').reset_index().drop(columns = ['index'])

In [6]:
data_copy = data.copy()

In [7]:
data_copy

Unnamed: 0,domain,label
0,cinb2x5flsfy2,dga
1,triplestar,benign
2,nas2,benign
3,jwtjnmsddx,dga
4,mersinuntitamalletsolso,dga
...,...,...
3908520,kugaquayiukkmkc,dga
3908521,sun-glo,benign
3908522,weewddtlrlv,dga
3908523,cmqmlane,dga


In [8]:
data.label.value_counts()

benign    1954331
dga       1951838
Name: label, dtype: int64

In [9]:
data = data.dropna()

### Creating a dictionary for lookup

In [10]:
# Dictionary for encoding domain names
dictionary = {'a': 5, 'b': 9, 'c': 17, 'd': 30, 'e': 22, 'f': 2, 'g': 35, 'h': 19, 
            'i': 12, 'j': 28, 'k': 20, 'l': 24, 'm': 10, 'n': 13, 'o': 7, 
            'p': 26, 'q': 4, 'r': 37, 's': 11, 't': 15, 
            'u': 16, 'v': 25, 'w': 6, 'x': 8, 'y': 1, 'z': 3, 
            '0': 36, '1': 23, '2': 31, '3': 33, '4': 27, '5': 29, 
            '6': 38, '7': 32, '8': 14, '9': 21, '-': 39, '.': 18, '_':34}

### Counting the number of capitals

In [11]:
data_copy.dropna(inplace=True)

In [12]:
def capital(z):
    num_digits = 0
    for char in z:
        if char.isupper():
            num_digits += 1
    return num_digits
data_copy['capital'] = data_copy['domain'].apply(lambda x: capital(x))

### Creating an encode_fqdn function to lookup from the dictionary

In [13]:
def encode_fqdn(domain):
    rvalue = list()
    for c in domain:
        rvalue.append(dictionary[c])
    for _ in range(len(rvalue), 64):
        rvalue.insert(0,0)
    return rvalue

### Lowering all the domains as we have created a new column for the number of capitals

In [14]:
data['domain'] = data['domain'].apply(lambda x: x.lower())

### Applying the encode_fqdn function and creating a series for every subset

In [15]:
data['domain'] = data['domain'].apply(lambda x: encode_fqdn(x))

In [16]:
data_1 = data.iloc[:500000,0].apply(lambda x: pd.Series(x))

In [17]:
data_2 = data.iloc[500000:1000000,0].apply(lambda x: pd.Series(x))

In [18]:
data_3 = data.iloc[1000000:1500000,0].apply(lambda x: pd.Series(x))

In [19]:
data_4 = data.iloc[1500000:2000000,0].apply(lambda x: pd.Series(x))

In [20]:
data_5 = data.iloc[2000000:2500000,0].apply(lambda x: pd.Series(x))

In [21]:
data_6 = data.iloc[2500000:3000000,0].apply(lambda x: pd.Series(x))

In [22]:
data_7 = data.iloc[3000000:,0].apply(lambda x: pd.Series(x))

In [23]:
data_concat = pd.concat([data_1, data_2, data_3, data_4, data_5, data_6, data_7], axis = 0)

In [24]:
data_concat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0,0,0,0,0,0,0,0,0,0,...,9,31,8,29,2,24,11,2,1,31
1,0,0,0,0,0,0,0,0,0,0,...,15,37,12,26,24,22,11,15,5,37
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,13,5,11,31
3,0,0,0,0,0,0,0,0,0,0,...,28,6,15,28,13,10,11,30,30,8
4,0,0,0,0,0,0,0,0,0,0,...,5,24,24,22,15,11,7,24,11,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3908520,0,0,0,0,0,0,0,0,0,0,...,16,5,1,12,16,20,20,10,20,17
3908521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,11,16,13,39,35,24,7
3908522,0,0,0,0,0,0,0,0,0,0,...,22,22,6,30,30,15,24,37,24,25
3908523,0,0,0,0,0,0,0,0,0,0,...,0,0,17,10,4,10,24,5,13,22


### Adding a new column which stores the length of the domains

In [26]:
data_concat['length'] = data_copy['domain'].apply(lambda x: len(x))

### Converting the labels into numeric values

In [27]:
data_concat['label'] = np.where(data.label == 'benign', 0, 1)

### Copying the capital column from data_copy to data_concat

In [28]:
data_concat['capital'] = data_copy['capital']

### Calculating the number of digits in each domain

In [29]:
def calc_digits(z):
    num_digit = 0
    digits = list('0123456789')
    for char in z:
        if char in digits:
            num_digit += 1
    return num_digit
data_concat['digits'] = data_copy['domain'].apply(lambda x: calc_digits(x))

### Calculating the maximum length of consective consonants

In [30]:
def consecutive_consonants(string):
    is_vowel = lambda char: char in "aAeEiIoOuU"
    best = 0
    listnames = ["".join(g) for v, g in groupby(string, key=is_vowel) if not v]
    for index in range(len(listnames)):
        if len(listnames[index]) > best:
            best = len(listnames[index])
    return best
data_concat['consonants_consec'] = data_copy['domain'].apply(lambda x: consecutive_consonants(x))

### Dividing the count of vowels by length

In [31]:
def calc_vowels(y):
    num_vowel = 0
    vowels = list('aeiou')
    for char in y:
        if char in vowels:
            num_vowel += 1
    return num_vowel
data_concat['vc'] = (data_copy['domain'].apply(lambda x: calc_vowels(x)))/data_concat['length']

### Calculating the entropy for each domain

In [32]:
def calc_entropy(s):
    p, lns = Counter(str(s)), float(len(str(s)))
    return -sum(count/lns * math.log(count/lns, 2) for count in p.values())
data_concat['entropy'] = data_copy['domain'].apply(lambda x: calc_entropy(x))

### Calculating number of unique characters for each domain

In [33]:
def unique_char(x):
    return len(''.join(set(x)))
data_concat['unique'] = data_copy['domain'].apply(lambda x: unique_char(x))

In [34]:
data_concat

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,length,label,capital,digits,consonants_consec,vc,entropy,unique
0,0,0,0,0,0,0,0,0,0,0,...,1,31,13,1,0,3,11,0.076923,3.392747,11
1,0,0,0,0,0,0,0,0,0,0,...,5,37,10,0,0,0,2,0.300000,2.921928,8
2,0,0,0,0,0,0,0,0,0,0,...,11,31,4,0,0,1,2,0.250000,2.000000,4
3,0,0,0,0,0,0,0,0,0,0,...,30,8,10,1,0,0,10,0.000000,2.921928,8
4,0,0,0,0,0,0,0,0,0,0,...,11,7,23,1,0,0,2,0.391304,3.381620,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3908520,0,0,0,0,0,0,0,0,0,0,...,20,17,15,1,0,0,5,0.400000,2.923231,9
3908521,0,0,0,0,0,0,0,0,0,0,...,24,7,7,0,0,0,4,0.285714,2.807355,7
3908522,0,0,0,0,0,0,0,0,0,0,...,24,25,11,1,0,0,8,0.181818,2.732159,7
3908523,0,0,0,0,0,0,0,0,0,0,...,13,22,8,1,0,0,5,0.250000,2.750000,7


### Splitting the data into train and test

In [35]:
seed = 1142
test_size = 0.20
X_train, X_test, y_train, y_test = train_test_split(data_concat.loc[:,data_concat.columns!='label'], data_concat['label'], test_size=test_size, random_state=seed)

### Fitting the XGBoost Model with hyperparameters

In [41]:
model = XGBClassifier(eta = 0.2, silent = 1, max_depth = 8, gamma = 0.2, subsample = 0.4, objective = 'binary:logistic', n_estimators = 1000)
model.fit(X_train, y_train)
print(model)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.2, gamma=0.2,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=1, subsample=0.4, verbosity=1)


### Evaluating the model on validation data

In [42]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 95.12%


### Evaluating model on test data (I)

In [43]:
testdata1 = pd.read_csv('testdata1.csv')

In [44]:
testdata1['domain'] = testdata1['domain'].apply(lambda x: tldextract.extract(x).domain)
testdata1['length'] = testdata1['domain'].apply(lambda x: len(x))
testdata1['capital'] = testdata1['domain'].apply(lambda x: capital(x))
testdata1['digits'] = testdata1['domain'].apply(lambda x: calc_digits(x))
testdata1['consonants_consec'] = testdata1['domain'].apply(lambda x: consecutive_consonants(x))
testdata1['vc'] = (testdata1['domain'].apply(lambda x: calc_vowels(x)))/testdata1['length']
testdata1['entropy'] = testdata1['domain'].apply(lambda x: calc_entropy(x))
testdata1['unique'] = testdata1['domain'].apply(lambda x: unique_char(x))
testdata1['domain'] = testdata1['domain'].apply(lambda x: encode_fqdn(x))

In [45]:
testdata1vec = testdata1.domain.apply(lambda x: pd.Series(x))
testdata1vec['label'] = np.where(testdata1.label == 'benign', 0, 1)
testdata1vec['length'] = testdata1['length']
testdata1vec['capital'] = testdata1['capital']
testdata1vec['digits'] = testdata1['digits']
testdata1vec['consonants_consec'] = testdata1['consonants_consec']
testdata1vec['vc'] = testdata1['vc']
testdata1vec['entropy'] = testdata1['entropy']
testdata1vec['unique'] = testdata1['unique']

In [46]:
testdata1vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62,63,label,length,capital,digits,consonants_consec,vc,entropy,unique
0,0,0,0,0,0,0,0,0,0,0,...,29,36,1,11,0,3,6,0.090909,3.459432,11
1,0,0,0,0,0,0,0,0,0,0,...,10,7,1,29,0,0,4,0.482759,3.417483,11
2,0,0,0,0,0,0,0,0,0,0,...,30,5,1,23,0,4,10,0.130435,4.055958,18
3,0,0,0,0,0,0,0,0,0,0,...,8,8,1,8,0,0,4,0.250000,2.500000,6
4,0,0,0,0,0,0,0,0,0,0,...,7,37,0,5,0,0,1,0.600000,1.370951,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,0,0,0,0,0,0,0,0,...,15,12,1,14,0,0,4,0.285714,3.521641,12
496,0,0,0,0,0,0,0,0,0,0,...,16,1,1,16,0,0,10,0.125000,3.375000,11
497,0,0,0,0,0,0,0,0,0,0,...,9,22,1,8,0,0,7,0.125000,3.000000,8
498,0,0,0,0,0,0,0,0,0,0,...,8,1,1,18,0,0,3,0.388889,2.974938,9


In [47]:
testdata1vec_x = testdata1vec.loc[:,testdata1vec.columns!='label']
testdata1vec_y = testdata1vec['label']

In [49]:
predicted = model.predict(testdata1vec_x)
predictions = [round(value) for value in predicted]
accuracy = accuracy_score(testdata1vec_y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 93.00%


### Evaluating model on test data (II)

In [50]:
testdata2 = pd.read_csv('testdata2.csv')

In [51]:
testdata2

Unnamed: 0,domain,label
0,www.myuyikkuwuumaoasuiwmqqwsa.com,dga
1,www.kezyxojyfenelo.com,dga
2,www.bezlakieru.com,benign
3,www.engineeringexchange.com,benign
4,www.goqahabemonohawav.com,dga
...,...,...
495,www.kugaquayiukkmkc.com,dga
496,www.sun-glo.com,benign
497,www.weewddtlrlv.com,dga
498,www.cmqmlane.com,dga


In [52]:
testdata2['domain'] = testdata2['domain'].apply(lambda x: tldextract.extract(x).domain)
testdata2['length'] = testdata2['domain'].apply(lambda x: len(x))
testdata2['capital'] = testdata2['domain'].apply(lambda x: capital(x))
testdata2['digits'] = testdata2['domain'].apply(lambda x: calc_digits(x))
testdata2['consonants_consec'] = testdata2['domain'].apply(lambda x: consecutive_consonants(x))
testdata2['vc'] = (testdata2['domain'].apply(lambda x: calc_vowels(x)))/testdata2['length']
testdata2['entropy'] = testdata2['domain'].apply(lambda x: calc_entropy(x))
testdata2['unique'] = testdata2['domain'].apply(lambda x: unique_char(x))
testdata2['domain'] = testdata2['domain'].apply(lambda x: encode_fqdn(x))

In [53]:
testdata2vec = testdata2.domain.apply(lambda x: pd.Series(x))
testdata2vec['label'] = np.where(testdata2.label == 'benign', 0, 1)
testdata2vec['length'] = testdata2['length']
testdata2vec['capital'] = testdata2['capital']
testdata2vec['digits'] = testdata2['digits']
testdata2vec['consonants_consec'] = testdata2['consonants_consec']
testdata2vec['vc'] = testdata2['vc']
testdata2vec['entropy'] = testdata2['entropy']
testdata2vec['unique'] = testdata2['unique']

In [54]:
testdata2vec_x = testdata2vec.loc[:, testdata2vec.columns!='label']
testdata2vec_y = testdata2vec['label']

In [56]:
predicted = model.predict(testdata2vec_x)
predictions = [round(value) for value in predicted]
accuracy = accuracy_score(testdata2vec_y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 93.60%


### Evaluating model on test data (III)

In [57]:
testdata3 = pd.read_csv('testdata3.csv')

In [58]:
testdata3

Unnamed: 0,domain,label
0,australianwoodenboatfestival,benign
1,qkdccn,dga
2,cewesckwaqc,dga
3,jknwld,dga
4,cyneugril,dga
...,...,...
150,lkaerooclqrq,dga
151,sousleground,benign
152,vtrpotwfkapasmgdi,dga
153,prmysangh,dga


In [59]:
testdata3['domain'] = testdata3['domain'].apply(lambda x: tldextract.extract(x).domain)
testdata3['length'] = testdata3['domain'].apply(lambda x: len(x))
testdata3['capital'] = testdata3['domain'].apply(lambda x: capital(x))
testdata3['digits'] = testdata3['domain'].apply(lambda x: calc_digits(x))
testdata3['consonants_consec'] = testdata3['domain'].apply(lambda x: consecutive_consonants(x))
testdata3['vc'] = (testdata3['domain'].apply(lambda x: calc_vowels(x)))/testdata3['length']
testdata3['entropy'] = testdata3['domain'].apply(lambda x: calc_entropy(x))
testdata3['unique'] = testdata3['domain'].apply(lambda x: unique_char(x))
testdata3['domain'] = testdata3['domain'].apply(lambda x: encode_fqdn(x))

In [60]:
testdata3vec = testdata3.domain.apply(lambda x: pd.Series(x))
testdata3vec['label'] = np.where(testdata3.label == 'benign', 0, 1)
testdata3vec['length'] = testdata3['length']
testdata3vec['capital'] = testdata3['capital']
testdata3vec['digits'] = testdata3['digits']
testdata3vec['consonants_consec'] = testdata3['consonants_consec']
testdata3vec['vc'] = testdata3['vc']
testdata3vec['entropy'] = testdata3['entropy']
testdata3vec['unique'] = testdata3['unique']

In [61]:
testdata3vec_x = testdata3vec.loc[:, testdata3vec.columns!='label']
testdata3vec_y = testdata3vec['label']

In [63]:
predicted = model.predict(testdata3vec_x)
predictions = [round(value) for value in predicted]
accuracy = accuracy_score(testdata3vec_y, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 74.84%


In [65]:
model_file_name = "xgboost-model-vvrao"
model._Booster.save_model(model_file_name)

In [66]:
!tar czvf modelvvrao.tar.gz $model_file_name

a xgboost-model-vvrao
