-
Notifications
You must be signed in to change notification settings - Fork 301
/
dga_model_gen.py
335 lines (256 loc) · 13.6 KB
/
dga_model_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
''' Build models to detect Algorithmically Generated Domain Names (DGA).
We're trying to classify domains as being 'legit' or having a high probability
of being generated by a DGA (Dynamic Generation Algorithm). We have 'legit' in
quotes as we're using the domains in Alexa as the 'legit' set.
'''
import os, sys
import traceback
import json
import optparse
import pickle
import collections
import sklearn
import sklearn.feature_extraction
import sklearn.ensemble
import sklearn.metrics
import pandas as pd
import numpy as np
import tldextract
import math
# Version printing is always a good idea
# NOTE(review): Python 2 print-statement syntax -- this module will not run
# under Python 3 as-is.
print 'Scikit Learn version: %s' % sklearn.__version__
print 'Pandas version: %s' % pd.__version__
print 'TLDExtract version: %s' % tldextract.__version__
# Version 0.12.0 of Pandas has a DeprecationWarning about Height blah that I'm ignoring
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Okay for this model we need the 2LD and nothing else
def domain_extract(uri):
    '''Return the registered (second-level) domain of uri.

    Falls back to None when tldextract finds no recognized public suffix
    (e.g. bare hostnames or garbage input), so callers can dropna() them.
    '''
    parts = tldextract.extract(uri)
    return parts.domain if parts.suffix else None
# Entropy calc (this must match model_eval)
def entropy(s):
    '''Return the Shannon entropy of string s in bits per character.'''
    total = float(len(s))
    char_freqs = collections.Counter(s)
    result = 0.0
    for occurrences in char_freqs.values():
        prob = occurrences / total
        result -= prob * math.log(prob, 2)
    return result
def show_cm(cm, labels):
    '''Print per-cell percentages and raw counts of a confusion matrix.

    cm: square numpy integer array of counts, rows = true class, cols = predicted.
    labels: class names in the same order as the rows/columns of cm.
    '''
    # Normalize each row by its total so a cell reads as "percent of true class
    # i predicted as class j". keepdims=True keeps the row sums as a column so
    # plain broadcasting works -- replaces the deprecated np.matrix contortion.
    percent = (cm * 100.0) / cm.sum(axis=1, keepdims=True)
    # Single-argument parenthesized prints work identically in Python 2 and 3.
    print('Confusion Matrix Stats')
    for i, label_i in enumerate(labels):
        for j, label_j in enumerate(labels):
            print("%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum()))
def save_model_to_disk(name, model, model_dir='models'):
    '''Serialize a model with pickle and write it to <model_dir>/<name>.model.

    name: base filename (no extension) for the stored model.
    model: any picklable object.
    model_dir: destination directory -- assumed to already exist.
    '''
    # Serialize first so we can report the on-disk size before writing
    serialized_model = pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL)
    # Model directory + model name
    model_path = os.path.join(model_dir, name + '.model')
    print('Storing Serialized Model to Disk (%s:%.2fMeg)' % (name, len(serialized_model)/1024.0/1024.0))
    # 'with' guarantees the handle is flushed and closed even if write() raises
    # (the original leaked the file object returned by open()).
    with open(model_path, 'wb') as model_file:
        model_file.write(serialized_model)
def load_model_from_disk(name, model_dir='models'):
    '''Load a pickled model from <model_dir>/<name>.model.

    Returns the unpickled object, or None when the file is missing, unreadable,
    or the pickle is corrupt -- callers rely on the None return, so failures
    stay best-effort rather than raising.
    '''
    # Model directory is relative to this file
    model_path = os.path.join(model_dir, name + '.model')
    # Put a try/except around the model load in case it fails.
    try:
        # 'with' closes the handle even if unpickling raises
        # (the original leaked the file object returned by open()).
        with open(model_path, 'rb') as model_file:
            model = pickle.loads(model_file.read())
    # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit propagate.
    except Exception:
        # Message fixed: model_path is a full file path, not a directory.
        print('Could not load model: %s from %s!' % (name, model_path))
        return None
    return model
def main():
    ''' Main method, takes care of loading data, running it through the various analyses
    and reporting the results
    '''
    # NOTE(review): this function is Python 2 (print statements, "except Exception, error")
    # and targets older pandas/scikit-learn releases; review notes below flag the
    # specific calls that no longer exist in modern library versions.

    # Handle command-line arguments
    parser = optparse.OptionParser()
    parser.add_option('--alexa-file', default='data/alexa_100k.csv', help='Alexa file to pull from. Default: %default')
    (options, arguments) = parser.parse_args()
    print options, arguments

    try: # Pokemon exception handling
        # This is the Alexa 1M domain list.
        print 'Loading alexa dataframe...'
        alexa_dataframe = pd.read_csv(options.alexa_file, names=['rank','uri'], header=None, encoding='utf-8')
        print alexa_dataframe.info()
        print alexa_dataframe.head()

        # Compute the 2LD of the domain given by Alexa; rank/uri are dropped
        # because only the 2LD string is used to derive features.
        alexa_dataframe['domain'] = [ domain_extract(uri) for uri in alexa_dataframe['uri']]
        del alexa_dataframe['rank']
        del alexa_dataframe['uri']
        alexa_dataframe = alexa_dataframe.dropna()
        alexa_dataframe = alexa_dataframe.drop_duplicates()
        print alexa_dataframe.head()

        # Set the class label for the 'legit' exemplar set
        alexa_dataframe['class'] = 'legit'

        # Shuffle the data (important for training/testing)
        alexa_dataframe = alexa_dataframe.reindex(np.random.permutation(alexa_dataframe.index))
        alexa_total = alexa_dataframe.shape[0]
        print 'Total Alexa domains %d' % alexa_total

        # Read in the DGA domains (one raw domain per line)
        dga_dataframe = pd.read_csv('data/dga_domains.txt', names=['raw_domain'], header=None, encoding='utf-8')

        # We noticed that the blacklist values just differ by capitalization or
        # .com/.org/.info, so normalize to the lowercased part before the first dot.
        dga_dataframe['domain'] = dga_dataframe.applymap(lambda x: x.split('.')[0].strip().lower())
        del dga_dataframe['raw_domain']

        # It's possible we have NaNs from blank lines or whatever
        dga_dataframe = dga_dataframe.dropna()
        dga_dataframe = dga_dataframe.drop_duplicates()
        dga_total = dga_dataframe.shape[0]
        print 'Total DGA domains %d' % dga_total

        # Set the class label for the DGA set
        dga_dataframe['class'] = 'dga'
        print 'Number of DGA domains: %d' % dga_dataframe.shape[0]
        print dga_dataframe.head()

        # Concatenate the domains in a big pile!
        all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)

        # Add a length field for the domain
        all_domains['length'] = [len(x) for x in all_domains['domain']]

        # Okay since we're trying to detect dynamically generated domains and short
        # domains (length <=6) are crazy random even for 'legit' domains we're going
        # to punt on short domains (perhaps just white/black list for short domains?)
        all_domains = all_domains[all_domains['length'] > 6]

        # Add an entropy field for the domain (module-level entropy(), bits/char)
        all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]
        print all_domains.head()

        # Now we compute NGrams for every Alexa domain and see if we can use the
        # NGrams to help us better differentiate and mark DGA domains...
        # Scikit learn has a nice NGram generator that can generate either char NGrams or word NGrams (we're using char).
        # Parameters:
        #  - ngram_range=(3,5)  # Give me all ngrams of length 3, 4, and 5
        #  - min_df=1e-4        # Minimum document frequency. At 1e-4 we're saying give us NGrams that
        #                       # happen in at least .1% of the domains (so for 100k... at least 100 domains)
        alexa_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-4, max_df=1.0)

        # I'm SURE there's a better way to store all the counts but not sure...
        # At least the min_df parameter has already done some thresholding.
        # log10 of the per-ngram column sums gives a "commonness" weight vector.
        counts_matrix = alexa_vc.fit_transform(alexa_dataframe['domain'])
        alexa_counts = np.log10(counts_matrix.sum(axis=0).getA1())
        # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
        # (replaced by get_feature_names_out()) -- this code targets older sklearn.
        ngrams_list = alexa_vc.get_feature_names()

        # For fun sort it and show it
        import operator
        _sorted_ngrams = sorted(zip(ngrams_list, alexa_counts), key=operator.itemgetter(1), reverse=True)
        print 'Alexa NGrams: %d' % len(_sorted_ngrams)
        for ngram, count in _sorted_ngrams[:10]:
            print ngram, count

        # We're also going to throw in a bunch of dictionary words
        # NOTE(review): np.str was removed in NumPy 1.24; modern code would pass str.
        word_dataframe = pd.read_csv('data/words.txt', names=['word'], header=None, dtype={'word': np.str}, encoding='utf-8')

        # Cleanup words from dictionary: alphabetic only, lowercased, de-duplicated
        word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]
        word_dataframe = word_dataframe.applymap(lambda x: str(x).strip().lower())
        word_dataframe = word_dataframe.dropna()
        word_dataframe = word_dataframe.drop_duplicates()
        print word_dataframe.head(10)

        # Now compute NGrams on the dictionary words
        # Same logic as above...
        dict_vc = sklearn.feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
        counts_matrix = dict_vc.fit_transform(word_dataframe['word'])
        dict_counts = np.log10(counts_matrix.sum(axis=0).getA1())
        ngrams_list = dict_vc.get_feature_names()

        # For fun sort it and show it
        import operator
        _sorted_ngrams = sorted(zip(ngrams_list, dict_counts), key=operator.itemgetter(1), reverse=True)
        print 'Word NGrams: %d' % len(_sorted_ngrams)
        for ngram, count in _sorted_ngrams[:10]:
            print ngram, count

        # We use the transform method of the CountVectorizer to form a vector
        # of ngrams contained in the domain, that vector is then multiplied
        # by the counts vector (which is a column sum of the count matrix).
        def ngram_count(domain):
            '''Print the Alexa-ngram and dictionary-ngram match scores for one domain.'''
            alexa_match = alexa_counts * alexa_vc.transform([domain]).T # Woot vector multiply and transpose Woo Hoo!
            dict_match = dict_counts * dict_vc.transform([domain]).T
            print '%s Alexa match:%d Dict match: %d' % (domain, alexa_match, dict_match)

        # Examples:
        ngram_count('google')
        ngram_count('facebook')
        ngram_count('1cb8a5f36f')
        ngram_count('pterodactylfarts')
        ngram_count('ptes9dro-dwacty2lfa5rrts')
        ngram_count('beyonce')
        ngram_count('bey666on4ce')

        # Compute NGram matches for all the domains and add to our dataframe
        all_domains['alexa_grams']= alexa_counts * alexa_vc.transform(all_domains['domain']).T
        all_domains['word_grams']= dict_counts * dict_vc.transform(all_domains['domain']).T
        print all_domains.head()

        # Use the vectorized operations of the dataframe to investigate differences
        # between the alexa and word grams
        all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']

        # NOTE(review): DataFrame.sort() was removed in pandas 0.20 (use sort_values).
        # The table below shows those domain names that are more 'dictionary' and less 'web'
        print all_domains.sort(['diff'], ascending=True).head(10)

        # The table below shows those domain names that are more 'web' and less 'dictionary'
        # Good O' web....
        print all_domains.sort(['diff'], ascending=False).head(50)

        # Lets look at which Legit domains are scoring low on both alexa and word gram count
        weird_cond = (all_domains['class']=='legit') & (all_domains['word_grams']<3) & (all_domains['alexa_grams']<2)
        weird = all_domains[weird_cond]
        print weird.shape[0]
        print weird.head(10)

        # Epiphany... Alexa really may not be the best 'exemplar' set...
        #             (probably a no-shit moment for everyone else :)
        #
        # Discussion: If you're using these as exemplars of NOT DGA, then you're probably
        #             making things very hard on your machine learning algorithm.
        #             Perhaps we should have two categories of Alexa domains, 'legit'
        #             and a 'weird', based on some definition of weird.
        #             Looking at the entries above... we have approx 80 domains
        #             that we're going to mark as 'weird'.
        #
        all_domains.loc[weird_cond, 'class'] = 'weird'
        print all_domains['class'].value_counts()
        all_domains[all_domains['class'] == 'weird'].head()

        # Perhaps we will just exclude the weird class from our ML training
        not_weird = all_domains[all_domains['class'] != 'weird']
        # NOTE(review): as_matrix() was removed in pandas 1.0 (use .to_numpy()).
        X = not_weird.as_matrix(['length', 'entropy', 'alexa_grams', 'word_grams'])

        # Labels (scikit learn uses 'y' for classification labels)
        y = np.array(not_weird['class'].tolist())

        # Random Forest is a popular ensemble machine learning classifier.
        # http://scikit-learn.org/dev/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        # NOTE(review): the compute_importances kwarg was removed from scikit-learn;
        # feature_importances_ is available after fit() without it in modern versions.
        clf = sklearn.ensemble.RandomForestClassifier(n_estimators=20, compute_importances=True) # Trees in the forest

        # Train on a 80/20 split
        # NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20
        # (train_test_split now lives in sklearn.model_selection).
        from sklearn.cross_validation import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Now plot the results of the holdout set in a confusion matrix
        labels = ['legit', 'dga']
        cm = sklearn.metrics.confusion_matrix(y_test, y_pred, labels)
        show_cm(cm, labels)

        # We can also look at what features the learning algorithm thought were the most important
        importances = zip(['length', 'entropy', 'alexa_grams', 'word_grams'], clf.feature_importances_)
        print importances

        # Now train on the whole thing before doing tests and saving models to disk
        clf.fit(X, y)

        # test_it shows how to do evaluation, also fun for manual testing below :)
        def test_it(domain):
            '''Score one domain with the trained classifier and print the predicted class.'''
            _alexa_match = alexa_counts * alexa_vc.transform([domain]).T # Woot matrix multiply and transpose Woo Hoo!
            _dict_match = dict_counts * dict_vc.transform([domain]).T
            # NOTE(review): modern sklearn expects a 2D array here ([[...]]);
            # a flat feature list only works on the older versions targeted here.
            _X = [len(domain), entropy(domain), _alexa_match, _dict_match]
            print '%s : %s' % (domain, clf.predict(_X)[0])

        # Examples (feel free to change these and see the results!)
        test_it('google')
        test_it('google88')
        test_it('facebook')
        test_it('1cb8a5f36f')
        test_it('pterodactylfarts')
        test_it('ptes9dro-dwacty2lfa5rrts')
        test_it('beyonce')
        test_it('bey666on4ce')
        test_it('supersexy')
        test_it('yourmomissohotinthesummertime')
        test_it('35-sdf-09jq43r')
        test_it('clicksecurity')

        # Serialize the classifier plus both vectorizers and their count vectors
        # to disk (all five are needed together by the matching model_eval code)
        save_model_to_disk('dga_model_random_forest', clf)
        save_model_to_disk('dga_model_alexa_vectorizor', alexa_vc)
        save_model_to_disk('dga_model_alexa_counts', alexa_counts)
        save_model_to_disk('dga_model_dict_vectorizor', dict_vc)
        save_model_to_disk('dga_model_dict_counts', dict_counts)

    except KeyboardInterrupt:
        print 'Goodbye Cruel World...'
        sys.exit(0)
    # Python 2-only except syntax ('except Exception as error' in Python 3)
    except Exception, error:
        traceback.print_exc()
        print '(Exception):, %s' % (str(error))
        sys.exit(1)
# Script entry point: run the full model build only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()