# pe_sim_graph.py
# Note: the upstream repository was archived by its owner on Mar 15, 2021 and is now read-only.
"""This client generates a similarity graph from features in PE Files."""
import zerorpc
import os
import workbench_client
def add_it(workbench, file_list, labels):
    """Add the given file_list to workbench as samples, also add them as nodes.

    Args:
        workbench: Instance of Workbench Client; must provide
            store_sample(name, bytes, type_tag) and add_node(id, name, labels).
        file_list: list of file paths (bare names or full paths).
        labels: labels for the nodes.

    Returns:
        A list of md5s, one per stored file, in file_list order.
    """
    md5s = []
    for filename in file_list:
        base_name = os.path.basename(filename)

        # Skip macOS Finder metadata. The check is made on the base name
        # because file_list may hold joined paths ('<dir>/.DS_Store'), which
        # never compare equal to the bare string '.DS_Store'.
        if base_name == '.DS_Store':
            continue

        with open(filename, 'rb') as pe_file:
            raw_bytes = pe_file.read()

        # Store the sample under its base name, then register a graph node
        # keyed by the returned md5 (the first six characters are passed as
        # the second add_node argument).
        md5 = workbench.store_sample(base_name, raw_bytes, 'exe')
        workbench.add_node(md5, md5[:6], labels)
        md5s.append(md5)
    return md5s
def jaccard_sims(feature_list, threshold=.5):
    """Compute pairwise similarities between all the observations in the feature list.

    Every ordered pair of observations with different md5s is scored with
    jaccard_sim(); pairs scoring strictly above the threshold are kept. Because
    ordered pairs are used, a similar pair is reported in both directions
    (a -> b and b -> a).

    Args:
        feature_list: a list of dictionaries, each having structure as
            { 'md5' : String, 'features': list of Strings }
        threshold: similarity a pair must exceed (strictly) to be reported.
            Defaults to .5.

    Returns:
        list of dictionaries with structure as
        {'source': md5 String, 'target': md5 String, 'sim': similarity Number}
    """
    sim_info_list = []
    for source_info in feature_list:
        md5_source = source_info['md5']
        features_source = source_info['features']

        # A distinct name for the inner loop variable keeps the outer
        # observation from being rebound while its fields are still in use.
        for target_info in feature_list:
            md5_target = target_info['md5']
            features_target = target_info['features']

            # Never compare an observation with itself (or with another entry
            # carrying the same md5).
            if md5_source == md5_target:
                continue

            sim = jaccard_sim(features_source, features_target)
            if sim > threshold:
                sim_info_list.append({'source': md5_source, 'target': md5_target, 'sim': sim})
    return sim_info_list
def jaccard_sim(features1, features2):
    """Compute a set-overlap similarity between two feature collections.

    Both inputs are de-duplicated into sets; the score is the size of their
    intersection divided by the size of the larger set.

    NOTE(review): the textbook Jaccard index divides by the size of the union,
    not the larger set, so this score is >= the true Jaccard index. The formula
    is left unchanged here because the similarity cut-off used by callers was
    chosen against it -- confirm which definition is intended.

    Args:
        features1: list of PE Symbols.
        features2: list of PE Symbols.

    Returns:
        A float in [0.0, 1.0]; 0.0 when both inputs are empty.
    """
    set1 = set(features1)
    set2 = set(features2)

    larger_size = max(len(set1), len(set2))
    if larger_size == 0:
        # Both sets are empty: define the similarity as 0.0 (a float, to
        # match the type returned on every other path).
        return 0.0
    return len(set1 & set2) / float(larger_size)
def run():
    """This client generates a similarity graph from features in PE Files.

    Connects to a workbench server, stores up to five 'bad' and five 'good'
    PE samples from the data directory next to this file, adds them as graph
    nodes, then adds relationships between samples whose imported symbols,
    PE warning strings or strings are similar, plus the relationships reported
    by the pe_deep_sim worker.
    """

    # Grab server args
    args = workbench_client.grab_server_args()

    # Start up workbench connection
    workbench = zerorpc.Client(timeout=300, heartbeat=60)
    workbench.connect('tcp://'+args['server']+':'+args['port'])

    # Test out PEFile -> pe_deep_sim -> pe_jaccard_sim -> graph
    # Only the first five directory entries of each sample set are used.
    base_dir = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(base_dir, '../data/pe/bad')
    bad_files = [os.path.join(data_path, child) for child in os.listdir(data_path)][:5]
    data_path = os.path.join(base_dir, '../data/pe/good')
    good_files = [os.path.join(data_path, child) for child in os.listdir(data_path)][:5]

    # Clear any graph in the Neo4j database
    workbench.clear_graph_db()

    # First throw them into workbench and add them as nodes into the graph
    all_md5s = add_it(workbench, bad_files, ['exe', 'bad']) + add_it(workbench, good_files, ['exe', 'good'])

    # Compute pe_features on all files of type pe, just pull back the imported symbols
    import_gen = workbench.batch_work_request('pe_features',
        {'md5_list': all_md5s, 'subkeys':['md5', 'sparse_features.imported_symbols']})
    imports = [{'md5': r['md5'], 'features': r['imported_symbols']} for r in import_gen]

    # Compute pe_features on all files of type pe, just pull back the warning strings
    warning_gen = workbench.batch_work_request('pe_features',
        {'md5_list': all_md5s, 'subkeys':['md5', 'sparse_features.pe_warning_strings']})
    warnings = [{'md5': r['md5'], 'features': r['pe_warning_strings']} for r in warning_gen]

    # Compute strings on all files of type pe, just pull back the string_list
    string_gen = workbench.batch_work_request('strings', {'md5_list': all_md5s, 'subkeys':['md5', 'string_list']})
    strings = [{'md5': r['md5'], 'features': r['string_list']} for r in string_gen]

    # Compute pe_peid on all files of type pe, just pull back the match_list
    # Fixme: commenting this out until we figure out why peid is SO slow
    # peid_gen = workbench.batch_work_request('pe_peid', {'md5_list': all_md5s, 'subkeys':['md5', 'match_list']})
    # peids = [{'md5': r['md5'], 'features': r['match_list']} for r in peid_gen]

    # Compute the similarities for each feature family and store every pair
    # above the cut-off as a relationship of the matching type.
    # Fixme: add ('peids', peids) here once the pe_peid request above is re-enabled
    for rel_type, feature_list in (('imports', imports), ('warnings', warnings), ('strings', strings)):
        for sim_info in jaccard_sims(feature_list):
            workbench.add_rel(sim_info['source'], sim_info['target'], rel_type)

    # Compute pe_deep_sim on all files of type pe
    results = workbench.batch_work_request('pe_deep_sim', {'type_tag': 'exe'})

    # Store the ssdeep sims as relationships; list() consumes the whole result
    # iterable up front, before any add_rel call is made.
    for result in list(results):
        for sim_info in result['sim_list']:
            workbench.add_rel(result['md5'], sim_info['md5'], 'ssdeep')

    # Let them know where they can get their graph. A single parenthesized
    # argument keeps this print valid on both Python 2 and Python 3.
    print('All done: go to http://localhost:7474/browser and execute this query: "%s"' %
          ('match (n)-[r]-() return n,r'))
import pytest
@pytest.mark.xfail
def test():
    """Run the pe_sim_graph client end to end; marked xfail for pytest."""
    run()
# Script entry point: build the similarity graph when this file is executed directly.
if __name__ == '__main__':
    run()