In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Dataset-Muaz creation                                                     #
#                                                                            #
##############################################################################

In [2]:
## Requirements
# tqdm==4.64.0
# pandas==1.4.2

In [3]:
import json
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

**Read the flowchart CSV**

In [4]:
# Load the flowchart features table and report its (rows, columns) shape.
FLOWCHART_CSV = "../../DBs/Dataset-Muaz/features/flowchart_Dataset-Muaz.csv"
flowchart = pd.read_csv(FLOWCHART_CSV)
print(flowchart.shape)

(54, 8)


In [5]:
# Preview the first rows of the flowchart table.
flowchart.head()

Unnamed: 0,idb_path,fva,func_name,start_ea,end_ea,bb_num,bb_list,hashopcodes
0,IDBs/Dataset-Muaz/ssort_sum.i64,0x1180,__do_global_dtors_aux,0x1180,0x11b9,5,0x1180;0x118d;0x119b;0x11a7;0x11b8,7ba892443ac4fab9631507cc72221283b80b73025445fa...
1,IDBs/Dataset-Muaz/ssort_sum.i64,0x11fa,selectionSort,0x11fa,0x1349,12,0x11fa;0x1217;0x1223;0x1234;0x1264;0x126a;0x12...,1ba8d45d00bfc572df9b4d53850f21867287412635c202...
2,IDBs/Dataset-Muaz/bub_sum.i64,0x1180,__do_global_dtors_aux,0x1180,0x11b9,5,0x1180;0x118d;0x119b;0x11a7;0x11b8,7ba892443ac4fab9631507cc72221283b80b73025445fa...
3,IDBs/Dataset-Muaz/bub_sum.i64,0x11fa,bubbleSort,0x11fa,0x1349,12,0x11fa;0x1217;0x1223;0x122c;0x1260;0x1297;0x12...,78320e1cfd99824098b47fab39ea3a12e840a533ddd292...
4,IDBs/Dataset-Muaz/matmul_og.i64,0x1180,__do_global_dtors_aux,0x1180,0x11b9,5,0x1180;0x118d;0x119b;0x11a7;0x11b8,7ba892443ac4fab9631507cc72221283b80b73025445fa...


In [6]:
# Count the non-null entries of every column, grouped by IDB file.
flowchart.groupby(['idb_path']).count()

Unnamed: 0_level_0,fva,func_name,start_ea,end_ea,bb_num,bb_list,hashopcodes
idb_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
IDBs/Dataset-Muaz/bub_sum.i64,6,6,6,6,6,6,6
IDBs/Dataset-Muaz/bubsort_og.i64,6,6,6,6,6,6,6
IDBs/Dataset-Muaz/matmul_og.i64,12,12,12,12,12,12,12
IDBs/Dataset-Muaz/matmul_sum.i64,15,15,15,15,15,15,15
IDBs/Dataset-Muaz/ssort_og.i64,6,6,6,6,6,6,6
IDBs/Dataset-Muaz/ssort_sum.i64,6,6,6,6,6,6,6
IDBs/Dataset-Muaz/sum_og.i64,3,3,3,3,3,3,3


**Functions of interest**

In [7]:
# Names of the functions to pair up; rows are matched on the `func_name` column.
fun_of_interest = ['bubbleSort', 'multiplyMatrices', 'selectionSort']

**Create all pairs of all functions**

In [8]:
selected_columns = ['idb_path', 'fva', 'func_name', 'hashopcodes']

# Collect once the selected columns of every flowchart row whose name is in
# `fun_of_interest`, ordered by the function list first and by flowchart row
# order second. The same rows act as both source and target of the pairs, so
# there is no need to re-filter the DataFrame inside the pairing loop.
interest_rows = [
    list(values)
    for interest_fun in fun_of_interest
    for values in flowchart.loc[
        flowchart['func_name'] == interest_fun, selected_columns
    ].values
]

# Store the new function pairs: the full ordered cross product
# (source columns followed by target columns), self-pairs included.
# Duplicated flowchart rows produce duplicated pairs here.
comparison_list = [
    left + right
    for left in interest_rows
    for right in interest_rows
]

In [9]:
# Show only the first few pairs; the full list is too long to display.
comparison_list[:4]

[['IDBs/Dataset-Muaz/bub_sum.i64',
  '0x11fa',
  'bubbleSort',
  '78320e1cfd99824098b47fab39ea3a12e840a533ddd292733b92d7b790cfd0a0',
  'IDBs/Dataset-Muaz/bub_sum.i64',
  '0x11fa',
  'bubbleSort',
  '78320e1cfd99824098b47fab39ea3a12e840a533ddd292733b92d7b790cfd0a0'],
 ['IDBs/Dataset-Muaz/bub_sum.i64',
  '0x11fa',
  'bubbleSort',
  '78320e1cfd99824098b47fab39ea3a12e840a533ddd292733b92d7b790cfd0a0',
  'IDBs/Dataset-Muaz/bubsort_og.i64',
  '0x11da',
  'bubbleSort',
  'ee58e509d405558c0594e5268578be3ac0f6277377ac743715ae601f784931a2'],
 ['IDBs/Dataset-Muaz/bub_sum.i64',
  '0x11fa',
  'bubbleSort',
  '78320e1cfd99824098b47fab39ea3a12e840a533ddd292733b92d7b790cfd0a0',
  'IDBs/Dataset-Muaz/bub_sum.i64',
  '0x11fa',
  'bubbleSort',
  '78320e1cfd99824098b47fab39ea3a12e840a533ddd292733b92d7b790cfd0a0'],
 ['IDBs/Dataset-Muaz/bub_sum.i64',
  '0x11fa',
  'bubbleSort',
  '78320e1cfd99824098b47fab39ea3a12e840a533ddd292733b92d7b790cfd0a0',
  'IDBs/Dataset-Muaz/bubsort_og.i64',
  '0x11da',
  'bubbleSort

In [10]:
# Total number of generated pairs (duplicates included).
len(comparison_list)

324

In [11]:
# Build the pairs table: the selected columns appear twice, suffixed with
# "_1" for the source function and "_2" for the target function.
source_names = [f"{name}_1" for name in selected_columns]
target_names = [f"{name}_2" for name in selected_columns]
columns = source_names + target_names
testing = pd.DataFrame(comparison_list, columns=columns)

# Every pair gets the same db_type label.
testing['db_type'] = 'XM'
print(testing.shape)

(324, 9)


In [12]:
# Order the pairs by source function, then by target function, and renumber
# the rows so that the index follows the sorted order.
sort_keys = ['idb_path_1', 'fva_1', 'idb_path_2', 'fva_2']
testing = testing.sort_values(by=sort_keys).reset_index(drop=True)
print(testing.shape)

(324, 9)


In [13]:
# Report every pair whose two functions have the same hashopcodes value.
# Nothing is removed here; matching rows are only printed.
same_hash = testing['hashopcodes_1'] == testing['hashopcodes_2']
for _, row in testing[same_hash].iterrows():
    print("MATCH!")
    print(row)

MATCH!
idb_path_1                           IDBs/Dataset-Muaz/bub_sum.i64
fva_1                                                       0x11fa
func_name_1                                             bubbleSort
hashopcodes_1    78320e1cfd99824098b47fab39ea3a12e840a533ddd292...
idb_path_2                           IDBs/Dataset-Muaz/bub_sum.i64
fva_2                                                       0x11fa
func_name_2                                             bubbleSort
hashopcodes_2    78320e1cfd99824098b47fab39ea3a12e840a533ddd292...
db_type                                                         XM
Name: 0, dtype: object
MATCH!
idb_path_1                           IDBs/Dataset-Muaz/bub_sum.i64
fva_1                                                       0x11fa
func_name_1                                             bubbleSort
hashopcodes_1    78320e1cfd99824098b47fab39ea3a12e840a533ddd292...
idb_path_2                           IDBs/Dataset-Muaz/bub_sum.i64
fva_2                    

In [14]:
# Drop exact duplicate pair rows and renumber the remaining ones.
testing = testing.drop_duplicates().reset_index(drop=True)
print(testing.shape)

(36, 9)


In [15]:
# Remove the hashopcodes columns from the pairs table.
testing = testing.drop(columns=['hashopcodes_1', 'hashopcodes_2'])

In [16]:
# Preview the first pairs of the final table.
testing.head()

Unnamed: 0,idb_path_1,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
0,IDBs/Dataset-Muaz/bub_sum.i64,0x11fa,bubbleSort,IDBs/Dataset-Muaz/bub_sum.i64,0x11fa,bubbleSort,XM
1,IDBs/Dataset-Muaz/bub_sum.i64,0x11fa,bubbleSort,IDBs/Dataset-Muaz/bubsort_og.i64,0x11da,bubbleSort,XM
2,IDBs/Dataset-Muaz/bub_sum.i64,0x11fa,bubbleSort,IDBs/Dataset-Muaz/matmul_og.i64,0x11f2,multiplyMatrices,XM
3,IDBs/Dataset-Muaz/bub_sum.i64,0x11fa,bubbleSort,IDBs/Dataset-Muaz/matmul_sum.i64,0x12a3,multiplyMatrices,XM
4,IDBs/Dataset-Muaz/bub_sum.i64,0x11fa,bubbleSort,IDBs/Dataset-Muaz/ssort_og.i64,0x11da,selectionSort,XM


In [17]:
# Preview the last pairs of the final table.
testing.tail()

Unnamed: 0,idb_path_1,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
31,IDBs/Dataset-Muaz/ssort_sum.i64,0x11fa,selectionSort,IDBs/Dataset-Muaz/bubsort_og.i64,0x11da,bubbleSort,XM
32,IDBs/Dataset-Muaz/ssort_sum.i64,0x11fa,selectionSort,IDBs/Dataset-Muaz/matmul_og.i64,0x11f2,multiplyMatrices,XM
33,IDBs/Dataset-Muaz/ssort_sum.i64,0x11fa,selectionSort,IDBs/Dataset-Muaz/matmul_sum.i64,0x12a3,multiplyMatrices,XM
34,IDBs/Dataset-Muaz/ssort_sum.i64,0x11fa,selectionSort,IDBs/Dataset-Muaz/ssort_og.i64,0x11da,selectionSort,XM
35,IDBs/Dataset-Muaz/ssort_sum.i64,0x11fa,selectionSort,IDBs/Dataset-Muaz/ssort_sum.i64,0x11fa,selectionSort,XM


In [18]:
# Count the pairs per source IDB file and source function name.
testing.groupby(['idb_path_1', 'func_name_1']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,fva_1,idb_path_2,fva_2,func_name_2,db_type
idb_path_1,func_name_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Muaz/bub_sum.i64,bubbleSort,6,6,6,6,6
IDBs/Dataset-Muaz/bubsort_og.i64,bubbleSort,6,6,6,6,6
IDBs/Dataset-Muaz/matmul_og.i64,multiplyMatrices,6,6,6,6,6
IDBs/Dataset-Muaz/matmul_sum.i64,multiplyMatrices,6,6,6,6,6
IDBs/Dataset-Muaz/ssort_og.i64,selectionSort,6,6,6,6,6
IDBs/Dataset-Muaz/ssort_sum.i64,selectionSort,6,6,6,6,6


In [19]:
# Count the pairs per source IDB file.
testing.groupby(['idb_path_1']).count()

Unnamed: 0_level_0,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
idb_path_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Muaz/bub_sum.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/bubsort_og.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/matmul_og.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/matmul_sum.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/ssort_og.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/ssort_sum.i64,6,6,6,6,6,6


In [20]:
# Count the pairs per target IDB file.
testing.groupby(['idb_path_2']).count()

Unnamed: 0_level_0,idb_path_1,fva_1,func_name_1,fva_2,func_name_2,db_type
idb_path_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Muaz/bub_sum.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/bubsort_og.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/matmul_og.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/matmul_sum.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/ssort_og.i64,6,6,6,6,6,6
IDBs/Dataset-Muaz/ssort_sum.i64,6,6,6,6,6,6


In [21]:
# Write the pairs table to disk (to_csv also writes the row index by default).
PAIRS_CSV = "../../DBs/Dataset-Muaz/pairs/pairs_testing_Dataset-Muaz.csv"
testing.to_csv(PAIRS_CSV)

In [22]:
# Save the "selected functions" to a JSON.
# This is useful to limit the IDA analysis to some functions only.

# Unique (idb_path, fva) keys that appear on either side of a pair.
testing_functions = {
    tuple(key) for key in testing[['idb_path_1', 'fva_1']].values
} | {
    tuple(key) for key in testing[['idb_path_2', 'fva_2']].values
}
print("Found {} unique functions".format(len(testing_functions)))

# Map each IDB path to the integer addresses of its selected functions.
# The set is walked in sorted order (path, then numeric address) so the JSON
# written below is identical between runs; plain set iteration order over
# strings can change from one interpreter session to the next.
selected_functions = defaultdict(list)
for idb_path, fva in sorted(testing_functions, key=lambda key: (key[0], int(key[1], 16))):
    selected_functions[idb_path].append(int(fva, 16))

# Test: every unique function must have been stored exactly once.
assert sum(len(v) for v in selected_functions.values()) == len(testing_functions)

# Save to file
with open("../../DBs/Dataset-Muaz/features/selected_Dataset-Muaz.json", "w") as f_out:
    json.dump(selected_functions, f_out)

Found 6 unique functions


In [23]:
# Save the "selected functions" to a CSV.
# This will be useful to post-process the results.

# Keep only the flowchart rows whose (idb_path, fva) key is used by the
# testing pairs. A boolean mask replaces the row-by-row iterrows/drop loop
# and leaves `flowchart` itself untouched.
# NOTE(review): repeated flowchart rows are kept as-is (the recorded run shows
# 18 rows for 6 unique functions) - confirm whether they should be dropped.
is_selected = [
    key in testing_functions
    for key in zip(flowchart['idb_path'], flowchart['fva'])
]
dataset = (
    flowchart
    .drop(columns=['bb_list'])
    .loc[is_selected]
    .reset_index(drop=True)
)
print(dataset.shape)

# Save to file
dataset.to_csv("../../DBs/Dataset-Muaz/testing_Dataset-Muaz.csv")

(18, 7)
