In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Dataset-Vulnerability creation                                            #
#                                                                            #
##############################################################################

In [2]:
## Requirements
# tqdm==4.64.0
# pandas==1.4.2

In [3]:
import json
import pandas as pd
import itertools

from tqdm import tqdm

**Read the flowchart CSV**

In [4]:
flowchart = pd.read_csv("../../DBs/Dataset-Muaz/features/flowchart_Dataset-Muaz.csv")
print(flowchart.shape)

(2412, 8)


In [5]:
flowchart.head()

Unnamed: 0,idb_path,fva,func_name,start_ea,end_ea,bb_num,bb_list,hashopcodes
0,IDBs/Dataset-Muaz/openssl.i64,0x43470,__do_global_dtors_aux,0x43470,0x434a9,5,0x43470;0x4347d;0x4348b;0x43497;0x434a8,7ba892443ac4fab9631507cc72221283b80b73025445fa...
1,IDBs/Dataset-Muaz/openssl.i64,0x435ad,mock_srv_ctx_new,0x435ad,0x4361b,7,0x435ad;0x435d9;0x435f3;0x43604;0x43607;0x4360...,35b3f7e1a77c54606741d8ce4c11a8b060a0a82c1db1bf...
2,IDBs/Dataset-Muaz/openssl.i64,0x4361b,ossl_cmp_mock_srv_set1_refCert,0x4361b,0x436c4,7,0x4361b;0x43642;0x43685;0x4368c;0x4369c;0x436b...,ecc80d570dbd9e6cf8c81241c385e1d827816eafed1a46...
3,IDBs/Dataset-Muaz/openssl.i64,0x436c4,ossl_cmp_mock_srv_set1_certOut,0x436c4,0x4376f,7,0x436c4;0x436eb;0x4372e;0x43735;0x43745;0x4376...,ecc80d570dbd9e6cf8c81241c385e1d827816eafed1a46...
4,IDBs/Dataset-Muaz/openssl.i64,0x4376f,ossl_cmp_mock_srv_set1_chainOut,0x4376f,0x43829,7,0x4376f;0x4379e;0x437e1;0x437e8;0x437ff;0x4380...,2ec40fa09fee375d30a5d6ccb8104c1dbf9cc539ed9d39...


In [6]:
flowchart.groupby(['func_name']).count()

Unnamed: 0_level_0,idb_path,fva,start_ea,end_ea,bb_num,bb_list,hashopcodes
func_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DSA_sign_loop,3,3,3,3,3,3,3
DSA_verify_loop,3,3,3,3,3,3,3
ECDH_EVP_derive_key_loop,3,3,3,3,3,3,3
ECDSA_sign_loop,3,3,3,3,3,3,3
ECDSA_verify_loop,3,3,3,3,3,3,3
...,...,...,...,...,...,...,...
x509_load_serial,3,3,3,3,3,3,3
x509_main,3,3,3,3,3,3,3
x509_req_ctrl,3,3,3,3,3,3,3
x509_to_req,3,3,3,3,3,3,3


In [7]:
flowchart.loc[flowchart['func_name'] == 'getMatrixElements']

Unnamed: 0,idb_path,fva,func_name,start_ea,end_ea,bb_num,bb_list,hashopcodes


**Functions of interest**

In [8]:
fun_of_interest = list(flowchart['func_name'])
fun_of_interest = [
    'slist_wc_append',
    'ssl_srp_verify_param_cb',
    'notef',
    'app_create_libctx',
    'tool_set_stderr'
]
selected_columns = ['idb_path', 'fva', 'func_name', 'hashopcodes']

df0 = flowchart[selected_columns]
df = df0.loc[df0['func_name'].isin(fun_of_interest)]


# Store the new function pairs
df.reset_index(inplace=True)
pairs = list(itertools.combinations(df.index,2))
df = df.drop('index', axis=1)

**Create all pairs of all functions of interest**

In [9]:
comparison_list = list()
print("Nb of fun: {}".format(len(df)))

# Iterate over each unique pair of function in the list

for f1,f2 in tqdm(set(pairs)):
    comparison_list.append(list(df.iloc[f1]) + list(df.iloc[f2]))

Nb of fun: 18


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 153/153 [00:00<00:00, 26308.97it/s]


In [10]:
comparison_list[0]

['IDBs/Dataset-Muaz/openssl.i64',
 '0xc5c77',
 'ssl_srp_verify_param_cb',
 'df990f329828536fdaee29c544ddbc5fae6407842325b926c78405274d6747fc',
 'IDBs/Dataset-Muaz/curl.i64',
 '0x1ba7a',
 'notef',
 '33bccf5039d336dce50f0dbc3dba165e74eaf31508ce829c5bfc41ad03ba234e']

In [11]:
len(comparison_list)

153

In [12]:
# Create a new DataFrame
columns = [x + "_1" for x in selected_columns ] + [x + "_2" for x in selected_columns ]
testing = pd.DataFrame(comparison_list, columns=columns)

# Add the db_type column 
testing['db_type'] = ['XM'] * testing.shape[0]
print(testing.shape)

(153, 9)


In [13]:
# Sort the rows
testing.sort_values(by=['idb_path_1', 'fva_1', 'idb_path_2', 'fva_2'], inplace=True)
testing.reset_index(inplace=True, drop=True)
print(testing.shape)

(153, 9)


In [14]:
# Check that the hashopcodes of the functions to compare are different
for i, row in testing.iterrows():
    if row['hashopcodes_1'] == row['hashopcodes_2']:
        print("MATCH!")
        print(row)

MATCH!
idb_path_1                              IDBs/Dataset-Muaz/curl.i64
fva_1                                                      0x1ba7a
func_name_1                                                  notef
hashopcodes_1    33bccf5039d336dce50f0dbc3dba165e74eaf31508ce82...
idb_path_2                              IDBs/Dataset-Muaz/curl.i64
fva_2                                                      0x1ba7a
func_name_2                                                  notef
hashopcodes_2    33bccf5039d336dce50f0dbc3dba165e74eaf31508ce82...
db_type                                                         XM
Name: 0, dtype: object
MATCH!
idb_path_1                              IDBs/Dataset-Muaz/curl.i64
fva_1                                                      0x1ba7a
func_name_1                                                  notef
hashopcodes_1    33bccf5039d336dce50f0dbc3dba165e74eaf31508ce82...
idb_path_2                              IDBs/Dataset-Muaz/curl.i64
fva_2                    

In [15]:
# Paranoid check
testing.drop_duplicates(inplace=True)
testing.reset_index(inplace=True, drop=True)
print(testing.shape)

(44, 9)


In [16]:
# Remove hashopcodes columns
del testing['hashopcodes_1']
del testing['hashopcodes_2']

In [17]:
testing.head()

Unnamed: 0,idb_path_1,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
0,IDBs/Dataset-Muaz/curl.i64,0x1ba7a,notef,IDBs/Dataset-Muaz/curl.i64,0x1ba7a,notef,XM
1,IDBs/Dataset-Muaz/curl.i64,0x1ba7a,notef,IDBs/Dataset-Muaz/curl.i64,0xb9fd,slist_wc_append,XM
2,IDBs/Dataset-Muaz/curl.i64,0x1ba7a,notef,IDBs/Dataset-Muaz/curl_obf.i64,0x1baa4,notef,XM
3,IDBs/Dataset-Muaz/curl.i64,0x1ba7a,notef,IDBs/Dataset-Muaz/curl_obf.i64,0x1baa9,notef,XM
4,IDBs/Dataset-Muaz/curl.i64,0x1ba7a,notef,IDBs/Dataset-Muaz/curl_obf.i64,0xb9fd,slist_wc_append,XM


In [18]:
testing.tail()

Unnamed: 0,idb_path_1,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
39,IDBs/Dataset-Muaz/openssl.i64,0xc5c77,ssl_srp_verify_param_cb,IDBs/Dataset-Muaz/curl_obf.i64,0x1baa4,notef,XM
40,IDBs/Dataset-Muaz/openssl.i64,0xc5c77,ssl_srp_verify_param_cb,IDBs/Dataset-Muaz/curl_obf.i64,0x1baa9,notef,XM
41,IDBs/Dataset-Muaz/openssl.i64,0xc5c77,ssl_srp_verify_param_cb,IDBs/Dataset-Muaz/curl_obf.i64,0xb9fd,slist_wc_append,XM
42,IDBs/Dataset-Muaz/openssl.i64,0xc5c77,ssl_srp_verify_param_cb,IDBs/Dataset-Muaz/openssl.i64,0xb2fc5,app_create_libctx,XM
43,IDBs/Dataset-Muaz/openssl.i64,0xc5c77,ssl_srp_verify_param_cb,IDBs/Dataset-Muaz/openssl.i64,0xc5c77,ssl_srp_verify_param_cb,XM


In [19]:
testing.groupby(['idb_path_1', 'func_name_1']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,fva_1,idb_path_2,fva_2,func_name_2,db_type
idb_path_1,func_name_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Muaz/curl.i64,notef,7,7,7,7,7
IDBs/Dataset-Muaz/curl.i64,slist_wc_append,7,7,7,7,7
IDBs/Dataset-Muaz/curl_obf.i64,notef,9,9,9,9,9
IDBs/Dataset-Muaz/curl_obf.i64,slist_wc_append,7,7,7,7,7
IDBs/Dataset-Muaz/openssl.i64,app_create_libctx,7,7,7,7,7
IDBs/Dataset-Muaz/openssl.i64,ssl_srp_verify_param_cb,7,7,7,7,7


In [20]:
testing.groupby(['idb_path_1']).count()

Unnamed: 0_level_0,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
idb_path_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Muaz/curl.i64,14,14,14,14,14,14
IDBs/Dataset-Muaz/curl_obf.i64,16,16,16,16,16,16
IDBs/Dataset-Muaz/openssl.i64,14,14,14,14,14,14


In [21]:
testing.groupby(['idb_path_2']).count()

Unnamed: 0_level_0,idb_path_1,fva_1,func_name_1,fva_2,func_name_2,db_type
idb_path_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
IDBs/Dataset-Muaz/curl.i64,14,14,14,14,14,14
IDBs/Dataset-Muaz/curl_obf.i64,18,18,18,18,18,18
IDBs/Dataset-Muaz/openssl.i64,12,12,12,12,12,12


In [22]:
# Save the DataFrame to file
testing.to_csv("../../DBs/Dataset-Muaz/pairs/pairs_testing_Dataset-Muaz.csv")

In [23]:
# Save the "selected functions" to a JSON.
# This is useful to limit the IDA analysis to some functions only.

testing_functions = set([tuple(x) for x in testing[['idb_path_1', 'fva_1']].values])
testing_functions |= set([tuple(x) for x in testing[['idb_path_2', 'fva_2']].values])
print("Found {} unique functions".format(len(testing_functions)))

from collections import defaultdict
selected_functions = defaultdict(list)
for t in testing_functions:
    selected_functions[t[0]].append(int(t[1], 16))

# Test
assert(sum([len(v) for v in selected_functions.values()]) == len(testing_functions))

# Save to file
with open("../../DBs/Dataset-Muaz/features/selected_Dataset-Muaz.json", "w") as f_out:
    json.dump(selected_functions, f_out)

Found 7 unique functions


In [24]:
# Save the "selected functions" to a CSV.
# This will be useful to post-process the results.

# Remove from flowchart the functions that are not used for the testing
dataset = flowchart.copy()
del dataset['bb_list']
del_list = list()
for i, row in dataset.iterrows():
    if not tuple([row['idb_path'], row['fva']]) in testing_functions:
        del_list.append(i)
dataset.drop(del_list, inplace=True)
dataset.reset_index(inplace=True, drop=True)
print(dataset.shape)

# Save to file
dataset.to_csv("../../DBs/Dataset-Muaz/testing_Dataset-Muaz.csv")

(18, 7)
