In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from slither_sol_helpers import *
import os, json
import subprocess
from tqdm import tqdm
import swifter
from pandarallel import pandarallel
from transformers import AutoTokenizer, RobertaTokenizer
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, T5ForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from datasets import Dataset
pandarallel.initialize(progress_bar=True)


from swifter import set_defaults

# os.cpu_count() returns None when the number of CPUs cannot be determined;
# fall back to one core so `npartitions` is always a valid integer.
set_defaults(
    npartitions=2 * (os.cpu_count() or 1),
    dask_threshold=1,
    scheduler="processes",
    progress_bar=True,
    progress_bar_desc=None,
    allow_dask_on_strings=False,
    force_parallel=False,
)

# Root directory of the contract data. Set the SOURCIFY_DATA_DIR environment
# variable to run the notebook on another machine; the default is the
# original location, so existing setups are unaffected.
sourcify_data_root = os.environ.get('SOURCIFY_DATA_DIR',
                                    '/home/pippertetsing/sourcify_contract_data')
partial_dataset_path = os.path.join(sourcify_data_root, 'partial_match')
full_dataset_path = os.path.join(sourcify_data_root, 'full_match')
# Pickle file holding the cached frame of contract directories.
contracts_dirs_saved = './contracts_dirs.pkl'

  from .autonotebook import tqdm as notebook_tqdm


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Load the cached frame of contract directories when the pickle exists;
# otherwise scan both dataset roots, combine them and write the cache so the
# scan only has to happen once.
# NOTE(review): unpickling can execute arbitrary code - only load a pickle
# that this notebook (or you) produced.
if os.path.exists(contracts_dirs_saved):
    contracts_dirs = pd.read_pickle(contracts_dirs_saved)
else:
    contracts_dirs_partial = check_folder_structure(partial_dataset_path)
    contracts_dirs_full = check_folder_structure(full_dataset_path)
    contracts_dirs = pd.concat([contracts_dirs_full, contracts_dirs_partial])
    contracts_dirs.to_pickle(contracts_dirs_saved)

In [None]:
# Filter: retain only the rows whose `has_src_files` flag is True, i.e. the
# contract directories that have a `sources` directory.
contracts_dirs = contracts_dirs.loc[lambda frame: frame['has_src_files'] == True]
contracts_dirs

In [None]:
# Draw 10 random contract directories to experiment on (no random_state is
# passed to sample) and show the directory of the first sampled row.
sub_set = contracts_dirs.sample(n=10)
sub_set['contracts_dir'].iloc[0]

In [None]:
# Small test: run Slither sequentially over the sampled contracts and collect
# one record per Solidity file in `small_set`.
small_set = []
for _, row in tqdm(sub_set.iterrows(), total=len(sub_set)):
    src_dir = row.contracts_dir + '/sources'
    args = construct_mapping_and_args(row.contracts_dir + "/metadata.json", True, True)

    # Look the files up once and reuse the result for both the None check
    # and the loop (the directory was previously scanned twice).
    sol_files = get_all_sol_files(src_dir)
    if sol_files is None:
        print(f'no solidity file in {src_dir}')
        continue

    for sol_file_cp in sol_files:
        # Slither is started with cwd=src_dir, so pass the file path
        # relative to that directory.
        sol_file = sol_file_cp.replace(src_dir, '.')
        cmd = ['slither', sol_file, *args]
        p = subprocess.run(cmd,
                           cwd=src_dir,
                           shell=False,
                           capture_output=True,
                           universal_newlines=True)

        # Empty or non-JSON stdout means there is no report to parse:
        # record the file as unprocessed instead of aborting the whole run.
        try:
            output = json.loads(p.stdout)
        except json.JSONDecodeError:
            output = None

        parsed = output is not None
        small_set.append({'source_dir': src_dir, 'sol_file': sol_file_cp,
                          'processed': output['success'] if parsed else False,
                          'slither': get_slither_check_from_json(output) if parsed else None})
                        

In [None]:
def slither_process(df_row):
    """Run Slither on every Solidity file of one contract directory.

    Parameters
    ----------
    df_row
        One row of the contract-directory frame. Its `contracts_dir` and
        `has_src_files` fields are read.

    Returns
    -------
    pandas.DataFrame
        One row per Solidity file with the columns `source_dir`, `sol_file`,
        `contracts_dirs`, `has_src_files`, `slither_processed` and `slither`.
        The frame is empty when `get_all_sol_files` returns None.
    """
    result = []
    src_dir = df_row.contracts_dir + '/sources'
    args = construct_mapping_and_args(df_row.contracts_dir + "/metadata.json", True, True)

    # Look the files up once and reuse the result for both the None check
    # and the loop (the directory was previously scanned twice).
    sol_files = get_all_sol_files(src_dir)
    if sol_files is None:
        print(f'no solidity file in {src_dir}')
        return pd.DataFrame(result)

    for sol_file_cp in sol_files:
        # Slither is started with cwd=src_dir, so pass the file path
        # relative to that directory.
        sol_file = sol_file_cp.replace(src_dir, '.')
        cmd = ['slither', sol_file, *args]
        p = subprocess.run(cmd,
                           cwd=src_dir,
                           shell=False,
                           capture_output=True,
                           universal_newlines=True)

        # Empty or non-JSON stdout means there is no report to parse. Record
        # the file as unprocessed so a single bad file cannot abort a whole
        # (possibly parallel) run.
        try:
            output = json.loads(p.stdout)
        except json.JSONDecodeError:
            output = None

        parsed = output is not None
        # Same key order for every record, so the column order of the
        # returned frame does not depend on which file came first.
        result.append({'source_dir': src_dir, 'sol_file': sol_file_cp,
                       'contracts_dirs': df_row.contracts_dir,
                       'has_src_files': df_row.has_src_files,
                       'slither_processed': output['success'] if parsed else False,
                       'slither': get_slither_check_from_json(output) if parsed else None})
    return pd.DataFrame(result)

# Apply slither_process to every sampled row via pandarallel's parallel_apply;
# slither_process returns a DataFrame per row, and the result of the apply is
# stored in the new `slither_res` column.
sub_set['slither_res'] = sub_set.parallel_apply(slither_process, axis=1) 

In [None]:
# Stack the per-contract result frames into one DataFrame with a single
# concat; calling pd.concat inside a loop re-copies all accumulated rows on
# every iteration.
result_frames = list(sub_set['slither_res'])
# pd.concat raises ValueError on an empty list, so keep the empty-frame result.
dataset = pd.concat(result_frames) if result_frames else pd.DataFrame()

In [None]:
# Display the combined frame of per-file Slither results.
dataset

In [None]:
# Display the sampled rows, now including the `slither_res` column.
sub_set

In [None]:
# One-off debugging run: invoke Slither on a single, fixed contract file.
r = './full_match/11155111/0xbdCB29d71eBCDb7114Cfb8c6d9298B4976D6a608'
arg = construct_mapping_and_args(r + "/metadata.json")

# Command line: the target file first, then the arguments built from
# metadata.json.
cmd = ['slither', '_openzeppelin/contracts/proxy/Proxy.sol']
cmd.extend(arg)
print(cmd)

# Run from the contract's `sources` directory (rather than os.chdir) so the
# relative target path resolves.
p = subprocess.run(
    cmd,
    cwd=r + '/sources',
    shell=False,
    capture_output=True,
    universal_newlines=True,
)
# Kept for reference - the same call with the arguments written out by hand:
#   ['slither', '_openzeppelin/contracts/proxy/Proxy.sol',
#    '--solc-args', '--optimize --optimize-runs 200',
#    '--solc-solcs-select', '0.8.9',
#    '--exclude-informational', '--exclude-dependencies', '--exclude-optimization']

In [None]:
# Parse the captured stdout of the last subprocess run as JSON.
# json.loads raises json.JSONDecodeError when the text is empty or not JSON.
out = json.loads(p.stdout)

In [None]:
# Look up the `idx` entry of the `abiencoderv2-array` detector.
# NOTE(review): `detectors` is not defined in this notebook - presumably it
# comes from the star import of slither_sol_helpers; confirm.
detectors.get('abiencoderv2-array').get('idx')


In [3]:
# Location passed to AutoTokenizer.from_pretrained below (relative path).
base_model = '../solidity-generator'
# Pickled frame of contracts; later cells read its `sol_file` column.
# NOTE(review): unpickling can execute arbitrary code - only load files you
# produced yourself.
raw_sol_data = pd.read_pickle('./slither_processed_contracts_sample100.pkl')

tokenizer = AutoTokenizer.from_pretrained(base_model)
# Use the end-of-sequence token as the padding token (presumably because the
# checkpoint defines no pad token - confirm).
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(data):
    """Tokenize `data` and return only its token ids.

    `data` is handed straight to the module-level `tokenizer` with
    padding="max_length" and truncation enabled.

    Returns a dict with the single key 'input_ids'.

    Note: a later cell defines another function with this same name that
    expects a batch dict with a 'sol_file' entry; whichever cell ran last
    decides which version a call resolves to.
    """
    encoded = tokenizer(data, padding="max_length", truncation=True)
    return {'input_ids': encoded['input_ids']}

In [4]:
# Print every entry of the `sol_file` column, one per line.
for f in raw_sol_data.sol_file:
    print(f)

/home/pippertetsing/sourcify_contract_data/partial_match/1/0xB1eE0b3494d665C02BB1014Fb25bE0F16Ad802ed/sources/DSProxy.sol
/home/pippertetsing/sourcify_contract_data/full_match/80001/0xdf4cf3b2DCb294Ae2f3B5D4Cef453615cd583620/sources/_openzeppelin/contracts/proxy/Proxy.sol
/home/pippertetsing/sourcify_contract_data/full_match/80001/0xdf4cf3b2DCb294Ae2f3B5D4Cef453615cd583620/sources/_openzeppelin/contracts/proxy/beacon/IBeacon.sol
/home/pippertetsing/sourcify_contract_data/full_match/80001/0xdf4cf3b2DCb294Ae2f3B5D4Cef453615cd583620/sources/_openzeppelin/contracts/proxy/transparent/TransparentUpgradeableProxy.sol
/home/pippertetsing/sourcify_contract_data/full_match/80001/0xdf4cf3b2DCb294Ae2f3B5D4Cef453615cd583620/sources/_openzeppelin/contracts/proxy/ERC1967/ERC1967Upgrade.sol
/home/pippertetsing/sourcify_contract_data/full_match/80001/0xdf4cf3b2DCb294Ae2f3B5D4Cef453615cd583620/sources/_openzeppelin/contracts/proxy/ERC1967/ERC1967Proxy.sol
/home/pippertetsing/sourcify_contract_data/full_

In [5]:
# Load the first listed Solidity file twice: once with get_sol_data's default
# options and once with the second argument set to True (presumably
# "strip comments" - confirm against slither_sol_helpers).
d = get_sol_data(raw_sol_data.sol_file.iloc[0])
d_nc = get_sol_data(raw_sol_data.sol_file.iloc[0], True)


In [6]:
# Tokenize the single source loaded into `d`. This needs the tokenize_function
# variant that passes its argument straight to the tokenizer; the batched
# variant defined further down reads data['sol_file'] instead.
test = tokenize_function(d)

In [7]:
# Decode the token ids back to text to check the tokenization round trip.
tokenizer.decode(test['input_ids'])

"// proxy.sol - execute actions atomically through the proxy's identity\n\n// Copyright (C) 2017  DappHub, LLC\n\n// This program is free software: you can redistribute it and/or modify\n// it under the terms of the GNU General Public License as published by\n// the Free Software Foundation, either version 3 of the License, or\n// (at your option) any later version.\n\n// This program is distributed in the hope that it will be useful,\n// but WITHOUT ANY WARRANTY; without even the implied warranty of\n// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n// GNU General Public License for more details.\n\n// You should have received a copy of the GNU General Public License\n// along with this program.  If not, see <http://www.gnu.org/licenses/>.\n\npragma solidity ^0.4.23;\n\ncontract DSAuthority {\n    function canCall(\n        address src, address dst, bytes4 sig\n    ) public view returns (bool);\n}\n\ncontract DSAuthEvents {\n    event LogSetAuthority (address indexed a

In [8]:
# Show the value loaded into `d` for comparison with the decoded text.
d

"// proxy.sol - execute actions atomically through the proxy's identity\n\n// Copyright (C) 2017  DappHub, LLC\n\n// This program is free software: you can redistribute it and/or modify\n// it under the terms of the GNU General Public License as published by\n// the Free Software Foundation, either version 3 of the License, or\n// (at your option) any later version.\n\n// This program is distributed in the hope that it will be useful,\n// but WITHOUT ANY WARRANTY; without even the implied warranty of\n// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n// GNU General Public License for more details.\n\n// You should have received a copy of the GNU General Public License\n// along with this program.  If not, see <http://www.gnu.org/licenses/>.\n\npragma solidity ^0.4.23;\n\ncontract DSAuthority {\n    function canCall(\n        address src, address dst, bytes4 sig\n    ) public view returns (bool);\n}\n\ncontract DSAuthEvents {\n    event LogSetAuthority (address indexed a

In [9]:
# Wrap the pandas frame in a `datasets.Dataset` so it can be processed with .map.
hf = Dataset.from_pandas(raw_sol_data)

In [10]:
# Chunk length used by group_texts.
block_size = 128
# Maximum token count per tokenized chunk; read by the batched tokenize_function.
context_length = 32

def group_texts(examples):
    """Concatenate a batch of token sequences and re-split it into fixed-size blocks.

    Parameters
    ----------
    examples : dict
        Maps each column name to a list of sequences (one per example). The
        result builds `labels` from the `input_ids` column, so that column
        must be present.

    Returns
    -------
    dict
        The same columns, each re-split into chunks of `block_size` items,
        plus a `labels` column that is a copy of the `input_ids` chunk list.
        When the concatenated length is at least `block_size`, the trailing
        remainder is dropped; a shorter batch is returned as one short chunk.
    """
    # Flatten each column in a single pass. sum(list_of_lists, []) builds a
    # new list at every step, which is quadratic in the number of tokens.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder so every chunk is full; padding could be added
    # here instead if the model supported it.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result


def tokenize_function(data):
    """Tokenize a batch of Solidity files into chunks of `context_length` tokens.

    Each path in data['sol_file'] is loaded with get_sol_data(path, True)
    (the original inline comment says this gets rid of comments in the source
    files). The texts are tokenized with truncation to `context_length`,
    padding="max_length", and overflowing tokens returned as extra chunks.

    Only chunks whose reported length equals `context_length` are kept.
    NOTE(review): with padding="max_length" the reported length may already
    include the padding, in which case this filter keeps every chunk - confirm.

    Returns a dict with the single key 'input_ids' (a list of token-id lists).

    Note: this rebinds the name of the string-based tokenize_function defined
    in an earlier cell.
    """
    sources = [get_sol_data(path, True) for path in data['sol_file']]
    encoded = tokenizer(
        sources,
        max_length=context_length,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True,
        truncation=True,
    )

    full_chunks = [
        ids
        for n_tokens, ids in zip(encoded['length'], encoded['input_ids'])
        if n_tokens == context_length
    ]
    return {'input_ids': full_chunks}

In [16]:
# Tokenize in batches of 10 rows and drop all original columns, so the result
# only has the columns returned by tokenize_function. The follow-up
# group_texts mapping is currently disabled.
x = hf.map(tokenize_function, batch_size=10, batched=True, remove_columns=hf.column_names)#.map(group_texts, batched=True)

Map:   0%|          | 0/466 [00:00<?, ? examples/s]

Map: 100%|██████████| 466/466 [00:00<00:00, 657.99 examples/s]


In [12]:
# Number of token ids in the third example of the mapped dataset.
len(x[2]['input_ids'])
#len(x[2]['labels'])


128

In [13]:
# Number of examples in the mapped dataset.
len(x)

4717

In [14]:
# Decode the labels of the first example.
# NOTE(review): `labels` is only added by group_texts, and the group_texts
# mapping is commented out where `x` is built - with the current pipeline `x`
# has no `labels` column, so this cell only works on a stale `x`.
tokenizer.decode(x['labels'][0])

'pragma solidity ^0.4.23;\n\ncontract DSAuthority {\n    function canCall(\n        address src, address dst, bytes4 sig\n    ) public view returns (bool);\n}\n\ncontract DSAuthEvents {\n    event LogSetAuthority (address indexed authority);\n    event LogSetOwner     (address indexed owner);\n}\n\ncontract DSAuth is DSAuthEvents {\n    DSAuthority  public  authority;\n    address      public  owner;\n\n    constructor() public {\n        owner = msg.sender;\n        emit LogSetOwner(msg.sender);'

In [15]:
# Decode the token ids of the first example back to text.
tokenizer.decode(x['input_ids'][0])

'pragma solidity ^0.4.23;\n\ncontract DSAuthority {\n    function canCall(\n        address src, address dst, bytes4 sig\n    ) public view returns (bool);\n}\n\ncontract DSAuthEvents {\n    event LogSetAuthority (address indexed authority);\n    event LogSetOwner     (address indexed owner);\n}\n\ncontract DSAuth is DSAuthEvents {\n    DSAuthority  public  authority;\n    address      public  owner;\n\n    constructor() public {\n        owner = msg.sender;\n        emit LogSetOwner(msg.sender);'

In [18]:
# Hold out 10% of the examples for evaluation. The result is bound to a name
# so later cells can use it, and the fixed seed makes the split reproducible
# across kernel restarts.
split_dataset = x.train_test_split(test_size=0.1, seed=42)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 16983
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 1888
    })
})