In [1]:
import pandas as pd
from transformers import RobertaTokenizer
import datasets
from datasets import Dataset, DatasetDict

data_path = './comment_code_sol.pkl'


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = pd.read_pickle(data_path)

In [3]:
# drop all code and comment value not longer as 20 characters
print("INFO: Original dataset length:", len(dataset))

dataset=dataset[dataset['code_string'].str.len() >= 20]
print("INFO: dataset length after code length refinment:", len(dataset))

dataset=dataset[dataset['comments'].str.len() >= 20]
print("INFO: dataset length after comment length refinment:", len(dataset))


INFO: Original dataset length: 17211242
INFO: dataset length after code length refinment: 13728139
INFO: dataset length after comment length refinment: 12650850


In [4]:
# remove lines with no code
def hasMarker(code):
    if ';' in code:
        return True
    
    elif '}' in code:
        return True
    
    elif '{' in code:
        return True
    
    elif ';' in code:
        return True
    else:
        return False


def is_function(code):
    return True if 'function' in code else False

def discard_contract_or_lib(code):
    if "contract" in code:
        return False 

    if "library" in code:
        return False
    return True 

# remove docstring 
def strip_comment(com):
    com = com.replace('*','').strip()
    com = com.replace('@title','').strip()
    com = com.replace('@author','').strip()
    com = com.replace('@notice','').strip()
    com = com.replace('@dev','').strip()
    com = com.replace('@param','').strip()
    com = com.replace('#','').strip()
    com = com.replace('@return','return').strip()
    return com

dataset=dataset[dataset['code_string'].apply(lambda x: hasMarker(x))]
print("INFO: dataset length after marker processing:", len(dataset))

dataset=dataset[dataset['code_string'].apply(lambda x: is_function(x))]
print("INFO: dataset length after function processing:", len(dataset))

dataset=dataset[dataset['code_string'].apply(lambda x: discard_contract_or_lib(x))]
print("INFO: dataset discarding contracts and libs:", len(dataset))

dataset['comments']=dataset['comments'].apply(lambda x: strip_comment(x))
print("INFO: stripped comment!")
 
    

INFO: dataset length after marker processing: 12332748
INFO: dataset length after function processing: 9901914
INFO: dataset discarding contracts and libs: 8440378
INFO: stripped comment!


In [5]:
# Drop duplicate on code value
dataset = dataset.drop_duplicates(subset=['code_string'], keep='first')
print("INFO: dataset length after dropping duplicates:", len(dataset))


INFO: dataset length after dropping duplicates: 739927


In [6]:
dataset

Unnamed: 0,file_name,comments,code_string
2,// SPDX-License-Identifier: MIT\npragma solidi...,Withdraw ether from this contract (Callable by...,function withdraw() onlyOwner public {\n ...
3,// SPDX-License-Identifier: MIT\npragma solidi...,"_setTokenURI(newTokenId, Strings.toString(newT...",function steamDumplings(uint256 numDumplin...
9,// SPDX-License-Identifier: MIT\npragma solidi...,See {IERC721-balanceOf}./,function balanceOf(address owner) public v...
10,// SPDX-License-Identifier: MIT\npragma solidi...,See {IERC721-ownerOf}./,function ownerOf(uint256 tokenId) public v...
11,// SPDX-License-Identifier: MIT\npragma solidi...,See {IERC721Metadata-name}./,function name() public view override retur...
...,...,...,...
17211114,./partial_match/77/0xa846788E1D9aB3F90f8bb9b1B...,Upgrades target newTarget New target newTarg...,"function upgradeTarget(address newTarget, ..."
17211115,./partial_match/77/0xa846788E1D9aB3F90f8bb9b1B...,Performs a delegatecall to the contract implem...,function() external payable {\n add...
17211120,./partial_match/77/0xa846788E1D9aB3F90f8bb9b1B...,Notice period before activation preparation st...,function getNoticePeriod() external return...
17211125,./partial_match/77/0xa846788E1D9aB3F90f8bb9b1B...,Checks that contract is ready for upgrade retu...,function isReadyForUpgrade() external retu...


In [6]:
hf_dataset = Dataset.from_pandas(dataset)
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=100)
test_valid = hf_dataset['test'].train_test_split(test_size=0.5, seed=100)

Hdataset = DatasetDict({
            'train': hf_dataset['train'],
            'test': test_valid['test'],
            'valid': test_valid['train']
            })

In [9]:
Hdataset

DatasetDict({
    train: Dataset({
        features: ['file_name', 'comments', 'code_string', '__index_level_0__'],
        num_rows: 591941
    })
    test: Dataset({
        features: ['file_name', 'comments', 'code_string', '__index_level_0__'],
        num_rows: 73993
    })
    valid: Dataset({
        features: ['file_name', 'comments', 'code_string', '__index_level_0__'],
        num_rows: 73993
    })
})

In [10]:
import os 
Hdataset.push_to_hub("Pipper/SolFuncs",  token=os.environ.get("HF_TOKEN"), max_shard_size="1GB")

Pushing split train to the Hub.
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:02<00:00, 13.30ba/s]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:06<00:00,  5.53ba/s]74s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:06<00:00,  5.60ba/s]71s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:06<00:00,  5.92ba/s]94s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:05<00:00,  6.18ba/s]16s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:05<00:00,  6.18ba/s]18s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:06<00:00,  5.93ba/s]23s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:06<00:00,  5.54ba/s]16s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:06<00:00,  5.55ba/s].03s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:06<00:00,  5.29ba/s].95s/it]
Creating parquet from Arrow format: 100%|██████████| 37/37 [00:

In [None]:
i = 6
print(dataset.comments.iloc[i])
print('-'*100)
print(dataset.code_string.iloc[i])

In [7]:
dataset.to_pickle('./filtered_comment_code_sol.pkl')