## Hugging Face Transformers for AutoTokenizer and CodeBERT

In [8]:
!pip install transformers



## Tree-sitter for Syntax-aware Tokenization

In [7]:
!pip install tree-sitter-language-pack

Collecting tree-sitter-language-pack
  Downloading tree_sitter_language_pack-0.9.0-cp39-abi3-manylinux2014_x86_64.whl.metadata (17 kB)
Collecting tree-sitter>=0.23.2 (from tree-sitter-language-pack)
  Downloading tree_sitter-0.25.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (10.0 kB)
Collecting tree-sitter-c-sharp>=0.23.1 (from tree-sitter-language-pack)
  Downloading tree_sitter_c_sharp-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting tree-sitter-embedded-template>=0.23.2 (from tree-sitter-language-pack)
  Downloading tree_sitter_embedded_template-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl.metadata (2.3 kB)
Collecting tree-sitter-yaml>=0.7.0 (from tree-sitter-language-pack)
  Downloading tree_sitter_yaml-0.7.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.

## Build Solidity parser

In [8]:
from tree_sitter_language_pack import get_language, get_parser

# Solidity grammar handle; in this notebook it is only referenced by the
# (currently commented-out) query cells.
solidity_lang = get_language("solidity")
# Shared parser instance used by every cell that calls parser.parse(...).
parser = get_parser("solidity")


## Parsing Solidity code into tokens

In [24]:
import os

def parsing_code(path_to_files):
    """Tokenize every regular file in a directory and print its leaf tokens.

    Args:
        path_to_files: Path to the directory holding the source files; a
            trailing separator is optional. For backward compatibility, a
            path that is not itself a directory (for example a path to one
            file inside the folder) selects its parent directory instead.

    Prints the name of each directory entry, followed by one
    "<node type>: <text>" line per token for entries that are regular files.
    Returns None.

    Raises:
        FileNotFoundError: If the resolved directory does not exist.
    """
    # os.path.dirname() only yields the folder itself when the argument ends
    # with a separator: "contracts" would become "" and "a/b" would become "a".
    # Use the path as given whenever it already names a directory.
    if os.path.isdir(path_to_files):
        dir_path = path_to_files
    else:
        dir_path = os.path.dirname(path_to_files) or os.curdir
    for file_name in os.listdir(dir_path):
        print(file_name)
        file_path = os.path.join(dir_path, file_name)
        if not os.path.isfile(file_path):
            continue
        # Read the whole file first so the handle is closed before parsing.
        with open(file_path, 'r', encoding='utf-8') as f:
            code = f.read()
        # Node byte offsets index into this exact byte string.
        code_bytes = bytes(code, 'utf-8')
        tree = parser.parse(code_bytes)
        for token_type, token_text in extract_tokens(tree.root_node, code_bytes):
            print(f"{token_type}: {token_text}")



In [32]:
# Tokenize and print every file in the sample folder; the path is relative to
# the kernel's working directory. extract_tokens is defined in a later cell
# and must have been run before this call.
parsing_code("owaspTop10_25_SC/")

unchecked_external_calls.sol
pragma: pragma
solidity: solidity
^:  ^
solidity_version: 0.4.24
;: ;
contract: contract
identifier: Solidity_UncheckedExternalCall
{: {
address: address
public: public
identifier: owner
;: ;
constructor: constructor
(: (
): )
public: public
{: {
identifier: owner
=: =
identifier: msg
.: .
identifier: sender
;: ;
}: }
function: function
identifier: forward
(: (
address: address
identifier: callee
,: ,
bytes: bytes
identifier: _data
): )
public: public
{: {
identifier: require
(: (
identifier: callee
.: .
identifier: delegatecall
(: (
identifier: _data
): )
): )
;: ;
}: }
}: }
price_oracle_manipulation.sol
pragma: pragma
solidity: solidity
^:  ^
solidity_version: 0.8.0
;: ;
interface: interface
identifier: IPriceFeed
{: {
function: function
identifier: getLatestPrice
(: (
): )
external: external
view: view
returns: returns
(: (
int: int
): )
;: ;
}: }
contract: contract
identifier: PriceOracleManipulation
{: {
address: address
public: public
identifier: owne

## Extracting tokens from Solidity code

In [17]:
def extract_tokens(node, code):
    """Collect the leaf tokens beneath ``node`` in source order.

    Args:
        node: Root of the (sub)tree to walk. It must expose ``type``,
            ``child_count``, ``children``, ``start_byte`` and ``end_byte``.
        code: The UTF-8 encoded bytes the tree was parsed from; node byte
            offsets index into it.

    Returns:
        A list of ``(node_type, text)`` tuples, one per childless node,
        in depth-first, left-to-right order.
    """
    tokens = []
    # An explicit stack replaces recursion so deeply nested syntax trees
    # cannot exhaust Python's recursion limit, and no per-level temporary
    # lists are built and copied.
    pending = [node]
    while pending:
        current = pending.pop()
        if current.child_count == 0:
            token_text = code[current.start_byte:current.end_byte].decode('utf-8')
            tokens.append((current.type, token_text))
        else:
            # Push children reversed so the leftmost child is popped first.
            pending.extend(reversed(current.children))
    return tokens

In [33]:
# An example query for a potential reentrancy pattern.
# It captures member-style call expressions (@vulnerable_call, with the
# @object and @call parts) and assignment expressions (@state_change).
# NOTE(review): node names such as `property_identifier` look like they come
# from tree-sitter's JavaScript grammar — confirm they exist in the Solidity
# grammar before compiling this query.
reentrancy_query_string = """
(call_expression
  function: (member_expression
    property: (property_identifier) @call
    object: (identifier) @object)
) @vulnerable_call

(assignment_expression) @state_change
"""

In [34]:
#reentrancy_query = solidity_lang.query(reentrancy_query_string)

In [35]:
#solidity_lang.query(reentrancy_query)

In [96]:
# Parse one contract on its own so its tokens can be inspected in later cells.
# The context manager closes the handle, and the explicit encoding matches the
# UTF-8 used when the text is converted to bytes below.
with open("/content/owaspTop10_25_SC/dos.sol", "r", encoding="utf-8") as source_file:
    code = source_file.read()
# Node byte offsets returned by the parser index into this byte string.
code_bytes = bytes(code, 'utf-8')
tree = parser.parse(code_bytes)
tokens = extract_tokens(tree.root_node, code_bytes)

In [97]:

# Dump the raw (node type, text) pairs extracted from dos.sol.
print(tokens)

[('pragma', 'pragma'), ('solidity', 'solidity'), ('^', ' ^'), ('solidity_version', '0.8.24'), (';', ';'), ('contract', 'contract'), ('identifier', 'Solidity_DOS'), ('{', '{'), ('address', 'address'), ('public', 'public'), ('identifier', 'king'), (';', ';'), ('uint256', 'uint256'), ('public', 'public'), ('identifier', 'balance'), (';', ';'), ('function', 'function'), ('identifier', 'claimThrone'), ('(', '('), (')', ')'), ('external', 'external'), ('payable', 'payable'), ('{', '{'), ('identifier', 'require'), ('(', '('), ('identifier', 'msg'), ('.', '.'), ('identifier', 'value'), ('>', '>'), ('identifier', 'balance'), (',', ','), ('"', '"'), ('"', '"'), (')', ')'), (';', ';'), ('(', '('), ('bool', 'bool'), ('identifier', 'sent'), (',', ','), (')', ')'), ('=', '='), ('identifier', 'king'), ('.', '.'), ('identifier', 'call'), ('{', '{'), ('identifier', 'value'), (':', ':'), ('identifier', 'balance'), ('}', '}'), ('(', '('), ('"', '"'), ('"', '"'), (')', ')'), (';', ';'), ('identifier', 're

## Clean tokens

In [107]:
import re

def clean_tokens(unclean_tokens):
    """Filter ``(token_type, token_value)`` pairs down to the kept token texts.

    A token's text is kept when its type is ``identifier`` or one of a fixed
    set of keyword/type names, or when the text itself starts with ``uint``.
    Everything else is dropped.

    Args:
        unclean_tokens: Iterable of ``(token_type, token_value)`` pairs.

    Returns:
        A list of the kept token texts, in their original order.
    """
    kept_types = (
        "identifier",
        "mapping", "address", "public", "private", "internal",
        "constructor", "function", "contract", "payable", "external", "bool",
    )
    return [
        text
        for kind, text in unclean_tokens
        if kind in kept_types or text.startswith("uint")
    ]

In [108]:
# Show the filtered token texts for the tokens extracted above.
print(clean_tokens(tokens))

['contract', 'Solidity_DOS', 'address', 'public', 'king', 'uint256', 'public', 'balance', 'function', 'claimThrone', 'external', 'payable', 'require', 'msg', 'value', 'balance', 'bool', 'sent', 'king', 'call', 'value', 'balance', 'require', 'sent', 'balance', 'msg', 'value', 'king', 'msg', 'sender']
