In [2]:
import solcx
import re
from collections import Counter
import numpy as np

# Install solcx if needed
solcx.install_solc(version="0.8.0")  # Adjust to the desired Solidity version

<Version('0.8.0')>

In [3]:
import solcx
import re
from collections import Counter
import numpy as np

# Install solcx if needed
solcx.install_solc(version="0.8.0")  # Adjust to the desired Solidity version

def extract_features_from_raw_code(source_code):
    """
    Extracts features from raw Solidity code text.
    """
    # Compile the Solidity code
    compiled_sol = solcx.compile_source(source_code, output_values=["abi", "bin", "ast"])
    
    # Replace '<stdin>:<ContractName>' with the contract's name dynamically
    contract_name = list(compiled_sol.keys())[0]
    bytecode = compiled_sol[contract_name]["bin"]
    bytecode_len = len(bytecode)
    
    # Bytecode character weights
    bytecode_chars = Counter(bytecode)
    total_chars = sum(bytecode_chars.values())
    char_weights = {f"Weight bytecode_character_{char}": bytecode_chars.get(char, 0) / total_chars for char in "0123456789abcdef"}
    
    # Opcode weights
    opcodes = re.findall(r"[A-Z]+", bytecode)  # Simplistic extraction of opcodes
    opcode_weights = Counter(opcodes)
    total_opcodes = sum(opcode_weights.values())
    opcode_weights_normalized = {f"Opcode weight {op}": opcode_weights[op] / total_opcodes for op in opcode_weights}
    
    # Bytecode entropy
    bytecode_entropy = -sum((count / total_chars) * np.log2(count / total_chars) for count in bytecode_chars.values())
    
    # AST features
    ast = compiled_sol[contract_name]["ast"]
    ast_len_nodes = len(ast["nodes"])
    
    # Combine features into a dictionary
    features = {
        "bytecode_len": bytecode_len,
        "bytecode_entropy": bytecode_entropy,
        "ast_len_nodes": ast_len_nodes,
        **char_weights,
        **opcode_weights_normalized,
    }
    
    return features


In [4]:

# Example usage
solidity_code = """
pragma solidity ^0.8.0;

contract SimpleStorage {
    uint256 private data;

    function set(uint256 x) public {
        data = x;
    }

    function get() public view returns (uint256) {
        return data;
    }
}
"""

features = extract_features_from_raw_code(solidity_code)
print(features)


{'bytecode_len': 670, 'bytecode_entropy': np.float64(3.4136176281522164), 'ast_len_nodes': 2, 'Weight bytecode_character_0': 0.26119402985074625, 'Weight bytecode_character_1': 0.06119402985074627, 'Weight bytecode_character_2': 0.03880597014925373, 'Weight bytecode_character_3': 0.041791044776119404, 'Weight bytecode_character_4': 0.03582089552238806, 'Weight bytecode_character_5': 0.14328358208955225, 'Weight bytecode_character_6': 0.14776119402985075, 'Weight bytecode_character_7': 0.022388059701492536, 'Weight bytecode_character_8': 0.06119402985074627, 'Weight bytecode_character_9': 0.04626865671641791, 'Weight bytecode_character_a': 0.007462686567164179, 'Weight bytecode_character_b': 0.04626865671641791, 'Weight bytecode_character_c': 0.022388059701492536, 'Weight bytecode_character_d': 0.020895522388059702, 'Weight bytecode_character_e': 0.013432835820895522, 'Weight bytecode_character_f': 0.029850746268656716}
