In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# nltk.download('stopwords')


In [19]:
# Read the labelled smart-contract dataset, then show the first rows followed
# by the per-column count of null entries.
df = pd.read_csv("SC_Vuln_8label.csv")
print(df.head(), df.isnull().sum(), sep="\n")  # no missing values

   Unnamed: 0   filename                                               code  \
0           0  33790.sol  pragma solidity ^0.4.4;\n\ncontract Token {\n\...   
1           1  31454.sol  pragma solidity ^0.4.4;\n\ncontract Token {\n\...   
2           2  40744.sol  contract SendBalance {\n mapping (address => u...   
3           3  39290.sol  /**\n * Originally from https://github.com/Con...   
4           4  39358.sol  pragma solidity ^0.4.4;\n\nlibrary ArrayLib{\n...   

                        label  label_encoded  
0  ./Dataset/reentrancy (RE)/              5  
1  ./Dataset/reentrancy (RE)/              5  
2  ./Dataset/reentrancy (RE)/              5  
3  ./Dataset/reentrancy (RE)/              5  
4  ./Dataset/reentrancy (RE)/              5  
Unnamed: 0       0
filename         0
code             0
label            0
label_encoded    0
dtype: int64


In [62]:
# Tally how many rows carry each encoded label (most frequent first).
value_counts = df.loc[:, 'label_encoded'].value_counts()
print(value_counts)

label_encoded
5    1218
7    1199
4     590
0     406
3     366
6     312
1      97
2      97
Name: count, dtype: int64


In [65]:
# Show the raw source text stored in the row labelled 0.
print(df.loc[0, 'code'])

pragma solidity ^0.4.4;

contract Token {

    /// @return total amount of tokens
    function totalSupply(uint256) constant returns (uint256 supply) {}

    /// @param _owner The address from which the balance will be retrieved
    /// @return The balance
    function balanceOf(address _owner) constant returns (uint256 balance) {}

    /// @notice send `_value` token to `_to` from `msg.sender`
    /// @param _to The address of the recipient
    /// @param _value The amount of token to be transferred
    /// @return Whether the transfer was successful or not
    function transfer(address _to, uint256 _value) returns (bool success) {}

    /// @notice send `_value` token to `_to` from `_from` on the condition it is approved by `_from`
    /// @param _from The address of the sender
    /// @param _to The address of the recipient
    /// @param _value The amount of token to be transferred
    /// @return Whether the transfer was successful or not
    function transferFrom(address _from, a

In [31]:
# solidity_stopwords = [
#     "pragma", "interface", "contract", "function", "event", "modifier", "library", "using",
#     "string", "uint8", "uint256", "address", "mapping", "bool", "require", "return", "memory",
#     "storage", "public", "internal", "view", "constant", "constructor",
#     "_owner", "_balances", "_allowances", "_founder", "_marketing", "_who", "_burntAmount",
#     "_from", "_to", "_value", "_timestamp", "_bool", "msg.sender", "totalSupply",
#     "balanceOf", "transfer", "allowance", "approve", "transferFrom", "add", "sub", "mul", "div",
#     "mod", "changeFounder", "setMinter", "setFurnace", "freezeAccount", "solidity", "bytes32"
# ]

# A quoted literal in either quote style; backslash escapes (e.g. \" or \') do
# not terminate it, and it never runs across a line break.
_SOL_STRING_LITERAL = r'"(?:\\.|[^"\\\n])*"|\'(?:\\.|[^\'\\\n])*\''
_SOL_STRING_RE = re.compile(_SOL_STRING_LITERAL)
# Strings are listed first so that comment markers *inside* a literal
# (typically the "//" of a URL) are consumed as part of the string.
_SOL_COMMENT_OR_STRING_RE = re.compile(
    _SOL_STRING_LITERAL + r'|//[^\n]*|/\*[\s\S]*?\*/'
)


def _strip_comments(code):
    """Remove ``//`` and ``/* */`` comments while leaving string literals intact."""
    def _keep_strings(match):
        text = match.group(0)
        # A comment acts as a token separator, so it becomes a single space
        # (collapsed later) rather than gluing its neighbours together.
        return ' ' if text.startswith('/') else text

    return _SOL_COMMENT_OR_STRING_RE.sub(_keep_strings, code)


def clean_code(code):
    """Normalise Solidity source text ahead of tokenization.

    The steps run in this order:

    1. Strip single-line and multi-line comments. Strings and comments are
       scanned in one left-to-right pass, so a ``//`` inside a string literal
       (e.g. a URL) is not mistaken for a comment, and a ``//`` inside a block
       comment cannot swallow the closing ``*/``.
    2. Remove ``pragma solidity ...;`` and ``import ...;`` directives.
    3. Collapse every whitespace run to one space and trim both ends.
    4. Replace each string literal with a ``__STRING<idx>__`` placeholder,
       where ``idx`` is the literal's position among all literals found; a
       literal that occurs several times uses the index of its last
       occurrence everywhere.

    Parameters
    ----------
    code : str
        Raw Solidity source.

    Returns
    -------
    str
        The cleaned, single-line source text.
    """
    code = _strip_comments(code)
    # \b keeps the directives from matching in the middle of a longer word.
    code = re.sub(r"\bpragma solidity[^;]+;", "", code)
    code = re.sub(r"\bimport [^;]+;", "", code)
    code = re.sub(r"\s+", " ", code).strip()

    string_literals = _SOL_STRING_RE.findall(code)
    string_map = {s: f'__STRING{idx}__' for idx, s in enumerate(string_literals)}
    # Substitute match-by-match instead of str.replace so that a short literal
    # is never rewritten inside a different, longer literal.
    return _SOL_STRING_RE.sub(lambda m: string_map[m.group(0)], code)
    
# def clean_code(solidity_code):
#     cleaned_code = re.sub(r'//.*?$', '', solidity_code, flags=re.MULTILINE)
#     cleaned_code = re.sub(r'/\*.*?\*/', '', cleaned_code, flags=re.DOTALL)
#     cleaned_code = re.sub(r'[^a-zA-Z0-9\s+=\-*&|/<>!(){};.,]', '', cleaned_code)
#     cleaned_code = '\n'.join(line.strip().lower() for line in cleaned_code.splitlines() if line.strip())

#     stop_words = set(stopwords.words('english'))
#     tokens = [word for word in cleaned_code.split() if word not in stop_words]
#     tokens = [token for token in tokens if token not in solidity_stopwords]

#     return tokens


# Run the cleaner over every contract's source and preview the new column.
df['clean_code'] = df['code'].map(clean_code)
print(df['clean_code'])


0       contract Token { function totalSupply(uint256)...
1       contract Token { function totalSupply() consta...
2       contract SendBalance { mapping (address => uin...
3       contract MultiSigWallet { uint constant public...
4       library ArrayLib{ function findAddress(address...
                              ...                        
4280    contract MultiSigWallet { uint constant public...
4281    contract Token { function totalSupply() consta...
4282    contract Token { function totalSupply() consta...
4283    interface tokenRecipient { function receiveApp...
4284    contract Token { uint256 public totalSupply; f...
Name: clean_code, Length: 4285, dtype: object


In [21]:
# Tokenization splits the cleaned source into identifiers, numeric literals and
# operators. The pattern is compiled with re.VERBOSE, so whitespace and
# "#"-comments inside it are ignored. Alternatives are tried left to right:
# hex and exponent literals must come before the plain decimal forms, because
# otherwise "\d+" matches the leading digits first ("0x1F" -> "0", "x1F";
# "1e18" -> "1", "e18") and the longer forms are never reached.
solidity_token_pattern = r"""
    [A-Za-z_][A-Za-z_0-9]*         # identifiers and keywords
    |0x[0-9a-fA-F]+                # hexadecimal literals
    |\d+(?:\.\d+)?[eE][+-]?\d+     # exponent-notation literals
    |\d+\.\d+|\d+                  # plain decimal literals
    |==|!=|>=|<=|\+=|-=|\*=|/=     # comparison and compound assignment
    |=>|=<|>>|<<|\+\+|--           # arrows, shifts, increment/decrement
    |&&|\|\||!                     # boolean operators
    |[^\w\s]                       # any other single punctuation character
"""

tokenizer = RegexpTokenizer(solidity_token_pattern, flags=re.VERBOSE)

df['tokens'] = df['clean_code'].apply(tokenizer.tokenize)
print(df['tokens'][0])

['contract', 'Token', '{', 'function', 'totalSupply', '(', 'uint256', ')', 'constant', 'returns', '(', 'uint256', 'supply', ')', '{', '}', 'function', 'balanceOf', '(', 'address', '_owner', ')', 'constant', 'returns', '(', 'uint256', 'balance', ')', '{', '}', 'function', 'transfer', '(', 'address', '_to', ',', 'uint256', '_value', ')', 'returns', '(', 'bool', 'success', ')', '{', '}', 'function', 'transferFrom', '(', 'address', '_from', ',', 'address', '_to', ',', 'uint256', '_value', ')', 'returns', '(', 'bool', 'success', ')', '{', '}', 'function', 'approve', '(', 'address', '_spender', ',', 'uint256', '_value', ')', 'returns', '(', 'bool', 'success', ')', '{', '}', 'function', 'allowance', '(', 'address', '_owner', ',', 'address', '_spender', ')', 'constant', 'returns', '(', 'uint256', 'remaining', ')', '{', '}', 'event', 'Transfer', '(', 'address', 'indexed', '_from', ',', 'address', 'indexed', '_to', ',', 'uint256', '_value', ')', ';', 'event', 'Approval', '(', 'address', 'indexed

In [34]:
# Feature extraction: turn each contract's token list into TF-IDF features.
# The tokens are joined with single spaces so the vectorizer can take one
# string per contract.
df['token_string'] = df['tokens'].apply(' '.join)

# token_pattern=r"\S+" makes the vectorizer split on the spaces inserted above
# and therefore keep the tokens exactly as produced by the tokenizer. The
# default pattern only keeps runs of two or more word characters, which would
# silently drop every operator, bracket and one-character token.
# NOTE(review): the vectorizer is fitted on the whole frame before any
# train/test split, so held-out rows influence the vocabulary and IDF weights;
# consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=1000, token_pattern=r"\S+")
X_tfidf = vectorizer.fit_transform(df['token_string'])


In [30]:
# Encode the textual labels as integers (this overwrites the 'label_encoded'
# column that was read from the CSV).
encoder = LabelEncoder()
df['label_encoded'] = encoder.fit_transform(df['label'])

# Hold out 20% of the rows for testing. The classes are heavily imbalanced
# (from 1218 rows down to 97), so the split is stratified on the label to keep
# the class proportions the same in both partitions; random_state fixes the
# shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    df['label_encoded'],
    test_size=0.2,
    random_state=42,
    stratify=df['label_encoded'],
)