In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import solcx
from feature_extractor import extract_features_from_raw_code

In [11]:
import solcx

# Select the 0.8.0 compiler (the manually installed one) for py-solc-x, then
# echo the active version so the cell output confirms the selection.
solcx.set_solc_version(version="0.8.0")
print("Active Solidity version:", solcx.get_solc_version())

Active Solidity version: 0.8.0


In [12]:
# Read the secure and the vulnerable contract tables from their CSV files.
df_secure, df_vulnerable = (
    pd.read_csv(csv_path)
    for csv_path in (
        'bccc-vulscs-2023/BCCC-VolSCs-2023_Secure.csv',
        'bccc-vulscs-2023/BCCC-VolSCs-2023_Vulnerable.csv',
    )
)

# Stack both tables into one frame; ignore_index=True renumbers the rows so
# the combined frame does not carry duplicate index labels.
df_combined = pd.concat([df_secure, df_vulnerable], ignore_index=True)

# Report how many rows fall under each value of the 'label' column.
print("Class Distribution in Entire Dataset:")
print(df_combined['label'].value_counts())


Class Distribution in Entire Dataset:
label
0    26915
1     9756
Name: count, dtype: int64


In [13]:
# Work on `df` from here on. This is an alias of df_combined, not a copy.
df = df_combined

# Preview the first rows. The preview must be the cell's LAST expression:
# a bare expression earlier in a cell is evaluated and silently discarded,
# so nothing would be displayed.
df.head()

In [14]:
# Feature set: the bytecode/AST summary columns plus every column whose name
# contains 'Weight bytecode_character_' (the per-character weight columns).
relevant_columns = ['bytecode_len', 'bytecode_entropy', 'ast_len_nodes']
relevant_columns.extend(
    name for name in df.columns if 'Weight bytecode_character_' in name
)

# Feature matrix and target labels.
X, y = df[relevant_columns], df['label']

# Hold out 20% of the rows for testing. stratify=y preserves the class ratio
# in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [18]:
# Search space handed to GridSearchCV: every combination of the values below
# is tried (2 * 3 * 2 * 2 * 1 = 24 candidate random-forest configurations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt'],
)

In [19]:
# Exhaustive search over `param_grid` using a seeded random forest.
# NOTE(review): the class-distribution output earlier in this notebook shows
# an imbalanced dataset; selecting by plain accuracy may favour the majority
# class. Consider a recall/F1-oriented scorer if the minority class matters.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=2,  # 2-fold cross-validation (the old comment wrongly said 3-fold)
    scoring='accuracy',
    n_jobs=-1,  # run the fits in parallel on all available cores
    verbose=1,
)

In [21]:
# Run the grid search on the training split (fit() returns the search object
# itself) and keep the best-scoring estimator as `model` for later cells.
model = grid_search.fit(X_train, y_train).best_estimator_

Fitting 2 folds for each of 24 candidates, totalling 48 fits


In [23]:
# Score the selected model on the held-out test split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Each report is printed as a heading line followed by the metric output.
print("Accuracy:", accuracy)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.7597818677573279

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.96      0.85      5384
           1       0.65      0.21      0.32      1951

    accuracy                           0.76      7335
   macro avg       0.71      0.59      0.59      7335
weighted avg       0.74      0.76      0.71      7335


Confusion Matrix:
[[5157  227]
 [1535  416]]


In [None]:
def predict_solidity_code(raw_code):
    """Classify one piece of Solidity source with the trained model.

    Features are computed by ``extract_features_from_raw_code`` and aligned to
    the columns of ``X_train`` (the frame the model was fitted on): training
    columns the extractor did not produce are filled with 0, columns unknown
    to the model are dropped, and the training column order is restored.

    Relies on the notebook-level names ``model``, ``X_train`` and
    ``extract_features_from_raw_code``.

    Parameters
    ----------
    raw_code
        Solidity source passed unchanged to the feature extractor
        (presumably a ``str`` -- confirm against ``feature_extractor``).

    Returns
    -------
    tuple
        ``(prediction, probabilities)``: the first element of
        ``model.predict`` and of ``model.predict_proba`` for this sample.

    Warns
    -----
    UserWarning
        When one or more training features are absent from the extractor
        output. They are still defaulted to 0, but the prediction may then be
        unreliable, so the gap is surfaced instead of being hidden.
    """
    import warnings

    # Extract features from the raw Solidity code. The result is wrapped in a
    # one-row frame (assumes a mapping of feature name -> value; TODO confirm).
    features = extract_features_from_raw_code(raw_code)
    features_df = pd.DataFrame([features])

    expected_columns = X_train.columns
    missing = [col for col in expected_columns if col not in features_df.columns]
    if missing:
        shown = missing[:10]
        suffix = " ..." if len(missing) > len(shown) else ""
        warnings.warn(
            f"{len(missing)} of {len(expected_columns)} training features were not "
            f"produced by the feature extractor and default to 0: {shown}{suffix}",
            UserWarning,
            stacklevel=2,
        )

    # One reindex adds the missing columns (as 0), drops unexpected ones and
    # restores the training column order, without per-column inserts.
    features_df = features_df.reindex(columns=expected_columns, fill_value=0)

    # Predict
    prediction = model.predict(features_df)
    prediction_proba = model.predict_proba(features_df)
    return prediction[0], prediction_proba[0]


In [11]:
# Sample contract used to demo predict_solidity_code: one private uint256
# with a public setter and a public view getter. The literal is passed to the
# feature extractor as-is, so its text (including whitespace) is runtime data.
raw_code_example = """
pragma solidity ^0.8.0;

contract Example {
    uint256 private value;

    function setValue(uint256 _value) public {
        value = _value;
    }

    function getValue() public view returns (uint256) {
        return value;
    }
}
"""

In [12]:
# Classify the sample contract; `pred` is the predicted label and `proba`
# the per-class probabilities returned alongside it.
pred, proba = predict_solidity_code(raw_code=raw_code_example)

print("Prediction (0 = Secure, 1 = Vulnerable):", pred)
print("Prediction Probability:", proba)

Prediction (0 = Secure, 1 = Vulnerable): 0
Prediction Probability: [0.64 0.36]
