In [1]:
import os
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout
import pandas  as df

# Root directory of the dataset: one sub-folder per label, each holding the
# Solidity (.sol) source files that belong to that label.
data_directory = "dataset"

# X collects the raw text of every .sol file and y the name of the folder it
# was found in; the two lists stay index-aligned.
X = []
y = []

# os.listdir() returns entries in arbitrary order, so both directory levels are
# sorted to make the sample order identical on every run and every machine.
for folder in sorted(os.listdir(data_directory)):
    folder_path = os.path.join(data_directory, folder)

    # Ignore anything at the top level that is not a label folder
    if not os.path.isdir(folder_path):
        continue

    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)

        # Only regular files with a .sol extension count as samples
        if not (file_name.endswith(".sol") and os.path.isfile(file_path)):
            continue

        # Decode explicitly as UTF-8 rather than with the platform's locale
        # default, so the same bytes yield the same text everywhere.
        with open(file_path, "r", encoding="utf-8") as file:
            sol_content = file.read()

        # The folder name is used as the label of the sample
        X.append(sol_content)
        y.append(folder)


2024-07-26 04:31:46.249524: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import pandas as pd 
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import factor_cmap
from bokeh.palettes import Category20

# Route Bokeh output to the notebook so that show() renders inline
output_notebook()

# Pair every sample with its label in a DataFrame so labels can be counted
data = pd.DataFrame({'X': X, 'y': y})

# One row per label together with its number of samples
label_counts = data['y'].value_counts().reset_index()
label_counts.columns = ['label', 'count']

# Plain list of label names, shared by the categorical axis and the colour mapper
labels = label_counts['label'].tolist()

# Category20 only provides palettes for 3 to 20 colours, so indexing it with any
# other number of labels raises KeyError. Outside that range, cycle through the
# 20-colour palette instead.
n_labels = len(labels)
if n_labels in Category20:
    palette = Category20[n_labels]
else:
    base_colors = Category20[20]
    palette = [base_colors[i % len(base_colors)] for i in range(n_labels)]

# Figure with one categorical x position per label and no toolbar
p = figure(x_range=labels,
           height=400,
           width=600,
           title="Vulnerabilities Samples Distribution",
           toolbar_location=None,
           tools="")

# One vertical bar per label, coloured by label, without a legend
p.vbar(x='label',
       top='count',
       width=0.9,
       source=label_counts,
       line_color='white',
       fill_color=factor_cmap('label', palette=palette, factors=labels))

# Cosmetics: no vertical grid lines, bars start at zero, tilted x tick labels
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.xaxis.axis_label = "Vulnerability Types"
p.yaxis.axis_label = "No. of samples"
p.xaxis.major_label_orientation = 1.2

# Render the plot
show(p)


In [3]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import factor_cmap
from bokeh.palettes import Category20
from bokeh.models import ColumnDataSource

# Route Bokeh output to the notebook so that show() renders inline
output_notebook()

# Pair every sample with its label in a DataFrame so labels can be counted
data = pd.DataFrame({'X': X, 'y': y})

# One row per label together with its number of samples
label_counts = data['y'].value_counts().reset_index()
label_counts.columns = ['label', 'count']

# Wrap the counts in a ColumnDataSource for the glyph below
source = ColumnDataSource(label_counts)

# Plain list of label names, shared by the categorical axis and the colour mapper
labels = label_counts['label'].tolist()

# Category20 only provides palettes for 3 to 20 colours, so indexing it with any
# other number of labels raises KeyError. Outside that range, cycle through the
# 20-colour palette instead.
n_labels = len(labels)
if n_labels in Category20:
    palette = Category20[n_labels]
else:
    base_colors = Category20[20]
    palette = [base_colors[i % len(base_colors)] for i in range(n_labels)]

# Figure with one categorical y position per label (horizontal bars), no toolbar
p = figure(y_range=labels,
           height=400,
           width=600,
           title="Vulnerabilities Samples Distribution",
           toolbar_location=None,
           tools="")

# One horizontal bar per label, coloured by label
p.hbar(y='label',
       right='count',
       height=0.9,
       source=source,
       line_color='white',
       fill_color=factor_cmap('label', palette=palette, factors=labels))

# Cosmetics: no horizontal grid lines, bars start at zero
p.ygrid.grid_line_color = None
p.x_range.start = 0
p.yaxis.axis_label = "Vulnerability Types"
p.xaxis.axis_label = "No. of Samples"

# Render the plot
show(p)


### SAVE ORIGINAL DATASET TO JSON

In [4]:
import json

# Destination file for the dataset as loaded (before any resampling)
json_file_path = "Benchmark_Solidity_Imbalanced.json"

# One {"X": source text, "y": label} record per sample, in the order of X and y
data = [{"X": x_value, "y": y_value} for x_value, y_value in zip(X, y)]

# Serialize all records as a single JSON array
with open(json_file_path, "w") as json_file:
    json.dump(data, json_file)

print("JSON file created successfully:", json_file_path)


JSON file created successfully: Benchmark_Solidity_Imbalanced.json


### BALANCE DATA BY RANDOM OVERSAMPLING AND SAVE IT

In [5]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
import json

# Root directory of the dataset: one sub-folder per label, each holding the
# Solidity (.sol) source files that belong to that label.
data_directory = "dataset"

# X collects the raw text of every .sol file and y the name of the folder it
# was found in; the two lists stay index-aligned.
X = []
y = []

# os.listdir() returns entries in arbitrary order, so both directory levels are
# sorted to make the sample order (and therefore the resampling result for a
# fixed random_state) identical on every run and every machine.
for folder in sorted(os.listdir(data_directory)):
    folder_path = os.path.join(data_directory, folder)

    # Ignore anything at the top level that is not a label folder
    if not os.path.isdir(folder_path):
        continue

    for file_name in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file_name)

        # Only regular files with a .sol extension count as samples
        if not (file_name.endswith(".sol") and os.path.isfile(file_path)):
            continue

        # Decode explicitly as UTF-8 rather than with the platform's locale default
        with open(file_path, "r", encoding="utf-8") as file:
            sol_content = file.read()

        # The folder name is used as the label of the sample
        X.append(sol_content)
        y.append(folder)

# Tabular view of the samples: one row per file
df = pd.DataFrame({'text': X, 'label': y})

# Drop documents that are empty or contain only whitespace
df = df[df['text'].str.strip().astype(bool)]

# Balance the classes with RandomOverSampler, which repeats existing rows and
# can therefore be applied to the raw text directly. fit_resample() expects a
# 2-D input, hence the reshape of the text column into a single-column array.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(df['text'].values.reshape(-1, 1), df['label'])

# Flatten the single-column array back to 1-D and pair it with the labels
resampled_df = pd.DataFrame({'text': X_resampled.ravel(), 'label': y_resampled})

# Destination file for the balanced dataset
json_file_path = "Benchmark_Solidity_Balanced.json"

# Build the {"X": ..., "y": ...} records in one call instead of iterating over
# the rows with iterrows()
resampled_data = (
    resampled_df
    .rename(columns={'text': 'X', 'label': 'y'})
    .to_dict(orient='records')
)

# Serialize all records as a single JSON array
with open(json_file_path, "w") as json_file:
    json.dump(resampled_data, json_file)

print("JSON file created successfully:", json_file_path)


JSON file created successfully: Benchmark_Solidity_Balanced.json


In [6]:
# Display the resampled label Series (one label per resampled sample)
y_resampled

0         RE1
1         RE1
2         RE1
3         RE1
4         RE1
         ... 
13555    UpS1
13556    UpS1
13557    UpS1
13558    UpS1
13559    UpS1
Name: label, Length: 13560, dtype: object

In [7]:
import pandas as pd
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import factor_cmap
from bokeh.palettes import Category20
from bokeh.models import ColumnDataSource

# Route Bokeh output to the notebook so that show() renders inline
output_notebook()

# Put the resampled labels in a DataFrame so they can be counted
data = pd.DataFrame({'y': y_resampled})

# One row per label together with its number of samples
label_counts = data['y'].value_counts().reset_index()
label_counts.columns = ['label', 'count']

# Wrap the counts in a ColumnDataSource for the glyph below
source = ColumnDataSource(label_counts)

# Plain list of label names, shared by the categorical axis and the colour mapper
labels = label_counts['label'].tolist()

# Category20 only provides palettes for 3 to 20 colours, so indexing it with any
# other number of labels raises KeyError. Outside that range, cycle through the
# 20-colour palette instead.
n_labels = len(labels)
if n_labels in Category20:
    palette = Category20[n_labels]
else:
    base_colors = Category20[20]
    palette = [base_colors[i % len(base_colors)] for i in range(n_labels)]

# Figure with one categorical y position per label (horizontal bars), no toolbar
p = figure(y_range=labels,
           height=400,
           width=600,
           title="Vulnerabilities Samples Distribution",
           toolbar_location=None,
           tools="")

# One horizontal bar per label, coloured by label
p.hbar(y='label',
       right='count',
       height=0.9,
       source=source,
       line_color='white',
       fill_color=factor_cmap('label', palette=palette, factors=labels))

# Cosmetics: no horizontal grid lines, bars start at zero
p.ygrid.grid_line_color = None
p.x_range.start = 0
p.yaxis.axis_label = "Vulnerability Types"
p.xaxis.axis_label = "No. of Samples"

# Render the plot
show(p)
