<a href="https://colab.research.google.com/github/VMT01/google-colab/blob/main/TF_IDF_%26_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/gdrive; the dataset path used by this notebook
# (see vul_dir) lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


# Import libraries

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Convolution2D, Dense, Dropout, LSTM, MaxPooling2D

# Define constants

In [None]:
# Location of the labelled dataset on the mounted Drive.
vul_dir = '/content/gdrive/MyDrive/Colab Notebooks/data/Compressed.csv'

# One entry per vulnerability class. In the visible part of this notebook only
# the length of this list is used (it fixes the width of the label vector).
vuls_name = [
    'Unprotected Suicide.csv',
    'Unchecked call return value.csv',
    'Ugradeable contract.csv',
    'Timestamp dependence.csv',
    'Outdated Solidity version.csv',
    'Leaking Ether to arbitrary address.csv',
    'Frozen Ether.csv',
    'Delegatecall Injection.csv',
    'Authentication through tx.origin.csv',
]
vuls_count = len(vuls_name)

# Bytecode sizing: LINES rows of TOKEN_PER_LINE tokens each.
LINES = 5
TOKEN_PER_LINE = 1100
TOKENS = LINES * TOKEN_PER_LINE
# Target character length of a bytecode string. The padding in read_csv adds
# 3 characters per token (' 00'), which suggests 3 characters per token with no
# leading separator on the first one -- TODO confirm against the CSV format.
LINES_LENGTH = 3 * TOKENS - 1

In [None]:
# Sanity check: number of vulnerability classes listed in vuls_name.
print(vuls_count)

9


# Define feature extraction modules

In [None]:
def tfidf_module(X):
    """Fit a TF-IDF vectorizer on ``X`` and return dense scores plus vocabulary.

    Args:
        X: Collection of raw text documents, as accepted by
            ``TfidfVectorizer.fit_transform``.

    Returns:
        A ``(scores, feature_names)`` tuple: the fitted TF-IDF matrix converted
        to a dense array with ``toarray()``, and the result of
        ``get_feature_names_out()`` for the same vectorizer.
    """
    vectorizer = TfidfVectorizer()
    dense_scores = vectorizer.fit_transform(X).toarray()
    feature_names = vectorizer.get_feature_names_out()
    return dense_scores, feature_names

# Preprocess

## Read CSV

In [None]:
def read_csv(random_state=None):
    """Load the labelled bytecode dataset and return it shuffled.

    Reads the ``BYTECODE`` and ``LABEL`` columns from ``vul_dir``, drops rows
    with missing values, truncates or pads each bytecode string towards
    ``LINES_LENGTH`` characters, and expands each label into an array of
    digits that is at least ``vuls_count`` long.

    Args:
        random_state: Optional seed forwarded to ``DataFrame.sample``. The
            default ``None`` keeps the shuffle unseeded (the previous
            behaviour); pass an integer to get the same row order, and hence
            the same train/val/test split, on every run.

    Returns:
        pandas.DataFrame: the shuffled rows with a fresh ``0..n-1`` index.
    """
    def format_bytecode(bytecode):
        # Too long: cut to the target length.
        if len(bytecode) > LINES_LENGTH:
            return bytecode[:LINES_LENGTH]
        # Too short: append ' 00' tokens (3 characters each) to fill the gap.
        missing_tokens = (LINES_LENGTH - len(bytecode)) // 3
        return bytecode + ' 00' * missing_tokens

    def format_label(label):
        # pandas parses an integer column that contains NaN as float64, and
        # dropna() does not restore the integer dtype. str(3.0) is '3.0', whose
        # '.' would make int() raise, so collapse whole floats to int first.
        if isinstance(label, float) and label.is_integer():
            label = int(label)
        # Left-pad with zeros so every label has one digit per vulnerability.
        return np.array([int(digit) for digit in str(label).zfill(vuls_count)])

    df_vul = pd.read_csv(vul_dir, usecols=['BYTECODE', 'LABEL']).dropna()
    df_vul['BYTECODE'] = df_vul['BYTECODE'].apply(format_bytecode)
    df_vul['LABEL'] = df_vul['LABEL'].apply(format_label)

    # frac=1 returns every row in random order, i.e. a shuffle.
    shuffled = df_vul.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

In [None]:
# Load the dataset: read_csv pads/truncates the bytecode, expands the labels
# and shuffles the rows.
df = read_csv()

## Preprocess dataframe

In [None]:
# TF-IDF over the bytecode strings; `features` is the fitted vocabulary.
X, features = tfidf_module(df['BYTECODE'].values)

In [None]:
# Show the TF-IDF vocabulary and its size.
print(features, len(features))

['00' '01' '02' '03' '04' '05' '06' '07' '08' '09' '0a' '0b' '0c' '0d'
 '0e' '0f' '0x' '10' '11' '12' '13' '14' '15' '16' '17' '18' '19' '1a'
 '1b' '1c' '1d' '1e' '1f' '20' '21' '22' '23' '24' '25' '26' '27' '28'
 '29' '2a' '2b' '2c' '2d' '2e' '2f' '30' '31' '32' '33' '34' '35' '36'
 '37' '38' '39' '3a' '3b' '3c' '3d' '3e' '3f' '40' '41' '42' '43' '44'
 '45' '46' '47' '48' '49' '4a' '4b' '4c' '4d' '4e' '4f' '50' '51' '52'
 '53' '54' '55' '56' '57' '58' '59' '5a' '5b' '5c' '5d' '5e' '5f' '60'
 '61' '62' '63' '64' '65' '66' '67' '68' '69' '6a' '6b' '6c' '6d' '6e'
 '6f' '70' '71' '72' '73' '74' '75' '76' '77' '78' '79' '7a' '7b' '7c'
 '7d' '7e' '7f' '80' '81' '82' '83' '84' '85' '86' '87' '88' '89' '8a'
 '8b' '8c' '8d' '8e' '8f' '90' '91' '92' '93' '94' '95' '96' '97' '98'
 '99' '9a' '9b' '9c' '9d' '9e' '9f' '__' 'a0' 'a1' 'a2' 'a3' 'a4' 'a5'
 'a6' 'a7' 'a8' 'a9' 'aa' 'ab' 'ac' 'ad' 'ae' 'af' 'b0' 'b1' 'b2' 'b3'
 'b4' 'b5' 'b6' 'b7' 'b8' 'b9' 'ba' 'bb' 'bc' 'bd' 'be' 'bf' 'c0' 'c1'
 'c2' 

In [None]:
def preprocess(df):
    """Turn the prepared dataframe into model inputs and labels.

    TF-IDF scores are computed from ``df['BYTECODE']`` and scattered into a
    fixed 258-wide layout: slots 0-255 are indexed by the hexadecimal value of
    the vocabulary token, slot 256 holds the token ``'0x'`` and slot 257 the
    token ``'__'``. Slots whose token is absent from the vocabulary stay zero.

    The TF-IDF matrix is computed here instead of being read from the
    notebook-level ``X`` / ``features``. The calling cell rebinds ``X`` to the
    returned tensor, so reading the global made a second run of that cell fail.

    Args:
        df: DataFrame with a ``BYTECODE`` column of strings and a ``LABEL``
            column of equal-length digit arrays (as produced by ``read_csv``).

    Returns:
        A ``(X, y)`` tuple of tensors: ``X`` has shape ``(n, 1, 258)`` and
        ``y`` has shape ``(n, vuls_count)``.

    Raises:
        ValueError: if a vocabulary token other than ``'0x'`` / ``'__'`` is not
            valid hexadecimal.
        IndexError: if a token's hexadecimal value falls outside the layout.
    """
    n_slots = 258  # 256 byte values + '0x' + '__'

    def slot_of(token):
        # Two special vocabulary entries get dedicated trailing slots.
        if token == '0x':
            return 256
        if token == '__':
            return 257
        return int('0x' + token, base=16)

    tfidf, vocab = tfidf_module(df['BYTECODE'].values)

    # The token -> slot mapping is the same for every row, so build it once
    # and scatter all rows in a single vectorized assignment.
    slots = np.array([slot_of(token) for token in vocab], dtype=int)
    X_new = np.zeros((len(tfidf), 1, n_slots))
    X_new[:, 0, slots] = tfidf

    # Each LABEL entry is a 1-D digit array; stack them into (n, vuls_count).
    y = np.concatenate(df['LABEL'].values).reshape(-1, vuls_count)
    return tf.convert_to_tensor(X_new), tf.convert_to_tensor(y)

In [None]:
# NOTE: this rebinds the notebook-level name X (previously the TF-IDF matrix)
# to the model-input tensor returned by preprocess.
X, y = preprocess(df)

[[[9.99867232e-01 2.14276526e-03 3.90877608e-04 ... 1.17290992e-02
   0.00000000e+00 0.00000000e+00]]

 [[4.95006342e-01 1.17928280e-01 1.19814601e-02 ... 7.00332025e-01
   0.00000000e+00 0.00000000e+00]]

 [[9.99865999e-01 2.14234574e-03 3.90801080e-04 ... 1.19222496e-02
   0.00000000e+00 0.00000000e+00]]

 ...

 [[3.57965227e-01 1.34397225e-01 1.68550172e-02 ... 7.94480381e-01
   0.00000000e+00 0.00000000e+00]]

 [[3.58382826e-01 1.35254814e-01 1.68746801e-02 ... 7.94703936e-01
   0.00000000e+00 0.00000000e+00]]

 [[3.58420105e-01 1.35268883e-01 1.75796202e-02 ... 7.94786600e-01
   0.00000000e+00 0.00000000e+00]]]


## Divide data

In [None]:
def split_data(data, ratio):
    """Cut ``data`` into a leading and a trailing part.

    Args:
        data: Any sliceable sequence that supports ``len()``.
        ratio: Fraction of ``data`` that goes into the leading part; the cut
            position is ``int(len(data) * ratio)`` (truncated towards zero).

    Returns:
        A ``(head, tail)`` tuple where ``head`` holds the elements before the
        cut position and ``tail`` holds the rest.
    """
    cut = int(len(data) * ratio)
    head = data[:cut]
    tail = data[cut:]
    return head, tail

In [None]:
def divide_data(X, y, train_val_ratio=0.7, train_ratio=0.8):
    """Split features and labels into train, validation and test sets.

    The split is positional (no shuffling happens here): the leading
    ``train_val_ratio`` of the samples is kept for training and validation,
    the remainder becomes the test set. That leading part is then split
    again, with its first ``train_ratio`` used for training and the rest for
    validation. With the defaults this is roughly 56% / 14% / 30%.

    Args:
        X: Sliceable collection of samples.
        y: Sliceable collection of labels, aligned with ``X``.
        train_val_ratio: Fraction of all samples kept for train + validation.
        train_ratio: Fraction of the train + validation part used for training.

    Returns:
        ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``.

    Raises:
        ValueError: if ``X`` and ``y`` do not have the same length, since the
            positional split would then pair samples with the wrong labels.
    """
    if len(X) != len(y):
        raise ValueError(
            f'X and y must have the same length, got {len(X)} and {len(y)}'
        )

    X_train, X_test = split_data(X, train_val_ratio)
    y_train, y_test = split_data(y, train_val_ratio)

    X_train, X_val = split_data(X_train, train_ratio)
    y_train, y_val = split_data(y_train, train_ratio)

    return (X_train, y_train), (X_val, y_val), (X_test, y_test)


# Modeling

## LSTM

In [None]:
def LSTM_model(input_shape, output_shape):
    """Build and compile the stacked-LSTM classifier.

    Four LSTM layers (512, 256, 128, 64 units) with dropout in between are
    followed by two dense layers and an output layer of ``output_shape``
    units. The model summary is printed as a side effect.

    Args:
        input_shape: ``(timesteps, features)`` shape of a single sample,
            passed to the first LSTM layer.
        output_shape: Number of units in the output layer, one per label
            column.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, ``'acc'`` metric).
    """
    model = Sequential()

    # All but the last LSTM return the full sequence so the next LSTM layer
    # still receives 3-D input.
    model.add(LSTM(512, input_shape=input_shape, return_sequences=True, activation='relu'))
    model.add(Dropout(0.2))
    model.add(LSTM(256, return_sequences=True, activation='relu'))
    model.add(Dropout(0.2))
    model.add(LSTM(128, return_sequences=True, activation='relu'))
    model.add(Dropout(0.1))
    model.add(LSTM(64, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    # The labels appear to be multi-hot (read_csv emits one digit per
    # vulnerability class, so a sample may carry several flags) -- TODO confirm.
    # Sigmoid scores every class independently; softmax would force the
    # outputs to sum to 1 and could never predict two classes at once.
    model.add(Dense(output_shape, activation='sigmoid'))

    model.summary()
    # The loss was previously left as an incomplete `tf.` expression (a syntax
    # error). Binary cross-entropy is the per-class loss matching sigmoid.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

    return model


# Main

In [None]:
def main():
    """Train the LSTM model on the notebook-level ``X`` / ``y`` tensors.

    The data is split with ``divide_data``; the test part is not used here.
    The model is sized from the training tensors and fitted for 10 epochs
    with a batch size of 32, validating on the validation part.

    Returns:
        The history object returned by ``model.fit``.
    """
    train_set, val_set, _test_set = divide_data(X, y)
    train_inputs, train_labels = train_set

    # Per-sample input shape is (timesteps, features); the output width is
    # the number of label columns.
    sample_shape = (train_inputs.shape[1], train_inputs.shape[2])
    n_outputs = train_labels.shape[1]
    model = LSTM_model(sample_shape, n_outputs)

    return model.fit(
        train_inputs,
        train_labels,
        epochs=10,
        batch_size=32,
        validation_data=val_set,
    )

In [None]:
history = main()

# One figure per tracked quantity: (history key, figure title, y-axis label).
# Each figure overlays the training curve and its 'val_'-prefixed counterpart.
for key, title, ylabel in (
    ('acc', 'model accuracy', 'accuracy'),
    ('loss', 'model loss', 'loss'),
):
    plt.plot(history.history[key])
    plt.plot(history.history['val_' + key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()



Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_44 (LSTM)              (None, 1, 512)            1579008   
                                                                 
 dropout_44 (Dropout)        (None, 1, 512)            0         
                                                                 
 lstm_45 (LSTM)              (None, 1, 256)            787456    
                                                                 
 dropout_45 (Dropout)        (None, 1, 256)            0         
                                                                 
 lstm_46 (LSTM)              (None, 1, 128)            197120    
                                                                 
 dropout_46 (Dropout)        (None, 1, 128)            0         
                                                                 
 lstm_47 (LSTM)              (None, 64)              

KeyboardInterrupt: ignored