# Import libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
from sklearn.ensemble import IsolationForest
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D,  Flatten, Dense, Dropout

# Load and preprocess the data

In [3]:
file_path = './dataset/extract.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,gas,hash,input,nonce,value,block_number,block_hash,transaction_index,from_address,to_address,...,receipt_gas_used,receipt_contract_address,receipt_status,receipt_effective_gas_price,transaction_type,max_fee_per_gas,max_priority_fee_per_gas,block_timestamp,date,last_modified
0,373326,0xa92d47142f13f96991580fbe3b00f90d09e938ce2f5c...,0x153a395710e809d7003de1253fa7935a41d6291cf071...,13745,193269.0,18246421,0xdd8d3a5872f30406ae4fd2473936af9fe39dbad30510...,0,0x202bb2fab1e35d940fde99b214ba49dafbcef62a,0x00fc00900000002c00be4ef8f49c000211000c43,...,248884,,1,6657741714,2,6657741714,0,30/09/2023 05:25,2023-09-30,01/10/2023 00:20
1,246120,0xfd889900ac45be0d6a59e4fd35b58c3f2272aac4fecf...,0x55e4b7be00000000000000000000000040fd72257597...,2105,0.0,18246421,0xdd8d3a5872f30406ae4fd2473936af9fe39dbad30510...,1,0xdb0937f8a4242360c2e989f139e105917ac7458b,0x00000000009726632680fb29d3f7a9734e3010e2,...,175313,,1,8157741714,2,8840729391,1500000000,30/09/2023 05:25,2023-09-30,01/10/2023 00:20
2,331041,0x309bee3f88a638b5caf2a3f047a10e7764161732ed10...,0x3593564c000000000000000000000000000000000000...,502,0.0,18246421,0xdd8d3a5872f30406ae4fd2473936af9fe39dbad30510...,2,0x0b22861932d2845db1543319b8512604f777c761,0x3fc91a3afd70395cd496c647d5a6cc9d4b2b7fad,...,254854,,1,6697958290,2,7142358390,40216576,30/09/2023 05:25,2023-09-30,01/10/2023 00:20
3,46506,0x93060c31318c3af3cf1f72defe5cd1d9d59adf6141d1...,0x095ea7b3000000000000000000000000881d40237659...,143,0.0,18246421,0xdd8d3a5872f30406ae4fd2473936af9fe39dbad30510...,3,0x63efb91c7a727beaa22466b855f9a86e3be01575,0xc944e90c64b2c07662a292be6244bdf05cda44a7,...,46506,,1,6957741714,2,10308286565,300000000,30/09/2023 05:25,2023-09-30,01/10/2023 00:20
4,460262,0xc182722b111eeef7199040c97c2de334ff0123e000ee...,0x5f575529000000000000000000000000000000000000...,144,0.0,18246421,0xdd8d3a5872f30406ae4fd2473936af9fe39dbad30510...,4,0x63efb91c7a727beaa22466b855f9a86e3be01575,0x881d40237659c251811cec9c364ef91dc08d300c,...,269452,,1,6957741714,2,10308286565,300000000,30/09/2023 05:25,2023-09-30,01/10/2023 00:20


# Columns

In [4]:
data.columns

Index(['gas', 'hash', 'input', 'nonce', 'value', 'block_number', 'block_hash',
       'transaction_index', 'from_address', 'to_address', 'gas_price',
       'receipt_cumulative_gas_used', 'receipt_gas_used',
       'receipt_contract_address', 'receipt_status',
       'receipt_effective_gas_price', 'transaction_type', 'max_fee_per_gas',
       'max_priority_fee_per_gas', 'block_timestamp', 'date', 'last_modified'],
      dtype='object')

# Data preprocessing

In [5]:
data_cleaned = data.drop(columns=['hash', 'input', 'block_hash', 'from_address', 'to_address', 'receipt_contract_address', 'last_modified'])
data_cleaned.fillna(0, inplace=True)

# Label Encoding

In [6]:
label_encoders = {}
categorical_columns = ['receipt_status']

for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    data_cleaned[column] = label_encoders[column].fit_transform(data_cleaned[column])

numerical_columns = data_cleaned.select_dtypes(include=[np.number]).columns.tolist()
scaler = StandardScaler()
data_cleaned[numerical_columns] = scaler.fit_transform(data_cleaned[numerical_columns])

# Initial anomaly detection using Isolation Forest

In [7]:
clf = IsolationForest(contamination='auto', random_state=42)
X = data_cleaned.drop(columns=['block_timestamp', 'date'])
clf.fit(X)
data_cleaned['anomaly'] = clf.predict(X)
data_cleaned['anomaly'] = data_cleaned['anomaly'].map({1: 'normal', -1: 'anomaly'})

# Prepare data for CNN

In [8]:
time_steps = 1
features = X.shape[1]
X_cnn = X.values.reshape(X.shape[0], time_steps, features)
y_cnn = (data_cleaned['anomaly'] == 'anomaly').astype(int).values

# Define CNN model

In [10]:
model = Sequential([
    Conv1D(filters=64, kernel_size=1, activation='relu', input_shape=(time_steps, features)),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compiling

In [11]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train CNN model

In [12]:
model.fit(X_cnn, y_cnn, epochs=50, batch_size=32, validation_split=0.2)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x248d2044f10>

# Evaluate the model

In [13]:
loss, accuracy = model.evaluate(X_cnn, y_cnn)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9850000143051147
