In [1]:
!pip install xgboost



In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier  # Make sure xgboost is installed
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import pickle

In [3]:
# -------------------------------
# 1. Load and Combine the Datasets
# -------------------------------
df1 = pd.read_csv("/kaggle/input/ethereum-transactions-for-fraud-detection/first_order_df.csv")
# df2 = pd.read_csv("/kaggle/input/ethereum-transactions-for-fraud-detection/second_order_df.csv")

# Drop the "Unnamed: 0" column if it exists (usually an extra index column)
for df in [df1]:
    if 'Unnamed: 0' in df.columns:
        df.drop('Unnamed: 0', axis=1, inplace=True)

# Combine datasets into one DataFrame
# df = pd.concat([df1, df2], ignore_index=True)
df = df1

In [4]:
# -------------------------------
# 2. Preprocess the Data: Convert Unix Timestamp and Extract Time Features
# -------------------------------
# Convert the Unix TimeStamp (e.g., 1529873859) to datetime
df['TimeStamp'] = pd.to_datetime(df['TimeStamp'], unit='s')

# Extract additional time-based features
df['hour'] = df['TimeStamp'].dt.hour
df['day'] = df['TimeStamp'].dt.day
df['month'] = df['TimeStamp'].dt.month
df['year'] = df['TimeStamp'].dt.year

In [5]:
df.head()

Unnamed: 0,TxHash,BlockHeight,TimeStamp,From,To,Value,isError,hour,day,month,year
0,0xaca3850ba0080cf47b47f80e46da452f61bcbb5470d3...,5848095,2018-06-24 20:57:39,0x16f209b5332a1b4fa5bf19497ca40154c5db2f85,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0.5,0,20,24,6,2018
1,0x95681862f9778e49caecf603dd911d6ed57f7799d89d...,5848181,2018-06-24 21:18:24,0xe7e07e44ee315b5f2d076340b2b7a5cc9a4ee57b,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0.00102,0,21,24,6,2018
2,0x716ae3961b50186a0bbc272cfcc4555662f7fe33550f...,5848716,2018-06-24 23:33:12,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0xe892875b87b94c44edf0e91ee9f49d0525fadd83,0.50039,0,23,24,6,2018
3,0xf397197b800d6cc055a4db265b5e9df3dd2aa745c813...,5849038,2018-06-25 00:48:04,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0.8178,0,0,25,6,2018
4,0x7f8086011a32f128dba57fe06fc5f4a181d2f5401e5a...,5849437,2018-06-25 02:19:04,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0xe892875b87b94c44edf0e91ee9f49d0525fadd83,0.817506,0,2,25,6,2018


In [6]:
# -------------------------------
# 3. Prepare Features and Target
# -------------------------------
# Define feature columns and the target column
features = ['BlockHeight', 'Value', 'hour', 'day', 'month', 'year', 'From', 'To']
target = 'isError'

X = df[features]
y = df[target]

In [7]:
# -------------------------------
# 4. Split the Data into Training and Testing Sets
# -------------------------------
# For example, use 90% of data for training and 10% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [8]:
# -------------------------------
# 5. Build Pipelines for Different Models and Their Ensembles
# -------------------------------

# Define the columns used in the model (same as in Step 4)
# Numeric: BlockHeight, Value, hour, day, month, year
# Categorical: From, To
num_cols = ['BlockHeight', 'Value', 'hour', 'day', 'month', 'year']
cat_cols = ['From', 'To']

# Create a preprocessor: one-hot encode the categorical features, leave numeric features untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

In [9]:
# -------------------------------
# Create Individual Classifier Pipelines
# -------------------------------
pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline_mlp = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(random_state=42))
])

pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

In [10]:
# -------------------------------
# Create Ensemble Pipelines Using VotingClassifier
# The VotingClassifier here does soft voting.
# Note: In these ensemble pipelines, the preprocessor is outside VotingClassifier;
# the base classifiers here are not pipelines themselves but raw estimators.
# They will be trained on the preprocessed features.
# -------------------------------
ensemble_rf_mlp = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('mlp', MLPClassifier(random_state=42))
        ],
        voting='soft'
    ))
])

ensemble_rf_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
        ],
        voting='soft'
    ))
])

ensemble_mlp_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', VotingClassifier(
        estimators=[
            ('mlp', MLPClassifier(random_state=42)),
            ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
        ],
        voting='soft'
    ))
])

# ensemble_rf_mlp_xgb = Pipeline([
#     ('preprocessor', preprocessor),
#     ('classifier', VotingClassifier(
#         estimators=[
#             ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
#             ('mlp', MLPClassifier(random_state=42)),
#             ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
#         ],
#         voting='soft'
#     ))
# ])

In [11]:
# -------------------------------
# Create a dictionary to hold all pipelines for convenience.
# -------------------------------
pipelines = {
    # "RF": pipeline_rf,
    # "MLP": pipeline_mlp,
    # "XGB": pipeline_xgb,
    "RF+MLP": ensemble_rf_mlp,
    "RF+XGB": ensemble_rf_xgb,
    "MLP+XGB": ensemble_mlp_xgb,
    # "RF+MLP+XGB": ensemble_rf_mlp_xgb
}

In [12]:
# -------------------------------
# 6. Train All Pipelines
# -------------------------------
for name, pipe in pipelines.items():
    print(f"Training {name} model...")
    pipe.fit(X_train, y_train)

Training RF+MLP model...
Training RF+XGB model...
Training MLP+XGB model...


In [13]:
# -------------------------------
# 7. Evaluate All Pipelines on Test Data
# -------------------------------
for name, pipe in pipelines.items():
    print(f"\nClassification Report for {name}:")
    y_pred = pipe.predict(X_test)
    print(classification_report(y_test, y_pred))


Classification Report for RF+MLP:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     23886
           1       1.00      0.69      0.82      1612

    accuracy                           0.98     25498
   macro avg       0.99      0.84      0.90     25498
weighted avg       0.98      0.98      0.98     25498


Classification Report for RF+XGB:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     23886
           1       0.99      0.75      0.86      1612

    accuracy                           0.98     25498
   macro avg       0.99      0.88      0.92     25498
weighted avg       0.98      0.98      0.98     25498


Classification Report for MLP+XGB:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     23886
           1       1.00      0.62      0.77      1612

    accuracy                           0.98     25498
   macro avg       0.9

In [14]:
# -------------------------------
# 8. Save Each Model Pipeline
# -------------------------------
for name, pipe in pipelines.items():
    filename = f"pipeline_{name.replace('+', '_')}.pkl"  # e.g., pipeline_RF.pkl, pipeline_RF_MLP.pkl, etc.
    with open(filename, 'wb') as f:
        pickle.dump(pipe, f)
    print(f"{name} pipeline saved as {filename}.")

RF+MLP pipeline saved as pipeline_RF_MLP.pkl.
RF+XGB pipeline saved as pipeline_RF_XGB.pkl.
MLP+XGB pipeline saved as pipeline_MLP_XGB.pkl.
