In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb




In [2]:
metaverse_df = pd.read_csv('metaverse_transactions_dataset.csv')

# Drop unnecessary columns
drop_columns = ['timestamp', 'sending_address', 'receiving_address']
metaverse_df = metaverse_df.drop(drop_columns, axis=1)

In [3]:
object_columns = ['transaction_type', 'location_region', 'purchase_pattern', 'age_group']
for col in object_columns:
    le = LabelEncoder()
    metaverse_df[col] = le.fit_transform(metaverse_df[col])

    # Encode the target variable
le_target = LabelEncoder()
metaverse_df['anomaly'] = le_target.fit_transform(metaverse_df['anomaly'])


# Define features and target
X = metaverse_df.drop('anomaly', axis=1)
y = metaverse_df['anomaly']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
X_test.shape

(15720, 10)

In [4]:
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVC': SVC(decision_function_shape='ovo'),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'KNeighbors': KNeighborsClassifier(),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Bagging': BaggingClassifier(random_state=42),
    'XGB': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'),
    'LightGBM': lgb.LGBMClassifier(learning_rate=0.01, n_estimators=100, random_state=42)
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    print(f"{name} - Accuracy: {accuracy}")
    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Classification Report:\n{class_report}")



RandomForest - Accuracy: 1.0
Confusion Matrix:
[[ 1251     0     0]
 [    0 12848     0]
 [    0     0  1621]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1251
           1       1.00      1.00      1.00     12848
           2       1.00      1.00      1.00      1621

    accuracy                           1.00     15720
   macro avg       1.00      1.00      1.00     15720
weighted avg       1.00      1.00      1.00     15720



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression - Accuracy: 0.9881679389312977
Confusion Matrix:
[[ 1246     0     5]
 [    3 12761    84]
 [    8    86  1527]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1251
           1       0.99      0.99      0.99     12848
           2       0.94      0.94      0.94      1621

    accuracy                           0.99     15720
   macro avg       0.98      0.98      0.98     15720
weighted avg       0.99      0.99      0.99     15720

SVC - Accuracy: 0.994529262086514
Confusion Matrix:
[[ 1251     0     0]
 [    0 12762    86]
 [    0     0  1621]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1251
           1       1.00      0.99      1.00     12848
           2       0.95      1.00      0.97      1621

    accuracy                           0.99     15720
   macro avg       0.98      1.00      0.99     15720

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


XGB - Accuracy: 1.0
Confusion Matrix:
[[ 1251     0     0]
 [    0 12848     0]
 [    0     0  1621]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1251
           1       1.00      1.00      1.00     12848
           2       1.00      1.00      1.00      1621

    accuracy                           1.00     15720
   macro avg       1.00      1.00      1.00     15720
weighted avg       1.00      1.00      1.00     15720

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 483
[LightGBM] [Info] Number of data points in the train set: 62880, number of used features: 10
[LightGBM] [Info] Start training from score -2.484144
[LightGBM] [Info] Start training from score -0.216368
[LightGBM] [Info] Start training f