In [1]:
import os
import glob
from pyspark.sql import SparkSession

# Caminho da pasta de saída
output_folder = "../Scripts/output"

# Obter lista de arquivos CSV de blocos
block_files = glob.glob(os.path.join(output_folder, "block", "*.csv"))

# Obter lista de arquivos CSV de swaps
swap_files = glob.glob(os.path.join(output_folder, "swap", "*.csv"))

# Obter lista de arquivos CSV de transações
transaction_files = glob.glob(os.path.join(output_folder, "transaction", "*.csv"))

In [2]:
# Criar sessão Spark

import os
import sys

output_folder = "../Scripts/output"

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

spark = SparkSession.builder \
    .appName("Análise de Transações Ethereum") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

In [3]:
# Criar um DataFrame vazio para armazenar os dados dos blocos
combined_block_df = None

file_path = "../Scripts/output/block/block.csv"

# Ler o arquivo CSV
combined_block_df = spark.read.csv(file_path, header=True, inferSchema=True)

In [4]:
combined_block_df = combined_block_df.drop("Gas_Limit")
combined_block_df = combined_block_df.drop("Gas_Used")
combined_block_df = combined_block_df.drop("Timestamp_Block")

In [5]:
# Criar um DataFrame vazio para armazenar os dados dos blocos
combined_swap_df = None

file_path = "../Scripts/output/swap/swap_transactions.csv"

# Ler o arquivo CSV
combined_swap_df = spark.read.csv(file_path, header=True, inferSchema=True)

In [6]:
combined_swap_df = combined_swap_df.drop("Block_ID")

In [7]:
combined_transaction_df = None

file_path = "../Scripts/output/transaction/transactions.csv"

# Ler o arquivo CSV
combined_transaction_df = spark.read.csv(file_path, header=True, inferSchema=True)

In [8]:
combined_swap_df_cleaned = combined_swap_df.dropna(subset=['From_Token_Symbol', 'To_Token_Symbol', 'From_Token_Holders_Count', 'To_Token_Holders_Count', 'From_Token_Price', 'To_Token_Price'])

from pyspark.sql.functions import when, col

combined_swap_df_cleaned = combined_swap_df_cleaned.withColumn("From_Token_Price", when(col("From_Token_Price") == 0, float('nan')).otherwise(col("From_Token_Price")))
combined_swap_df_cleaned = combined_swap_df_cleaned.withColumn("To_Token_Price", when(col("To_Token_Price") == 0, float('nan')).otherwise(col("To_Token_Price")))
df = combined_block_df.join(combined_transaction_df, "Block_ID").join(combined_swap_df_cleaned, "Hash_Transaction")

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import pandas as pd
import numpy as np

# Função para pré-processar os dados e criar as janelas de entrada
def prepare_data(df, window_size):
    data = df.values
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i+window_size])
        y.append(data[i+window_size])
    return np.array(X), np.array(y)

# Filtrar os dados para a criptomoeda mais negociada (exemplo: Ethereum)
most_traded_coin = "WETH"
df_filtered = df.filter(col("To_Token_Symbol") == most_traded_coin)

# Converter a coluna de timestamp para formato de data
df_filtered = df_filtered.withColumn("Date", to_timestamp(col("Timestamp_Transaction")))

# Selecionar apenas as colunas relevantes (data e preço)
df_selected = df_filtered.select("Date", "To_Token_Price")

# Converter o DataFrame para Pandas para poder utilizar o TensorFlow
df_pandas = df_selected.toPandas()


# Pré-processamento dos dados
# Converter a coluna de data para o formato esperado pelo modelo LSTM
df_pandas["Date"] = pd.to_datetime(df_pandas["Date"])

# Normalizar os valores do preço
max_price = df_pandas["To_Token_Price"].max()
min_price = df_pandas["To_Token_Price"].min()
df_pandas["To_Token_Price"] = (df_pandas["To_Token_Price"] - min_price) / (max_price - min_price)

# Dividir os dados em conjunto de treinamento e teste
train_size = int(len(df_pandas) * 0.8)
train_data = df_pandas[:train_size]
test_data = df_pandas[train_size:]

# Preparar os dados para treinamento do modelo LSTM
window_size = 10  # Tamanho da janela de entrada
X_train, y_train = prepare_data(train_data["To_Token_Price"], window_size)
X_test, y_test = prepare_data(test_data["To_Token_Price"], window_size)



# Construir o modelo LSTM
model = Sequential()
model.add(LSTM(100, input_shape=(window_size, 1)))
model.add(Dense(1))
model.compile(loss="mean_squared_error", optimizer="adam")

# Treinar o modelo LSTM
try:
    model.fit(X_train, y_train, epochs=20, batch_size=32)
except:
    print("Não há dados suficientes para fazer as previsões. Exception")

# Fazer previsões para as próximas 10 horas
if len(X_test) >= 5:
    last_window = np.expand_dims(X_test[-1], axis=0)
    predictions = []
    for _ in range(10):
        next_prediction = model.predict(last_window)
        predictions.append(next_prediction[0][0])
        last_window = np.roll(last_window, -1)
        last_window[-1] = next_prediction

    # Desnormalizar os valores das previsões
    predictions = (np.array(predictions) * (max_price - min_price)) + min_price

    # Imprimir as previsões
    print("Previsões de preço para as próximas 10 horas:")
    print(predictions)
else:
    print("Não há dados suficientes para fazer as previsões.")

  series = series.astype(t, copy=False)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Previsões de preço para as próximas 10 horas:
[1854.7739 1856.1749 1857.2875 1858.1709 1858.8722 1859.4291 1859.8702
 1860.2208 1860.4988 1860.7198]


In [10]:
from pyspark.sql import functions as F

# Filtrar as transações de compra
compras_df = df.filter(df.To_Token_Symbol.isNotNull()) \
               .groupBy("To_Token_Symbol") \
               .agg(F.count("*").alias("Num_Compras")) \
               .orderBy(F.desc("Num_Compras")) \
               .limit(10)

# Filtrar as transações de venda
vendas_df = df.filter(df.From_Token_Symbol.isNotNull()) \
              .groupBy("From_Token_Symbol") \
              .agg(F.count("*").alias("Num_Vendas")) \
              .orderBy(F.desc("Num_Vendas")) \
              .limit(10)

# Exibir as 10 moedas mais compradas
print("As 10 moedas mais compradas:")
compras_df.show()

# Exibir as 10 moedas mais vendidas
print("As 10 moedas mais vendidas:")
vendas_df.show()

As 10 moedas mais compradas:
+---------------+-----------+
|To_Token_Symbol|Num_Compras|
+---------------+-----------+
|           WETH|       3608|
|         PEPINU|        248|
|         sFLOKI|        231|
|              X|        230|
|          sPEPE|        173|
|           ZUCC|        165|
|      TURBO_INU|        142|
|         WOJINU|        132|
|           LIZA|        120|
|        PEPNEKO|        114|
+---------------+-----------+

As 10 moedas mais vendidas:
+-----------------+----------+
|From_Token_Symbol|Num_Vendas|
+-----------------+----------+
|             WETH|      3766|
|           PEPINU|       231|
|           sFLOKI|       180|
|                X|       165|
|           WOJINU|       150|
|             ZUCC|       131|
|            sPEPE|       128|
|           BOBINU|       125|
|        TURBO_INU|       115|
|            PEPIG|       101|
+-----------------+----------+

