# Packages

In [None]:
import pandas as pd
import numpy as np
import os

# Notebook display settings: show every column instead of eliding the middle ones
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None) # show the full content of each displayed column (no truncation)

# Functions

In [None]:
def make_identifier(df):
    """Return an integer code per row; rows with equal joined values share a code.

    Each row of ``df`` is rendered as one string by converting every cell
    with ``str`` and joining the pieces with ``'_'``. The strings are then
    passed to ``pd.factorize`` and only the array of codes is returned.
    """
    def _row_label(row):
        # One underscore-separated label per row, e.g. "1_0"
        return '_'.join(str(cell) for cell in row)

    labels = df.apply(_row_label, axis=1)
    codes, _uniques = pd.factorize(labels)
    return codes
# Apply like:
# dftestMaster['UID_test'] = make_identifier(dftestMaster[['BatchNr','attack_Id']])

# Set base variables

In [None]:
# Both counters start at 1: `counter` numbers the input batch file to read,
# `counter_save` numbers the output file to write.
counter = counter_save = 1

# Load new file

In [None]:
# Read the transaction batch numbered `counter`, then advance to the next batch number
df1 = pd.read_parquet(
    f"/Volumes/Extreme SSD/02_UniswapV2Transactions/UniswapV2Transactions_{counter}.par"
)
counter += 1


# Convert data

## Convert gasprice and Value to float

In [None]:
# Scale the raw gas price by 1e-9 into the "_Gwei" column
df1['trans_gasPrice_Gwei'] = df1['trans_gasPrice'].div(10**9)
# Cast the raw value to float and scale it by 1e-18
df1['value_float'] = df1['value'].astype(float).div(10**18)

# Build loop (1 observation per transaction)

In [None]:
def detect_attacks_single_obs(df, batch_nr):
    """Apply the attack heuristics to one batch, keeping one row per transaction.

    The heuristic numbers in the comments follow the original notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        One batch of transfers. The columns read are ``block_number``,
        ``poolId``, ``transaction_hash``, ``log_index``, ``tf_tokenAddress``,
        ``value`` and ``trans_gasPrice``. The frame is modified in place:
        ``trans_gasPrice_Gwei``, ``value_float`` and ``Count_1`` are added
        (no copy is taken, to avoid doubling memory on large batches).
    batch_nr : int
        Number stored in the ``BatchNr`` column of the result.

    Returns
    -------
    pandas.DataFrame
        The rows that pass all filters, with the additional columns
        ``Count_nHash``, ``within_transaction_order``, ``attack_Id`` and
        ``BatchNr``. ``attack_Id`` is only unique within the batch.
    """
    # Convert gas price and value to float (scaled by 1e-9 and 1e-18 respectively)
    df['trans_gasPrice_Gwei'] = df['trans_gasPrice'] / (10**9)
    df['value_float'] = df['value'].astype(float) / (10**18)

    # Heuristics 1, 4, 6a
    # Number of rows that share the same value within a block and pool
    df['Count_1'] = (
        df.groupby(['block_number', 'poolId', 'value'])['value']
        .transform('size')
        .astype('float')
    )
    # Keep transactions in which at least one row shares its value with another row
    candidates = df.groupby(['block_number', 'poolId', 'transaction_hash']).filter(
        lambda group: group['Count_1'].mean() > 1
    )

    # Heuristics 3 & 7
    # Number of distinct transaction hashes per (block, pool, value)
    candidates['Count_nHash'] = (
        candidates.groupby(['block_number', 'poolId', 'value'])['transaction_hash']
        .transform('nunique')
        .astype('float')
    )
    # Keep transactions whose mean 'Count_nHash' is exactly 1.5 (formerly > 1)
    paired = candidates.groupby(['transaction_hash']).filter(
        lambda group: group['Count_nHash'].mean() == 1.5
    )

    # Heuristic 5
    # 1-based position of each row inside its transaction, ordered by log_index
    paired['within_transaction_order'] = (
        paired.sort_values(['log_index'], ascending=True)
        .groupby(['transaction_hash'])
        .cumcount()
        .add(1)
    )

    # Keep rows that occur more than once per block, pool, token address and
    # value, and whose mean within-transaction position is exactly 1.5
    # (formerly >= 1.5)
    match_keys = ['block_number', 'poolId', 'tf_tokenAddress', 'value']
    repeated = paired[paired.duplicated(subset=match_keys, keep=False)]
    attacks = repeated.groupby(match_keys).filter(
        lambda group: group['within_transaction_order'].mean() == 1.5
    )

    # Adding an attack-ID and the batch number
    attacks['attack_Id'] = attacks.groupby(['block_number', 'poolId', 'value']).ngroup()
    attacks['BatchNr'] = batch_nr
    return attacks


counter = 1
counter_save = 1

while True:

    # Add control outputs
    print(counter,
          counter_save)

    # Stop only when the next numbered input file does not exist. Any other
    # read error (corrupt file, missing parquet engine, interrupt) now
    # propagates instead of being reported as "Finished!".
    input_path = f"/Volumes/Extreme SSD/02_UniswapV2Transactions/UniswapV2Transactions_{counter}.par"
    if not os.path.exists(input_path):
        print("Finished!")
        break

    df1 = pd.read_parquet(input_path)
    df_filtered3 = detect_attacks_single_obs(df1, counter)

    master_path = f"/Volumes/Extreme SSD/98_Output/AttacksUniswapV2_Master.par"
    batch_path = f"/Volumes/Extreme SSD/98_Output/AttacksUniswapV2_{counter_save}.par"

    # Save separate file with counter number. It is written before
    # 'Attack_UID' is added so that every per-batch file has the same columns.
    df_filtered3.to_parquet(batch_path)

    # NOTE: an existing Master file (e.g. from an earlier run) is appended to,
    # not replaced. Remove it first if the loop is restarted from batch 1.
    if os.path.exists(master_path):
        # Add to Master file
        dfMaster = pd.concat([pd.read_parquet(master_path), df_filtered3], ignore_index=True)
    else:
        # First batch: start the Master from a new frame so the batch frame is not mutated
        dfMaster = df_filtered3.reset_index(drop=True)

    # Add overall attack Id (one consecutive number per BatchNr/attack_Id pair)
    dfMaster['Attack_UID'] = dfMaster.groupby(['BatchNr', 'attack_Id']).ngroup()
    dfMaster.to_parquet(master_path)

    # Clean/update variables (free the large frames before the next read)
    counter = counter + 1
    counter_save = counter_save + 1
    del df1
    del df_filtered3
    del dfMaster
        
        
    
    
    

# Checks

In [None]:
# Load the per-batch output file number 1 for inspection
dftest1 = pd.read_parquet(f"/Volumes/Extreme SSD/98_Output/AttacksUniswapV2_1.par")

In [None]:
# Load a second per-batch output file for inspection
# NOTE(review): this reads file number 3 although the variable is named dftest2 - confirm this is intended
dftest2 = pd.read_parquet(f"/Volumes/Extreme SSD/98_Output/AttacksUniswapV2_3.par")

In [None]:
# Load the accumulated Master output file for inspection
dftestMaster = pd.read_parquet(f"/Volumes/Extreme SSD/98_Output/AttacksUniswapV2_Master.par")

In [None]:
# Number of rows in the first loaded batch file
len(dftest1)

In [None]:
# Number of rows in the second loaded batch file
len(dftest2)

In [None]:
# Number of rows in the Master file
len(dftestMaster)

In [None]:
# Show the Master rows that belong to batch number 18
dftestMaster.loc[dftestMaster['BatchNr'].eq(18)]

In [None]:
# Recompute the overall attack id: one consecutive group number per (BatchNr, attack_Id) combination
dftestMaster['Attack_UID'] = dftestMaster.groupby(['BatchNr','attack_Id']).ngroup()

In [None]:
# Remove the two id columns 'UID' and 'UID_test' (raises KeyError if either is missing)
dftestMaster = dftestMaster.drop(columns=['UID', 'UID_test'])

In [None]:
# Overwrite the Master file on disk with the frame as modified in the cells above
dftestMaster.to_parquet(f"/Volumes/Extreme SSD/98_Output/AttacksUniswapV2_Master.par")

# Build loop (ALL observations per transaction)

In [None]:
def detect_attacks_all_obs(df, batch_nr):
    """Apply the attack heuristics to one batch, keeping ALL rows of each matching transaction.

    The heuristic numbers in the comments follow the original notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        One batch of transfers. The columns read are ``block_number``,
        ``poolId``, ``transaction_hash``, ``log_index``, ``tf_tokenAddress``,
        ``value`` and ``trans_gasPrice``. The frame is modified in place:
        ``trans_gasPrice_Gwei``, ``value_float`` and ``Count_1`` are added
        (no copy is taken, to avoid doubling memory on large batches).
    batch_nr : int
        Number stored in the ``BatchNr`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Every row (after heuristics 1-7) of the transactions that have at
        least one row passing the final filter, with the additional columns
        ``Count_nHash``, ``within_transaction_order``, ``attack_Id`` and
        ``BatchNr``. ``attack_Id`` is only unique within the batch.
    """
    # Convert gas price and value to float (scaled by 1e-9 and 1e-18 respectively)
    df['trans_gasPrice_Gwei'] = df['trans_gasPrice'] / (10**9)
    df['value_float'] = df['value'].astype(float) / (10**18)

    # Heuristics 1, 4, 6
    # Number of rows that share the same value within a block and pool
    df['Count_1'] = (
        df.groupby(['block_number', 'poolId', 'value'])['value']
        .transform('size')
        .astype('float')
    )
    # Keep transactions in which at least one row shares its value with another row
    candidates = df.groupby(['block_number', 'poolId', 'transaction_hash']).filter(
        lambda group: group['Count_1'].mean() > 1
    )

    # Heuristics 3 & 7
    # Number of distinct transaction hashes per (block, pool, value)
    candidates['Count_nHash'] = (
        candidates.groupby(['block_number', 'poolId', 'value'])['transaction_hash']
        .transform('nunique')
        .astype('float')
    )
    # Keep transactions whose mean 'Count_nHash' is exactly 1.5 (formerly > 1)
    paired = candidates.groupby(['transaction_hash']).filter(
        lambda group: group['Count_nHash'].mean() == 1.5
    )

    # Heuristic 5
    # 1-based position of each row inside its transaction, ordered by log_index
    paired['within_transaction_order'] = (
        paired.sort_values(['log_index'], ascending=True)
        .groupby(['transaction_hash'])
        .cumcount()
        .add(1)
    )

    # Rows that occur more than once per block, pool, token address and value,
    # and whose mean within-transaction position is exactly 1.5. Computed once
    # and reused for both the hash list and the attack ids.
    match_keys = ['block_number', 'poolId', 'tf_tokenAddress', 'value']
    repeated = paired[paired.duplicated(subset=match_keys, keep=False)]
    single_obs = repeated.groupby(match_keys).filter(
        lambda group: group['within_transaction_order'].mean() == 1.5
    )

    # Adding an attack-ID to the frame with 1 observation per transaction
    single_obs['attack_Id'] = single_obs.groupby(['block_number', 'poolId', 'value']).ngroup()

    # All rows of the transactions that have at least one matching row
    attack_hashes = single_obs['transaction_hash'].unique()
    all_obs = paired[paired['transaction_hash'].isin(attack_hashes)]

    # Attach the attack id to every row of those transactions
    all_obs = pd.merge(
        all_obs,
        single_obs[['transaction_hash', 'attack_Id']],
        on='transaction_hash',
        how='left',
    ).drop_duplicates()

    # Adding batch number
    all_obs['BatchNr'] = batch_nr
    return all_obs


counter = 1
counter_save = 1

while True:

    # Add control outputs
    print(counter,
          counter_save)

    # Stop only when the next numbered input file does not exist. Any other
    # read error (corrupt file, missing parquet engine, interrupt) now
    # propagates instead of being reported as "Finished!".
    input_path = f"/Volumes/Extreme SSD/02_UniswapV2Transactions/UniswapV2Transactions_{counter}.par"
    if not os.path.exists(input_path):
        print("Finished!")
        break

    df1 = pd.read_parquet(input_path)
    df_filtered3 = detect_attacks_all_obs(df1, counter)

    master_path = f"/Volumes/Extreme SSD/98_Output/AttacksUniswapV2_full_Master.par"
    batch_path = f"/Volumes/Extreme SSD/98_Output/AttacksUniswapV2_full_{counter_save}.par"

    # Save separate file with counter number. It is written before
    # 'Attack_UID' is added so that every per-batch file has the same columns.
    df_filtered3.to_parquet(batch_path)

    # NOTE: an existing Master file (e.g. from an earlier run) is appended to,
    # not replaced. Remove it first if the loop is restarted from batch 1.
    if os.path.exists(master_path):
        # Add to Master file
        dfMaster = pd.concat([pd.read_parquet(master_path), df_filtered3], ignore_index=True)
    else:
        # First batch: start the Master from a new frame so the batch frame is not mutated
        dfMaster = df_filtered3.reset_index(drop=True)

    # Add overall attack Id (one consecutive number per BatchNr/attack_Id pair)
    dfMaster['Attack_UID'] = dfMaster.groupby(['BatchNr', 'attack_Id']).ngroup()
    dfMaster.to_parquet(master_path)

    # Clean/update variables (free the large frames before the next read)
    counter = counter + 1
    counter_save = counter_save + 1
    del df1
    del df_filtered3
    del dfMaster
        
        
    
    
    