In [None]:
# Clone the sybil_address_recognition repository into the current working directory.
!git clone https://github.com/Qin-sx/sybil_address_recognition.git

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); a later cell reads the
# contest archive from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Rename the cloned checkout to sybil_demo_pipeline, then make it the working
# directory so the relative paths used later (./data, ./saved_model) resolve inside it.
!mv /content/sybil_address_recognition /content/sybil_demo_pipeline
import os
os.chdir('/content/sybil_demo_pipeline')

In [None]:
# Create the data directory; -p keeps the cell idempotent when it is re-run.
!mkdir -p data

In [None]:
# Extract the contest archive from Drive into data/wallet_risk_score ...
!unzip /content/drive/MyDrive/contest/wallet_risk_score.zip -d /content/sybil_demo_pipeline/data/wallet_risk_score
# ... then move everything it contains into data/raw_data.
# -p: create missing parents and do not fail if the directory already exists (re-run).
!mkdir -p /content/sybil_demo_pipeline/data/raw_data
!mv /content/sybil_demo_pipeline/data/wallet_risk_score/*  /content/sybil_demo_pipeline/data/raw_data/

In [None]:
# Output directory for the generated feature files; -p keeps the cell idempotent on re-run.
!mkdir -p data/features
# Rename the address files to the *_dataset.parquet names that get_all_candidates() reads.
!mv /content/sybil_demo_pipeline/data/raw_data/train_addresses.parquet /content/sybil_demo_pipeline/data/raw_data/train_dataset.parquet
!mv /content/sybil_demo_pipeline/data/raw_data/test_addresses.parquet /content/sybil_demo_pipeline/data/raw_data/test_dataset.parquet

In [None]:
import os
import pandas as pd

def get_all_candidates(file_path):
    """Collect every candidate address from the train and test datasets.

    Reads ``train_dataset.parquet`` and ``test_dataset.parquet`` from
    ``file_path`` and stacks their ``ADDRESS`` columns, train rows first.

    Args:
        file_path: Directory that contains the two parquet files.

    Returns:
        A DataFrame with a single ``ADDRESS`` column and a fresh 0..n-1
        index. Duplicate addresses are kept as-is.
    """
    dataset_paths = [
        os.path.join(file_path, name)
        for name in ("train_dataset.parquet", "test_dataset.parquet")
    ]
    # Only the ADDRESS column of each dataset is needed here.
    address_columns = [pd.read_parquet(path)['ADDRESS'] for path in dataset_paths]
    stacked = pd.concat(address_columns, ignore_index=True)
    return pd.DataFrame(stacked, columns=['ADDRESS'])

def get_transaction_partners(transactions_df, address_df, columns=('FROM_ADDRESS', 'TO_ADDRESS')):
    """Find, for every candidate address, the set of addresses it transacted with.

    A transaction links its sender and receiver in both directions: the
    receiver is recorded as a partner of the sender and vice versa, but only
    for endpoints that appear in ``address_df['ADDRESS']``.

    Args:
        transactions_df: DataFrame with one row per transaction.
        address_df: DataFrame whose ``ADDRESS`` column lists the candidate
            addresses to collect partners for.
        columns: Two-element sequence naming the sender and receiver columns
            of ``transactions_df`` (in that order).

    Returns:
        DataFrame with columns ``ADDRESS`` (each distinct candidate, in first
        appearance order) and ``PARTNERS`` (list of distinct counterparties;
        the order inside each list is unspecified because it comes from a
        set). Partners are not restricted to candidates and may include null
        values present in the transaction columns.
    """
    from_col, to_col = columns[0], columns[1]

    # One (initially empty) partner set per distinct candidate address.
    address_partners = {address: set() for address in address_df['ADDRESS']}

    # Walk the two address columns in lockstep instead of DataFrame.iterrows():
    # iterrows builds a Series per row, which is needlessly slow on large
    # transaction tables when only two values per row are required.
    for from_address, to_address in zip(transactions_df[from_col], transactions_df[to_col]):
        if from_address in address_partners:
            address_partners[from_address].add(to_address)
        if to_address in address_partners:
            address_partners[to_address].add(from_address)

    # Convert the dictionary to a DataFrame (sets become lists so they can be serialized).
    result_df = pd.DataFrame({
        'ADDRESS': list(address_partners.keys()),
        'PARTNERS': [list(partners) for partners in address_partners.values()]
    })

    return result_df

def save_unique_partners(transaction_partners_df, address_df, output_path):
    """Write the partners that are not candidate addresses to ``partners.parquet``.

    Args:
        transaction_partners_df: DataFrame with ``ADDRESS`` and ``PARTNERS``
            columns; each ``PARTNERS`` entry is an iterable of addresses or None.
        address_df: DataFrame whose ``ADDRESS`` column lists the candidate
            addresses to exclude from the output.
        output_path: Directory in which ``partners.parquet`` is written.

    Returns:
        None. The result is saved as ``<output_path>/partners.parquet`` with
        columns ``ADDRESS`` and ``PARTNERS``.
    """
    # Pool every partner across all rows, skipping NULL entries.
    partner_pool = set()
    for entry in transaction_partners_df['PARTNERS']:
        if entry is None:
            continue
        partner_pool.update(entry)

    # Keep only the partners that are not themselves candidate addresses.
    known_addresses = set(address_df['ADDRESS'])
    outside_only = partner_pool.difference(known_addresses)

    # One row per remaining partner; null addresses are discarded.
    partners_frame = pd.DataFrame(list(outside_only), columns=['ADDRESS']).dropna(subset=['ADDRESS'])

    # Left-join the PARTNERS column by ADDRESS.
    # NOTE(review): when transaction_partners_df only contains candidate
    # addresses (as produced by get_transaction_partners), no row can match
    # here and PARTNERS is entirely null - confirm this is intended.
    partner_lookup = transaction_partners_df[['ADDRESS', 'PARTNERS']]
    merged_df = partners_frame.merge(partner_lookup, on='ADDRESS', how='left')

    merged_df.to_parquet(os.path.join(output_path, "partners.parquet"), index=False)

def main(file_path, output_path):
    """Build and save the transaction-partner tables for all candidate addresses.

    Reads the train/test address files and ``transactions.parquet`` from
    ``file_path`` and writes two files into ``output_path``:

    * ``transaction_partners.parquet`` - partners of every candidate address.
    * ``partners.parquet`` - partners that are not candidates themselves.

    Args:
        file_path: Directory containing ``train_dataset.parquet``,
            ``test_dataset.parquet`` and ``transactions.parquet``.
        output_path: Directory that receives the generated parquet files; it
            is created if it does not exist yet.
    """
    # Make sure the destination exists before anything is written into it, so
    # the function does not depend on a separate mkdir step having been run.
    # (An empty output_path means "current directory" and needs no creation.)
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Get all addresses
    address_df = get_all_candidates(file_path)

    # Read the transactions data
    transactions_datafile = os.path.join(file_path, "transactions.parquet")
    transactions_df = pd.read_parquet(transactions_datafile)

    # Get transaction partners
    transaction_partners_df = get_transaction_partners(transactions_df, address_df, columns=['FROM_ADDRESS', 'TO_ADDRESS'])

    # Save the transaction partners to a .parquet file
    output_file = os.path.join(output_path, "transaction_partners.parquet")
    transaction_partners_df.to_parquet(output_file, index=False)

    # Save unique partners to a separate .parquet file
    save_unique_partners(transaction_partners_df, address_df, output_path)

# Run the partner-extraction pipeline when this code is executed as the main program.
if __name__ == "__main__":
    # Both paths are relative, so they resolve against the current working directory.
    file_path = "./data/raw_data/"
    output_path = "./data/features/"
    main(file_path, output_path)

In [None]:
import pandas as pd
import os

# Directory that holds the token_transfers parquet shards
folder_path = '/content/sybil_demo_pipeline/data/raw_data/token_transfers'

# Load partners.parquet (path assumed to be where the previous step saved it)
partners_path = '/content/sybil_demo_pipeline/data/features/partners.parquet'
partners_df = pd.read_parquet(partners_path)

# Distinct, non-null partner addresses used as the filter set
partners_addresses = partners_df['ADDRESS'].dropna().unique()

# Collect every token_transfers shard. os.listdir returns names in arbitrary
# order, so sort them to make the row order of the merged output reproducible.
files = sorted(f for f in os.listdir(folder_path) if f.startswith('token_transfers.parquet'))

# Read each shard and keep only the rows that involve a partner address
df_list = []
for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_parquet(file_path)
    # A row is kept when any of its four address columns is a partner address
    filtered_df = df[
        df['FROM_ADDRESS'].isin(partners_addresses) |
        df['TO_ADDRESS'].isin(partners_addresses) |
        df['ORIGIN_FROM_ADDRESS'].isin(partners_addresses) |
        df['ORIGIN_TO_ADDRESS'].isin(partners_addresses)
    ]
    if not filtered_df.empty:
        df_list.append(filtered_df)

# Concatenate the filtered shards
if df_list:
    merged_df = pd.concat(df_list, ignore_index=True)
    # Save the merged result as a single parquet file next to the other raw data
    merged_df.to_parquet('/content/sybil_demo_pipeline/data/raw_data/token_transfers.parquet', index=False)
else:
    print("No data to save after filtering.")

In [None]:
# Create the saved_model directory; -p keeps the cell idempotent when it is re-run.
!mkdir -p ./saved_model

In [None]:
# Run the repository's feature_process_2.py script from the project directory.
# NOTE(review): presumably reads ./data/raw_data and writes
# data/features/transactions_feature_partner.parquet - confirm against the script.
!python feature_process_2.py

In [None]:
# Rename transactions_feature_partner.parquet to transactions_feature.parquet.
# NOTE(review): presumably the file name train2.py / inference2.py expect - confirm.
!mv /content/sybil_demo_pipeline/data/features/transactions_feature_partner.parquet /content/sybil_demo_pipeline/data/features/transactions_feature.parquet

In [None]:
# Install CatBoost (version not pinned).
# NOTE(review): presumably required by train2.py / inference2.py, which are run
# through `!python` in the following cells - confirm, and consider pinning a version.
!pip install catboost

In [None]:
# Run the repository's train2.py script.
# NOTE(review): presumably trains the model and stores it under ./saved_model - confirm.
!python train2.py

In [None]:
# Run the repository's inference2.py script.
# NOTE(review): presumably loads the trained model and produces predictions - confirm.
!python inference2.py

In [None]:
# Go back to /content so the project directory can be addressed as
# ./sybil_demo_pipeline by the archive step.
import os
os.chdir('/content')

In [None]:
# Archive the project directory recursively into sybil_demo_pipeline.zip; the -x
# patterns are meant to leave out the contents of data/ and saved_model/.
!zip -r sybil_demo_pipeline.zip ./sybil_demo_pipeline -x "./sybil_demo_pipeline/data/*" -x "./sybil_demo_pipeline/saved_model/*"