In [None]:
from google.oauth2 import service_account
from google.cloud import bigquery
from google.cloud.exceptions import NotFound
import pandas as pd
from google.cloud import storage
import matplotlib.pyplot as plt
import numpy as np
import os
from glob import glob

In [None]:
# Google Cloud project used when creating the BigQuery and Storage clients.
project_id = "looker-assignment-113356033"
# BigQuery dataset name; the query strings in this notebook spell the same
# dataset out literally inside their table references.
dataset_id = "final_project_dataset"

In [None]:
# Service-account credentials used by every Google Cloud client in this notebook.
# The key file location can be supplied through the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable so the notebook is not tied
# to a single machine; the original local path remains the fallback.
# NOTE: the name `crendentials` (sic) is kept because later cells reference it.
crendentials = service_account.Credentials.from_service_account_file(
    os.environ.get(
        "GOOGLE_APPLICATION_CREDENTIALS",
        r"C:\nccu\workspace\dataModel_finalProjrct\looker-assignment-113356033-4959799503ac.json",
    )
)

In [None]:
# BigQuery client for the configured project; the query cells below run through it.
bigquery_client = bigquery.Client(project=project_id, credentials=crendentials)
# Cloud Storage client built from the same credentials and project.
storage_client = storage.Client(credentials=crendentials, project=project_id)

In [None]:
# ============================
# 1. Data Loading and Cleaning
# ============================
def load_and_clean_data(balance_df):
    """
    Preprocess token holder data from a DataFrame.

    Args:
        balance_df (pd.DataFrame): Holder data with a 'Balance' column. Values may
            be numbers or strings that use ',' as a thousands separator.

    Returns:
        pd.DataFrame: A cleaned copy of the input in which 'Balance' is numeric
        (unparseable values become NaN) and a 'Percentage' column holds each
        row's share of the total balance, in percent. The input frame itself is
        left untouched so the cell can be re-run safely.
    """
    cleaned_df = balance_df.copy()

    # Go through str first so the function also works when the column is already
    # numeric (the .str accessor raises on non-string columns).
    cleaned_df['Balance'] = pd.to_numeric(
        cleaned_df['Balance'].astype(str).str.replace(',', '', regex=False),
        errors='coerce'
    )

    # Create "Percentage" column; guard against a zero total to avoid NaN/inf.
    total_balance = cleaned_df['Balance'].sum()
    if total_balance > 0:
        cleaned_df['Percentage'] = (cleaned_df['Balance'] / total_balance) * 100
    else:
        cleaned_df['Percentage'] = 0.0

    print(f"[INFO] Data cleaned: {cleaned_df.shape[0]} rows, {cleaned_df.shape[1]} columns")
    return cleaned_df

# ============================
# 2. Plot Long-tail Distribution
# ============================
def plot_long_tail_distribution(balance_df, balance_column, title):
    """
    Plot the long-tail distribution of holder balances on a log scale.

    Args:
        balance_df (pd.DataFrame): Holder data.
        balance_column (str): Name of the numeric column to plot.
        title (str): Label appended to the figure title.

    Rows whose value is NaN or not strictly positive are left out, because
    log10 would turn them into NaN/-inf and the histogram cannot be built from
    non-finite values. Nothing is drawn when no positive value remains.
    """
    values = balance_df[balance_column]
    positive_values = values[values > 0]  # NaN compares False, so it is dropped too

    if positive_values.empty:
        print(f"[WARN] No positive values in '{balance_column}' to plot for {title}")
        return

    plt.figure(figsize=(12, 6))
    plt.hist(np.log10(positive_values), bins=100, edgecolor='black', alpha=0.7)
    plt.title(f'Distribution of Holder Balances (Log-Scale) - {title}')
    plt.xlabel('Log10(Balance)')
    plt.ylabel('Number of Holders')
    plt.show()

def plot_combined_long_tail_distribution(file_data_dict):
    """
    Overlay one log-scale balance histogram per dataset on a single figure.

    Args:
        file_data_dict (dict): Maps a legend label to a DataFrame that has a
            numeric 'Balance' column.

    NOTE: a later cell of this notebook defines another function with the same
    name; once that cell has run, it replaces this definition.
    """
    plt.figure(figsize=(12, 6))

    for label, holders in file_data_dict.items():
        # Keep only usable balances: drop NaN first, then anything not > 0.
        balances = holders.dropna(subset=['Balance'])['Balance']
        positive_balances = balances[balances > 0]

        plt.hist(
            np.log10(positive_balances),
            bins=100,
            alpha=0.5,
            label=label,
            edgecolor='black'
        )

    # Figure decoration
    plt.title('Combined Long-Tail Distribution of Holder Balances (Log-Scale)')
    plt.xlabel('Log10(Balance)')
    plt.ylabel('Number of Holders')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

# ============================
# 3. Plot Cumulative Distribution Function (CDF) by Log10(Balance)
# ============================
def plot_cdf_by_log_balance(balance_df, balance_column, title):
    """
    Plot the cumulative distribution function (CDF) using Log10(Balance).

    Args:
        balance_df (pd.DataFrame): Holder data; it is not modified.
        balance_column (str): Name of the numeric column used both for the
            x-axis (log10) and for the cumulative share on the y-axis.
        title (str): Label appended to the figure title.

    Rows whose value is NaN or not strictly positive are skipped: they add
    nothing to the cumulative sum and have no finite log10. Nothing is drawn
    when no positive value remains.
    """
    positive_df = balance_df[balance_df[balance_column] > 0]

    if positive_df.empty:
        print(f"[WARN] No positive values in '{balance_column}' to plot for {title}")
        return

    # log10 is monotonic, so sorting by the raw column gives the same order as
    # sorting by its logarithm, without adding a helper column to the frame.
    sorted_values = positive_df[balance_column].sort_values(ascending=True)
    log_balance = np.log10(sorted_values)
    cumulative_percentage = sorted_values.cumsum() / sorted_values.sum() * 100

    plt.figure(figsize=(12, 6))
    plt.plot(log_balance, cumulative_percentage, marker='o', linestyle='-')
    plt.title(f'Cumulative Distribution of Holder Balances - {title}')
    plt.xlabel('Log10(Balance)')
    plt.ylabel('Cumulative Percentage (%)')
    plt.grid(True)
    plt.show()

def plot_combined_cdf_by_log_balance(file_data_dict):
    """
    Plot the cumulative distribution function (CDF) for multiple datasets on the same plot.

    Args:
        file_data_dict (dict): Maps a legend label to a DataFrame that has a
            numeric 'Balance' column. The frames are not modified.

    For each dataset, rows whose balance is NaN or not strictly positive are
    skipped (no finite log10, no contribution to the cumulative sum). A dataset
    with no positive balance is reported and left out of the figure.
    """
    plt.figure(figsize=(12, 6))

    for file_name, df in file_data_dict.items():
        balances = df['Balance']
        sorted_balances = balances[balances > 0].sort_values()

        if sorted_balances.empty:
            print(f"[WARN] No positive balances to plot for {file_name}")
            continue

        log_balance = np.log10(sorted_balances)
        cumulative_percentage = sorted_balances.cumsum() / sorted_balances.sum() * 100

        plt.plot(log_balance, cumulative_percentage,
                 marker='o', linestyle='-', label=file_name)

    plt.title('Combined CDF of Holder Balances (Log10(Balance))')
    plt.xlabel('Log10(Balance)')
    plt.ylabel('Cumulative Percentage (%)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

# ============================
# 4. Plot CDF with Address Index
# ============================
def plot_cdf_by_index(balance_df, title):
    """
    Draw the cumulative balance share against the holder's rank position.

    Args:
        balance_df (pd.DataFrame): Holder data with a numeric 'Balance' column.
        title (str): Label appended to the figure title.

    Holders are ordered from smallest to largest balance and re-indexed from 0;
    the x-axis is that index and the y-axis the running share of the total
    balance, in percent.
    """
    ordered = balance_df.sort_values(by='Balance', ascending=True).reset_index(drop=True)
    running_total = ordered['Balance'].cumsum()
    ordered['cumulative_percentage'] = running_total / ordered['Balance'].sum() * 100

    plt.figure(figsize=(12, 6))
    plt.plot(
        ordered.index,
        ordered['cumulative_percentage'],
        marker='o',
        linestyle='-',
    )
    plt.title(f'Cumulative Distribution of Holder Balances by Index - {title}')
    plt.xlabel('Address Index')
    plt.ylabel('Cumulative Percentage (%)')
    plt.grid(True)
    plt.show()

def plot_combined_cdf_by_index(file_data_dict):
    """
    Draw, on one figure, each dataset's cumulative balance share by rank position.

    Args:
        file_data_dict (dict): Maps a legend label to a DataFrame that has a
            numeric 'Balance' column.
    """
    plt.figure(figsize=(12, 6))

    for label, holders in file_data_dict.items():
        # Smallest balance first, re-indexed from 0 so the index is the rank.
        ordered = holders.sort_values(by='Balance').reset_index(drop=True)
        running_total = ordered['Balance'].cumsum()
        ordered['cumulative_percentage'] = running_total / ordered['Balance'].sum() * 100

        plt.plot(
            ordered.index,
            ordered['cumulative_percentage'],
            marker='o',
            linestyle='-',
            label=label,
        )

    plt.title('Combined CDF of Holder Balances by Address Index')
    plt.xlabel('Address Index')
    plt.ylabel('Cumulative Percentage (%)')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.show()

# ============================
# 5. Process Multiple Files
# ============================
def process_multiple_files(data_frames):
    """
    Clean each token holder dataset and draw its three individual plots.

    Args:
        data_frames (iterable of pd.DataFrame): Raw holder frames. Each one is
            titled "Dataset N" by its 1-based position.

    A failure in one dataset is reported with an [ERROR] line and does not stop
    the remaining datasets from being processed.
    """
    balance_column = 'Balance'

    for position, raw_df in enumerate(data_frames, start=1):
        title = f"Dataset {position}"
        try:
            cleaned_df = load_and_clean_data(raw_df)

            # Histogram, CDF over log10(balance), CDF over rank position.
            plot_long_tail_distribution(cleaned_df, balance_column, title)
            plot_cdf_by_log_balance(cleaned_df, balance_column, title)
            plot_cdf_by_index(cleaned_df, title)
        except Exception as e:
            print(f"[ERROR] Failed to process dataset {position}: {e}")

def process_combined_multiple_files(data_frames):
    """
    Clean every token holder dataset, then draw the combined comparison plots.

    Args:
        data_frames (iterable of pd.DataFrame): Raw holder frames. Each one is
            labelled "Dataset N" by its 1-based position.

    Datasets that fail to clean are reported with an [ERROR] line and left out;
    no plot is drawn when none could be cleaned.
    """
    file_data_dict = {}

    for position, raw_df in enumerate(data_frames, start=1):
        try:
            file_data_dict[f"Dataset {position}"] = load_and_clean_data(raw_df)
        except Exception as e:
            print(f"[ERROR] Failed to process dataset {position}: {e}")

    # Nothing usable: skip the combined visualizations entirely.
    if not file_data_dict:
        return

    plot_combined_long_tail_distribution(file_data_dict)
    plot_combined_cdf_by_log_balance(file_data_dict)
    plot_combined_cdf_by_index(file_data_dict)

# ============================
# 6. Main Function
# ============================
def main():
    """
    Fetch the top-200 holder table of every token from BigQuery and draw the
    combined plots.

    Relies on the notebook-level `bigquery_client`, `project_id` and
    `dataset_id`. The frames are passed on in the order Bonk, FLOKI, Mog, Pepe,
    Shib, which is the order the "Dataset N" labels are derived from.
    """
    token_names = ['Bonk', 'FLOKI', 'Mog', 'Pepe', 'Shib']

    # One query per token; only the table name differs between them.
    holder_data = []
    for token_name in token_names:
        query = f"""
        SELECT
            *
        FROM `{project_id}.{dataset_id}.{token_name}_top200_holders`
        """
        holder_data.append(bigquery_client.query(query).to_dataframe())

    # Process datasets
    process_combined_multiple_files(holder_data)

# main() is intentionally not called here: the dedicated "Entry Point" cell right
# below already runs it, and calling it in both places would execute every
# BigQuery query and redraw every plot twice.

In [None]:
# ============================
# Entry Point
# ============================
# Run the pipeline defined by the most recently executed `main` definition.
# Inside a notebook kernel `__name__` is '__main__', so this guard passes.
if __name__ == '__main__':
    main()


[INFO] Data loaded and cleaned from data/top_200_bonk_top_holders.csv: 200 rows, 4 columns
[INFO] Data loaded and cleaned from data/top_200_floki_top_holders.csv: 200 rows, 4 columns
[INFO] Data loaded and cleaned from data/top_200_mog_top_holders.csv: 200 rows, 4 columns
[INFO] Data loaded and cleaned from data/top_200_pepe_top_holders.csv: 200 rows, 4 columns
[INFO] Data loaded and cleaned from data/top_200_shib_top_holders.csv: 200 rows, 4 columns
[INFO] Plot saved: plots\long_tail.png
[INFO] Plot saved: plots\cdf_log_balance.png
[INFO] Plot saved: plots\cdf_index.png


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def calculate_balance_percentage(df):
    """
    Calculate the balance percentage for a given DataFrame.

    Args:
        df (pd.DataFrame): Input DataFrame containing 'Balance Amount'. Values
            may be numbers or strings that use ',' as a thousands separator.
            The input frame is not modified.

    Returns:
        pd.DataFrame: A new DataFrame in which 'Balance Amount' is numeric, rows
        with an unparseable amount are removed, and an additional
        'Balance percentage' column holds each row's share of the total, in
        percent (0.0 for every row when the total is not positive).
    """
    # Ensure 'Balance Amount' is numeric
    amounts = pd.to_numeric(
        df['Balance Amount'].astype(str).str.replace(',', '', regex=False),
        errors='coerce'
    )

    # Build the result as an independent frame: assign() leaves the caller's
    # data alone and copy() avoids SettingWithCopyWarning on the writes below.
    result = (
        df.assign(**{'Balance Amount': amounts})
        .dropna(subset=['Balance Amount'])
        .copy()
    )

    # Calculate 'Balance percentage'
    total_balance = result['Balance Amount'].sum()
    if total_balance > 0:
        result['Balance percentage'] = (result['Balance Amount'] / total_balance) * 100
    else:
        result['Balance percentage'] = 0.0

    return result

def plot_combined_long_tail_distribution(holders_data, names=None):
    """
    Plot the combined long-tail distribution for multiple datasets.

    Args:
        holders_data (list of pd.DataFrame or dict): DataFrames containing holder
            data with a 'Balance Amount' column. A mapping of
            {name: DataFrame} is also accepted, so callers that keep their
            frames in a dict can pass it directly.
        names (list of str, optional): Dataset names for labeling, parallel to
            `holders_data`. Defaults to the mapping's keys, or to
            "Dataset 1", "Dataset 2", ... for a plain sequence.

    Each dataset gets its own 100 logarithmically spaced bins between its
    smallest and largest percentage, and bar heights are normalized so they sum
    to 1 per dataset. Datasets with no positive percentage are reported and
    skipped.
    """
    # Normalize the two accepted input shapes to parallel lists.
    if isinstance(holders_data, dict):
        if names is None:
            names = list(holders_data.keys())
        holders_data = list(holders_data.values())
    else:
        holders_data = list(holders_data)
        if names is None:
            names = [f"Dataset {i + 1}" for i in range(len(holders_data))]

    plt.figure(figsize=(12, 6))

    for df, name in zip(holders_data, names):
        # Calculate balance percentage
        df = calculate_balance_percentage(df)

        # Keep strictly positive percentages only; taking the column as a Series
        # avoids writing a new column onto a filtered slice of the frame.
        percentages = df.loc[df['Balance percentage'] > 0, 'Balance percentage']
        if percentages.empty:
            print(f"[WARN] No positive balance percentages to plot for {name}")
            continue

        # Small offset kept from the original implementation so the plotted
        # values stay the same (the data is already > 0 at this point).
        adjusted = percentages + 1e-6

        # Equal weights summing to 1, so the y-axis reads as a proportion.
        weights = np.ones(len(adjusted)) / len(adjusted)

        # Plot histogram
        plt.hist(
            adjusted,
            bins=np.logspace(
                np.log10(adjusted.min()),
                np.log10(adjusted.max()),
                100
            ),
            weights=weights,
            alpha=0.5,
            label=name,
            edgecolor='black'
        )

    # Set log scale for the x-axis
    plt.xscale('log')

    # Add title and labels
    plt.title('Combined Long-Tail Distribution of Holder Percentages (Normalized Y-axis)')
    plt.xlabel('Balance Percentage (Log Scale)')
    plt.ylabel('Proportion of Holders')
    plt.legend()
    plt.grid(True, which='both', linestyle='--', alpha=0.5)

    # Display the plot
    plt.show()

def main():
    """
    Fetch the full holder table of every token from BigQuery and plot the
    combined long-tail distribution of balance percentages.

    Relies on the notebook-level `bigquery_client`, `project_id` and
    `dataset_id`.
    """
    dataset_names = ['Bonk', 'FLOKI', 'Mog', 'Pepe', 'Shib']

    # One query per token; only the table name differs between them.
    holders_data = []
    for token_name in dataset_names:
        query = f"""
        SELECT
            *
        FROM `{project_id}.{dataset_id}.{token_name}_holders`
        """
        holders_data.append(bigquery_client.query(query).to_dataframe())

    # Plot combined long-tail distribution
    plot_combined_long_tail_distribution(holders_data, dataset_names)

In [None]:
# ============================
# Entry Point
# ============================
# Run the pipeline defined by the most recently executed `main` definition.
# Inside a notebook kernel `__name__` is '__main__', so this guard passes.
if __name__ == '__main__':
    main()

[INFO] Found 1 files. Loading...
[INFO] Loaded data/token_holders/all\floki_token_holders.xlsx with 88432 rows and 6 columns.
[INFO] Combined DataFrame shape: (88432, 6)
[INFO] Found 1 files. Loading...
[INFO] Loaded data/token_holders/all\bonk_token_holders.xlsx with 14105 rows and 6 columns.
[INFO] Combined DataFrame shape: (14105, 6)
[INFO] Found 1 files. Loading...
[INFO] Loaded data/token_holders/all\mog_token_holders.xlsx with 48424 rows and 6 columns.
[INFO] Combined DataFrame shape: (48424, 6)
[INFO] Found 15 files. Loading...
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part1.xlsx with 100000 rows and 6 columns.
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part10.xlsx with 100000 rows and 6 columns.
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part11.xlsx with 100000 rows and 6 columns.
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part12.xlsx with 100000 rows and 6 columns.
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part

In [None]:
# Load every token's full holder table from BigQuery and pre-compute its
# balance percentages; later cells read the results from `file_data_dict`.
file_data_dict = {}

# BigQuery table holding the holders of each token, keyed by the plot label.
token_tables = {
    "floki": "FLOKI_holders",
    "bonk": "Bonk_holders",
    "mog": "Mog_holders",
    "shib": "Shib_holders",
    "pepe": "Pepe_holders",
}

# Queries for each token holder dataset
queries = {
    token_name: f"SELECT * FROM `{project_id}.{dataset_id}.{table_name}`"
    for token_name, table_name in token_tables.items()
}

# Load data for each token holder and store in the dictionary
for token_name, query in queries.items():
    try:
        # Execute the query, then reuse the shared helper for the numeric
        # conversion, invalid-row removal and percentage calculation instead of
        # repeating that logic inline.
        df = calculate_balance_percentage(bigquery_client.query(query).to_dataframe())

        # Add the processed DataFrame to the dictionary
        file_data_dict[token_name] = df
        print(f"[INFO] Loaded and processed data for {token_name} with {df.shape[0]} rows.")

    except Exception as e:
        # Best effort: report the failing token and continue with the others.
        print(f"[ERROR] Failed to load data for {token_name}: {e}")

[INFO] Found 1 files. Loading...
[INFO] Loaded data/token_holders/all\floki_token_holders.xlsx with 88432 rows and 6 columns.
[INFO] Combined DataFrame shape: (88432, 6)
[INFO] Found 1 files. Loading...
[INFO] Loaded data/token_holders/all\bonk_token_holders.xlsx with 14105 rows and 6 columns.
[INFO] Combined DataFrame shape: (14105, 6)
[INFO] Found 1 files. Loading...
[INFO] Loaded data/token_holders/all\mog_token_holders.xlsx with 48424 rows and 6 columns.
[INFO] Combined DataFrame shape: (48424, 6)
[INFO] Found 15 files. Loading...
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part1.xlsx with 100000 rows and 6 columns.
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part10.xlsx with 100000 rows and 6 columns.
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part11.xlsx with 100000 rows and 6 columns.
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part12.xlsx with 100000 rows and 6 columns.
[INFO] Loaded data/token_holders/all\SHIB_token_holders_part

In [None]:
# Draw one log-scale histogram per token, using each frame's
# 'Balance percentage' column and the token name as the plot title.
for file_name, df in file_data_dict.items():
    plot_long_tail_distribution(df, "Balance percentage", file_name)

[INFO] Plot saved: plots\floki_long_tail_percentage_log.png
[INFO] Plot saved: plots\bonk_long_tail_percentage_log.png


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balance_df['Adjusted Percentage'] = balance_df[percentage_column] + 1e-6  # 避免 log(0)


[INFO] Plot saved: plots\mog_long_tail_percentage_log.png


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balance_df['Adjusted Percentage'] = balance_df[percentage_column] + 1e-6  # 避免 log(0)


[INFO] Plot saved: plots\SHIB_long_tail_percentage_log.png


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  balance_df['Adjusted Percentage'] = balance_df[percentage_column] + 1e-6  # 避免 log(0)


[INFO] Plot saved: plots\PEPE_long_tail_percentage_log.png


In [None]:
# The combined-plot function defined most recently in this notebook takes the
# frames and their labels as two parallel sequences, so unpack the mapping.
plot_combined_long_tail_distribution(
    list(file_data_dict.values()),
    list(file_data_dict.keys())
)

[INFO] Plot saved: plots\combined_long_tail_percentage_normalized_long_tail_percentage_normalized.png
