In [None]:
# Load and inspect raw dataframes to understand column structure
import pandas as pd
import glob
import os
import random  # Added for random sampling
from pprint import pprint

# Configure these to match your environment
# RAW_DATA_DIR: directory that is searched (non-recursively) for "*.parquet" files.
RAW_DATA_DIR = "/Users/loyd/draftking/apps/machine-learning/data/raw_azure"
# NUM_FILES_TO_CHECK: upper bound on how many files are sampled and examined;
# fewer are used when the directory holds fewer parquet files.
NUM_FILES_TO_CHECK = 3  # Number of files to examine


# Function to load data from parquet
def load_data(file_path):
    """Read one parquet file into a pandas DataFrame.

    Args:
        file_path: Location of the parquet file; handed to
            ``pd.read_parquet`` unchanged.

    Returns:
        The object ``pd.read_parquet`` produces for ``file_path``.
    """
    frame = pd.read_parquet(file_path)
    return frame


# Collect every parquet file that sits directly inside the raw data directory
input_files = glob.glob(os.path.join(RAW_DATA_DIR, "*.parquet"))
print(f"Found {len(input_files)} parquet files")

# Per-file results, keyed by file name; filled in by the inspection loop below
file_findings = {}

# Exact column names to look for: first the primary naming scheme ...
columns_to_check = [
    "totalGold_at_900000_TOP_100",
    "deaths_at_900000_TOP_100",
    "creepScore_at_900000_TOP_100",
    "level_at_900000_TOP_100",
]
# ... then the alternative orderings of the same name parts
columns_to_check.extend(
    [
        "team_TOP_100_totalGold_at_900000",  # Alternative format
        "team_100_TOP_totalGold_at_900000",  # Another alternative format
    ]
)

# Draw a random subset (without replacement) rather than the first few files;
# the sample size is capped at the number of files actually found
random_files = random.sample(
    input_files,
    k=min(len(input_files), NUM_FILES_TO_CHECK),
)

# Load and examine the randomly selected files.
# For each file: print size information, count columns whose names match the
# patterns of interest, show a few example names, peek at the first row for
# nested values, and record the counts in file_findings.
for i, file_path in enumerate(random_files):
    file_name = os.path.basename(file_path)
    print(f"\n----- File {i+1}: {file_name} -----")

    df = load_data(file_path)

    # Basic info
    print(f"Rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")

    # Exact-name matches against the columns we expected to find
    found_columns = [col for col in columns_to_check if col in df.columns]
    print(f"Found {len(found_columns)} of our expected columns")

    # Substring matches on the column names
    time_related_cols = [col for col in df.columns if "900000" in col]
    print(f"Time-related columns (900000): {len(time_related_cols)}")

    gold_related_cols = [col for col in df.columns if "Gold" in col or "gold" in col]
    print(f"Gold-related columns: {len(gold_related_cols)}")

    # Find columns that might have timeline data
    timeline_cols = [
        col
        for col in df.columns
        if any(marker in col for marker in ("at_", "At", "time", "Time"))
    ]
    print(f"Possible timeline columns: {len(timeline_cols)}")

    # Columns whose names mention one of the role identifiers
    role_cols = [
        col
        for col in df.columns
        if any(role in col for role in ("TOP", "JUNGLE", "MIDDLE", "BOTTOM", "UTILITY"))
    ]
    print(f"Role-related columns: {len(role_cols)}")

    # Show some actual column names as examples
    print("\nSample column names:")
    if role_cols:
        print("Role-related columns:", role_cols[:5])
    if time_related_cols:
        print("Time-related columns:", time_related_cols[:5])
    if gold_related_cols:
        print("Gold-related columns:", gold_related_cols[:5])

    # Look at actual data structure
    if len(df) > 0:
        print("\nFirst row column keys that might contain timeline data:")
        # Look for nested structures or complex objects in the first row
        for col in df.columns:
            try:
                val = df[col].iloc[0]
                # is_list_like covers dicts, lists, tuples and array-valued
                # cells (e.g. the arrays pyarrow produces for list-typed
                # parquet columns) while excluding plain strings and scalars.
                is_nested = pd.api.types.is_list_like(val)
                if is_nested or "timeline" in str(col).lower():
                    print(f"Column: {col}, Type: {type(val)}")
                    # Only dump small containers to keep the output readable
                    if is_nested and len(val) < 10:
                        print(f"  Content: {val}")
            except Exception as exc:
                # Best-effort inspection: report and move on instead of
                # silently hiding the problem. Catching Exception (not a bare
                # except) lets KeyboardInterrupt / SystemExit propagate.
                print(f"Column: {col}, skipped ({type(exc).__name__}: {exc})")

    # Store our findings
    file_findings[file_name] = {
        "row_count": len(df),
        "column_count": len(df.columns),
        "has_expected_columns": len(found_columns) > 0,
        "gold_columns": len(gold_related_cols),
        "timeline_columns": len(timeline_cols),
        "role_columns": len(role_cols),
    }

# Summary
print("\n===== SUMMARY =====")
# Report the number of files that were actually inspected. The configured
# NUM_FILES_TO_CHECK is only an upper bound and overstates the count when the
# directory holds fewer parquet files.
examined_count = len(file_findings)
print(f"Examined {examined_count} files")
files_with_expected_cols = sum(
    1 for findings in file_findings.values() if findings["has_expected_columns"]
)
print(f"Files with expected columns: {files_with_expected_cols}/{examined_count}")

# If any examined file has a column whose name mentions "timeline", suggest
# how to proceed. The check runs over the same randomly sampled files that
# were inspected above (not the first few globbed files), and loads them
# lazily so any() can stop at the first match. str() guards against
# non-string column labels.
has_timeline_column = any(
    "timeline" in str(col).lower()
    for sampled_path in random_files
    for col in load_data(sampled_path).columns
)
if has_timeline_column:
    print(
        "\nPOTENTIAL SOLUTION: The raw data appears to have timeline information in nested structures."
    )
    print("You may need to extract the timeline data first before applying filters.")
    print(
        "Try examining the timeline columns to see how to extract the 15-minute stats."
    )