In [None]:
import pandas as pd
import os
import pickle
from utils.match_prediction import PREPARED_DATA_DIR, RAW_DATA_DIR, RAW_AZURE_DIR

# Read one prepared training shard ("train_0.parquet") and preview its first rows.
train_file_path = os.path.join(
    PREPARED_DATA_DIR,
    "train",
    "train_0.parquet",
)
df = pd.read_parquet(path=train_file_path)

df.head()

In [None]:
# Print every column label of `df`, one per line, so none is elided from the output.
for column_name in df.columns:
    print(column_name)


In [None]:
import tqdm
import glob

import matplotlib.pyplot as plt
import seaborn as sns

# Running totals of rows per `numerical_elo` value and per `numerical_patch`
# value, summed over every shard in the prepared "train" directory.
elo_counts: dict[int, int] = {}
patch_counts: dict[float, int] = {}

# Only these two columns feed the tallies, so the remaining columns of each
# shard are never loaded.
count_columns = ["numerical_elo", "numerical_patch"]

# Aggregate counts across all files. glob returns paths in arbitrary order;
# sorting makes the processing order (and the progress bar) reproducible.
input_files = sorted(glob.glob(os.path.join(PREPARED_DATA_DIR, "train", "*.parquet")))
for file_path in tqdm.tqdm(input_files):
    # Use a dedicated name: rebinding `df` here would silently replace the
    # frame loaded earlier in the notebook with the last shard processed.
    shard_df = pd.read_parquet(file_path, columns=count_columns)

    # Update elo counts. value_counts() skips missing values by default, so
    # rows without an Elo are not counted.
    file_elo_counts = shard_df["numerical_elo"].value_counts().to_dict()
    for elo, count in file_elo_counts.items():
        elo_counts[elo] = elo_counts.get(elo, 0) + count

    # Update patch counts (same default handling of missing values).
    file_patch_counts = shard_df["numerical_patch"].value_counts().to_dict()
    for patch, count in file_patch_counts.items():
        patch_counts[patch] = patch_counts.get(patch, 0) + count

# Turn each tally dict into a two-column table ordered by its key column.
elo_rows = list(elo_counts.items())
elo_df = pd.DataFrame(elo_rows, columns=['Elo', 'Count'])
elo_df = elo_df.sort_values(by='Elo')

patch_rows = list(patch_counts.items())
patch_df = pd.DataFrame(patch_rows, columns=['Patch', 'Count'])
patch_df = patch_df.sort_values(by='Patch')

# Dump both tables in full as plain text, without the index column.
for heading, counts_table in (
    ("\nTotal games per Elo:", elo_df),
    ("\nTotal games per Patch:", patch_df),
):
    print(heading)
    print(counts_table.to_string(index=False))

# One figure with two stacked panels: Elo counts on top, patch counts below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(12, 10))

# (axis, table, key column, panel title, x-axis label) for each panel.
panels = (
    (ax1, elo_df, 'Elo', 'Distribution of Games by Elo', 'Numerical Elo'),
    (ax2, patch_df, 'Patch', 'Distribution of Games by Patch', 'Numerical Patch'),
)
for axis, counts_table, key_column, panel_title, x_label in panels:
    sns.barplot(data=counts_table, x=key_column, y='Count', ax=axis)
    axis.set_title(panel_title)
    axis.set_xlabel(x_label)
    axis.set_ylabel('Number of Games')
    # Rotate the x tick labels by 45 degrees.
    axis.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Load one specific Parquet file from RAW_AZURE_DIR and preview its first rows.
azure_file_name = "2024-11-04T23-05-22-368Z.parquet"
file_from_azure = os.path.join(RAW_AZURE_DIR, azure_file_name)
df_from_azure = pd.read_parquet(path=file_from_azure)

df_from_azure.head()


In [None]:
# Print every column label of `df_from_azure`, one per line.
for column_name in df_from_azure.columns:
    print(column_name)
