In [None]:
import pandas as pd
import os
import pickle
from utils.match_prediction import RAW_DATA_DIR, RAW_AZURE_DIR

# Load one raw Parquet batch, addressed by its exact file name inside
# RAW_AZURE_DIR, into `df` for a first look at the data.
# NOTE(review): a later cell reassigns `df` from load_multiple_batches(), so
# this frame is only in effect until that cell runs.

train_file_path = os.path.join(RAW_AZURE_DIR, "2025-01-20T01-21-31-259Z.parquet")
df = pd.read_parquet(train_file_path)



In [None]:
# Print every column name of `df` on its own line to inspect the schema.
for col in df.columns:
    print(col)

In [None]:
# League of Legends Match Outcome Outlier Detection
# ================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
from utils.match_prediction import RAW_DATA_DIR, RAW_AZURE_DIR
from IPython.display import display
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/runtime warnings from pandas and
# seaborn; consider narrowing the filter once the notebook is stable.
warnings.simplefilter("ignore")

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (the order matters, the palette is set after the style is loaded).
plt.style.use("fivethirtyeight")
sns.set_palette("Set2")

# Section 1: Load and Sample Data
# -------------------------------


def load_multiple_batches(num_batches=5, max_samples=10000):
    """
    Load and combine multiple raw data batches.

    The ``.parquet`` files found in ``RAW_AZURE_DIR`` are ordered by file name
    in descending order (newest first when the names are timestamps) and read
    one at a time until ``num_batches`` files have been attempted or at least
    ``max_samples`` rows have been loaded.

    Parameters:
    -----------
    num_batches : int
        Number of batch files to load
    max_samples : int
        Maximum number of samples to return (to avoid memory issues)

    Returns:
    --------
    pd.DataFrame
        Combined dataframe from all batches. When more than ``max_samples``
        rows were loaded, a reproducible random sample (``random_state=42``)
        of exactly ``max_samples`` rows is returned; sampled rows keep the
        index labels they had in the concatenated frame. An empty dataframe
        is returned when no batch could be loaded.
    """
    # Descending lexicographic order of the file names.
    parquet_files = sorted(
        (name for name in os.listdir(RAW_AZURE_DIR) if name.endswith(".parquet")),
        reverse=True,
    )

    all_data = []
    rows_loaded = 0  # running total, so the limit check stays O(1) per file

    for filename in parquet_files[:num_batches]:
        file_path = os.path.join(RAW_AZURE_DIR, filename)

        try:
            batch_df = pd.read_parquet(file_path)
        except FileNotFoundError:
            # Best effort: a file that vanished after listing is skipped.
            print(f"File not found: {filename}")
        else:
            all_data.append(batch_df)
            rows_loaded += len(batch_df)
            print(f"Loaded {filename}: {len(batch_df)} rows")

        # Check if we've reached the sample limit
        if rows_loaded >= max_samples:
            break

    if not all_data:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with an explicit message instead of an opaque traceback.
        print(f"No parquet batches could be loaded from {RAW_AZURE_DIR}")
        print("Total samples: 0")
        return pd.DataFrame()

    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)

    # Sample if we have too many rows
    if len(combined_df) > max_samples:
        combined_df = combined_df.sample(max_samples, random_state=42)

    print(f"Total samples: {len(combined_df)}")
    return combined_df


# Build the working frame from the newest batches; tune both limits as needed.
df = load_multiple_batches(max_samples=50_000, num_batches=50)

# Column dtypes / non-null counts, followed by the first five rows.
print("\nDataframe Info:")
df.info()

print("\nSample Data:")
display(df.head(5))

In [None]:
# Section 2: Data Overview
# -----------------------

# Descriptive statistics as produced by DataFrame.describe()
print("\nBasic Statistics:")
display(df.describe())

# Per-column null counts, showing only the columns that have at least one
print("\nMissing Values:")
missing_values = df.isna().sum()
display(missing_values.loc[missing_values.gt(0)])

In [None]:
# Section 3: Univariate Distribution Analysis
# ------------------------------------------


def _overlay_outlier_scatter(axis, data, outlier_mask, title, column_name):
    """Scatter every value by row position and mark the masked rows in red."""
    # Base layer: x is the 0-based row position, not the index label.
    sns.scatterplot(x=range(len(data)), y=data, alpha=0.5, ax=axis)
    if outlier_mask.any():
        # The overlay must share the positional x axis of the base layer.
        # Using index labels here misplaces the markers whenever the frame's
        # index is not 0..n-1 (e.g. after DataFrame.sample).
        sns.scatterplot(
            x=np.flatnonzero(outlier_mask),
            y=data[outlier_mask].values,
            color="red",
            s=50,
            ax=axis,
        )
    axis.set_title(title)
    axis.set_xlabel("Index")
    axis.set_ylabel(column_name)


def plot_column_distribution(dataframe, column_name, figsize=(12, 6)):
    """
    Plot distribution of a column and identify outliers using different methods.

    Draws a 2x2 figure: histogram with KDE, box plot, and two scatter plots of
    the values by row position with the z-score (>3 standard deviations) and
    IQR (beyond 1.5 * IQR from Q1/Q3) outliers highlighted in red. A short
    text summary of the outliers is printed after the figure.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        Input dataframe
    column_name : str
        Column to analyze
    figsize : tuple
        Figure size

    Returns:
    --------
    tuple of (pd.Series, pd.Series) or None
        ``(outliers_z, outliers_iqr)``: the outlying values, indexed by their
        original index labels. ``None`` when the column does not exist or is
        not numeric (a message is printed in that case).
    """
    if column_name not in dataframe.columns:
        print(f"Column '{column_name}' not found in dataframe!")
        return

    # Skip non-numeric columns
    if not pd.api.types.is_numeric_dtype(dataframe[column_name]):
        print(f"Column '{column_name}' is not numeric. Skipping distribution analysis.")
        return

    data = dataframe[column_name].copy()

    # Create a figure with subplots
    fig, ax = plt.subplots(2, 2, figsize=figsize)
    fig.suptitle(f"Distribution Analysis for {column_name}", fontsize=16)

    # Plot 1: Histogram with KDE
    sns.histplot(data=data, kde=True, ax=ax[0, 0])
    ax[0, 0].set_title("Histogram with Density")

    # Plot 2: Box plot for outlier visualization
    sns.boxplot(x=data, ax=ax[0, 1])
    ax[0, 1].set_title("Box Plot (Outliers as points)")

    # Plot 3: Z-score outlier detection
    z_scores = np.abs((data - data.mean()) / data.std())
    # Plain boolean array; missing comparisons count as "not an outlier".
    z_mask = (z_scores > 3).fillna(False).to_numpy(dtype=bool)
    outliers_z = data[z_mask]
    _overlay_outlier_scatter(
        ax[1, 0],
        data,
        z_mask,
        f"Z-Score Outliers (>3σ): {len(outliers_z)} points",
        column_name,
    )

    # Plot 4: IQR outlier detection
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    iqr_mask = (
        ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR)))
        .fillna(False)
        .to_numpy(dtype=bool)
    )
    outliers_iqr = data[iqr_mask]
    _overlay_outlier_scatter(
        ax[1, 1],
        data,
        iqr_mask,
        f"IQR Outliers: {len(outliers_iqr)} points",
        column_name,
    )

    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()

    # Display statistical summary of outliers
    if not outliers_z.empty or not outliers_iqr.empty:
        print(f"\nOutlier Statistics for {column_name}:")
        print(
            f"Z-score outliers (>3σ): {len(outliers_z)} ({len(outliers_z)/len(data)*100:.2f}%)"
        )
        print(
            f"IQR outliers: {len(outliers_iqr)} ({len(outliers_iqr)/len(data)*100:.2f}%)"
        )

        # Get some example outliers, most extreme first. Ranking and lookups
        # are positional so low-side outliers are not hidden behind high-side
        # ones and duplicate index labels cannot break the lookup.
        if not outliers_z.empty:
            print("\nTop Z-score outliers:")
            outlier_scores = z_scores[z_mask]
            ranking = np.argsort(-outlier_scores.to_numpy(dtype=float))[:5]
            for pos in ranking:
                idx = outliers_z.index[pos]
                val = outliers_z.iloc[pos]
                print(
                    f"  Index {idx}: {val} (z-score: {outlier_scores.iloc[pos]:.2f})"
                )

        if not outliers_iqr.empty:
            print("\nTop IQR outliers:")
            iqr_values = outliers_iqr.to_numpy(dtype=float)
            # Distance beyond the nearer quartile (Q3 for high, Q1 for low).
            distances = np.where(iqr_values > Q3, iqr_values - Q3, Q1 - iqr_values)
            ranking = np.argsort(-distances)[:5]
            for pos in ranking:
                idx = outliers_iqr.index[pos]
                val = outliers_iqr.iloc[pos]
                if val > Q3:
                    print(f"  Index {idx}: {val} (Distance from Q3: {val-Q3:.2f})")
                else:
                    print(f"  Index {idx}: {val} (Distance from Q1: {Q1-val:.2f})")

    return outliers_z, outliers_iqr


# Interactive function to analyze any column
def analyze_column(dataframe, column_name=None):
    """
    Interactive function to analyze a specific column or select from available columns.

    Plots the distribution of the chosen column via ``plot_column_distribution``
    and, when IQR outliers exist, displays up to five of the rows containing
    them. Returns ``None`` in every case.

    Parameters:
    -----------
    dataframe : pd.DataFrame
        Input dataframe
    column_name : str, optional
        Column to analyze. If None, user will be prompted to select
    """
    if column_name is None:
        # Get numeric columns
        numeric_columns = dataframe.select_dtypes(include=["number"]).columns.tolist()

        print("Available numeric columns:")
        for number, col in enumerate(numeric_columns, start=1):
            print(f"{number}: {col}")

        selection = input("\nEnter column number to analyze (or 'q' to quit): ")
        if selection.strip().lower() == "q":
            return

        try:
            choice = int(selection)
        except ValueError:
            print("Invalid selection. Please try again.")
            return

        # Reject 0 and negatives explicitly: Python's negative indexing would
        # otherwise silently pick a column from the end of the list.
        if not 1 <= choice <= len(numeric_columns):
            print("Invalid selection. Please try again.")
            return
        column_name = numeric_columns[choice - 1]

    # Plot the distribution
    result = plot_column_distribution(dataframe, column_name)
    if result is None:
        # The column was missing or non-numeric; a message was already
        # printed, so there is nothing further to show.
        return
    _, outliers_iqr = result

    # Analyze rows with outliers
    if not outliers_iqr.empty:
        print("\nSample rows containing outliers:")
        # Look the rows up by the index labels of the first few outliers
        outlier_indexes = outliers_iqr.index[:5]
        display(dataframe.loc[outlier_indexes])


# Run the outlier analysis on one explicitly named column, which skips the
# interactive prompt.
# NOTE(review): the name suggests top-lane deaths for team 100 at timestamp
# 1200000 (presumably milliseconds, i.e. 20 minutes) — confirm against the schema.
analyze_column(df, "team_100_TOP_deaths_at_1200000")