# Valorant dataset extractor and cleaner

This notebook extracts the zip files of Valorant esports data scraped from vlr.gg and cleans and formats the dataset.



In [1]:
import pandas as pd
import os
import zipfile
# Never truncate wide DataFrames with "..." when they are displayed
pd.options.display.max_columns = None

### Select the zip files you want to extract
Tick the zip files you want to extract from the `zip_files` folder, then click **Process Selected Zips** before running the next cell.

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Folder containing zip files
zip_folder = "zip_files"

# Selection shared with the later cells. It is filled in by the button's click
# handler; starting from an empty list means "Restart & Run All" does not hit a
# NameError when the later cells run before the button has been clicked.
selected_files = []

# Get all .zip files from the folder. os.listdir returns names in arbitrary
# order, so sort them to keep the checkbox order stable between runs; the
# extension check is case-insensitive so "ARCHIVE.ZIP" is listed too.
zip_files = sorted(f for f in os.listdir(zip_folder) if f.lower().endswith(".zip"))

# Create a checkbox for each zip file (all unticked initially)
checkboxes = [widgets.Checkbox(value=False, description=f) for f in zip_files]

# Create a button for processing selected files
select_button = widgets.Button(description="Process Selected Zips", button_style="success")

# Output box to show results/messages
output = widgets.Output()

# Function to run when button is clicked
def on_button_click(b):
    """Record which checkboxes are ticked and report the selection.

    Rebinds the module-level ``selected_files`` list to the descriptions of
    all ticked checkboxes, then prints either the selection or a
    "nothing selected" notice into the ``output`` widget.

    Parameters
    ----------
    b : the button instance passed by the click event (unused).
    """
    global selected_files
    with output:  # everything printed here lands in the output widget
        output.clear_output()  # drop the message from the previous click
        selected_files = [box.description for box in checkboxes if box.value]
        if not selected_files:
            print("No zip files selected.")
            return
        print("Selected zip files for processing:")
        for zip_name in selected_files:
            print("-", zip_name)

# Run on_button_click every time the button is pressed
select_button.on_click(on_button_click)

# Render the UI: checkboxes stacked vertically, then the button, then the
# area that receives the handler's messages
display(widgets.VBox(children=checkboxes), select_button, output)


VBox(children=(Checkbox(value=False, description='Valorant Masters Bangkok 2025_csvs.zip'), Checkbox(value=Fal…

Button(button_style='success', description='Process Selected Zips', style=ButtonStyle())

Output()

### Extract the selected zip files into the "Output" folder

In [4]:
print("Extracting selected zip files:\n")

output_folder = "Output"

# With nothing ticked the loop below does nothing; say so explicitly instead of
# finishing with only the header line above.
if not selected_files:
    print("No zip files selected - tick the archives above, click 'Process Selected Zips' and re-run this cell.")

# Loop through each file in the list of selected zip files
for f in selected_files:
    # Build the full path to the current zip file
    zip_path = os.path.join(zip_folder, f)

    # Create a unique output folder for this zip file (removes .zip extension)
    extract_path = os.path.join(output_folder, os.path.splitext(f)[0])
    os.makedirs(extract_path, exist_ok=True)  # Create folder if it doesn't exist

    # Open the zip file in read mode and extract its contents
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    print(f"✔ Extracted: {f} → {extract_path}")

Extracting selected zip files:

✔ Extracted: Valorant Masters Bangkok 2025_csvs.zip → Output\Valorant Masters Bangkok 2025_csvs


### Load the csv files into respective dataframes

In [5]:
def load_csv(extract_path):
    """Read the five tournament CSV files found in ``extract_path``.

    Parameters
    ----------
    extract_path : folder that contains the extracted CSV files.

    Returns
    -------
    tuple of five DataFrames, in this order: detailed matches overview,
    matches, player stats, detailed matches player stats, event info.
    """
    # File names listed in the same order as the returned tuple
    csv_names = (
        "detailed_matches_overview.csv",
        "matches.csv",
        "player_stats.csv",
        "detailed_matches_player_stats.csv",
        "event_info.csv",
    )
    return tuple(pd.read_csv(os.path.join(extract_path, name)) for name in csv_names)




### Fix the list of agents in player_stats.csv
The full list of agents each player used during the tournament cannot be fetched from the vlr.gg player stats page because of the limited data shown there, so it is rebuilt here from the detailed per-match player stats.

In [6]:
def fix_agents(detailed_matches_player_stats_df, player_stats_df):
    """Rebuild each player's agent list from the detailed match stats.

    Parameters
    ----------
    detailed_matches_player_stats_df : DataFrame with a ``player_id`` column
        and an ``agent`` column holding one agent name or a comma-separated
        string of names (e.g. ``"Jett, Raze"``).
    player_stats_df : DataFrame with a ``player_id`` column.

    Returns
    -------
    A copy of ``player_stats_df`` with an ``agents`` column holding, per
    player, the list of unique agent names (Title Case, in order of first
    appearance). Players with no usable agent in the detailed stats get NaN.
    The input frame itself is left untouched.
    """
    df_agents = (
        detailed_matches_player_stats_df[["player_id", "agent"]]
        # "Jett, Raze" -> ["Jett", " Raze"]
        .assign(agent=lambda x: x["agent"].str.split(","))
        # one row per (player, agent)
        .explode("agent")
        # normalise spelling so " jett" and "Jett" count as the same agent
        .assign(agent=lambda x: x["agent"].str.strip().str.title())
        # missing agents and the "" left by a trailing comma are not agents
        .dropna(subset=["agent"])
        .loc[lambda x: x["agent"] != ""]
    )

    player_agent = (
        df_agents
        .groupby("player_id")["agent"]
        .unique()        # NumPy array of unique agents per player
        .apply(list)     # -> plain Python lists
        .to_dict()       # {player_id: [agents]}
    )

    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    result = player_stats_df.copy()
    agents_column = result["player_id"].map(player_agent)

    if "agents" in result.columns:
        # Column already present (e.g. the function is run twice): refresh it in place.
        result["agents"] = agents_column
    else:
        # Keep the column at position 6, but never beyond the end of a
        # narrower frame, where DataFrame.insert would raise.
        result.insert(min(6, result.shape[1]), "agents", agents_column)

    return result


### Drop unnecessary columns

In [7]:
def drop_columns(detailed_matches_overview_df, matches_df, player_stats_df, event_info_df):
    """Remove the columns that are not wanted in the cleaned dataset.

    Columns that are not present in a frame are skipped silently. The inputs
    are not modified; new DataFrames are returned.

    Returns
    -------
    tuple ``(detailed_matches_overview_df, matches_df, player_stats_df,
    event_info_df)`` with the unwanted columns removed.
    """
    # (frame, columns to drop), listed in the order the frames are returned
    plan = (
        (detailed_matches_overview_df, ["match_url"]),
        (matches_df, ["match_url", "scraped_at"]),
        (player_stats_df, ["agents_played", "agents_display", "player_url", "scraped_at"]),
        (event_info_df, ["scraped_at"]),
    )
    return tuple(frame.drop(columns=unwanted, errors="ignore") for frame, unwanted in plan)

### Save the changes to csv files in output folder 

In [8]:
def save_csv(detailed_matches_overview_df, matches_df, player_stats_df, event_info_df,extract_path):
    """Write the cleaned frames back to CSV files inside ``extract_path``.

    Any existing file with the same name is overwritten, and the DataFrame
    index is not written. ``detailed_matches_player_stats.csv`` is not
    touched by this function.
    """
    # Target file name -> frame to write (written in this order)
    targets = {
        "player_stats.csv": player_stats_df,
        "event_info.csv": event_info_df,
        "detailed_matches_overview.csv": detailed_matches_overview_df,
        "matches.csv": matches_df,
    }
    for file_name, frame in targets.items():
        frame.to_csv(os.path.join(extract_path, file_name), index=False)

### Call the functions above on each extracted tournament

In [9]:
# Clean every tournament folder produced by the extraction step
print("\nProcessing extracted zip files...\n")

for f in selected_files:
    # Same folder the extraction step used: Output/<zip name without ".zip">
    extract_path = os.path.join(output_folder, os.path.splitext(f)[0])
    print(f"🔹 Processing folder: {extract_path}")

    # Read this tournament's CSV files
    (
        detailed_matches_overview_df,
        matches_df,
        player_stats_df,
        detailed_matches_player_stats_df,
        event_info_df,
    ) = load_csv(extract_path)

    # Rebuild the per-player agent lists, then remove the unwanted columns
    player_stats_df = fix_agents(detailed_matches_player_stats_df, player_stats_df)
    (
        detailed_matches_overview_df,
        matches_df,
        player_stats_df,
        event_info_df,
    ) = drop_columns(
        detailed_matches_overview_df, matches_df, player_stats_df, event_info_df
    )

    # Overwrite the extracted CSV files with the cleaned versions
    save_csv(
        detailed_matches_overview_df,
        matches_df,
        player_stats_df,
        event_info_df,
        extract_path,
    )

print("\n✅ All extracted zip files processed individually.")



Processing extracted zip files...

🔹 Processing folder: Output\Valorant Masters Bangkok 2025_csvs

✅ All extracted zip files processed individually.
