In [2]:
# Reading the dataset
# Count the number of beatmaps and difficulties

import os
import numpy as np
import rosu_pp_py as osu
import pandas as pd

def count_beatmaps(root_dir="dataset/beatmaps"):
    """Count beatmap folders and difficulty files below ``root_dir``.

    A beatmap is a folder located directly inside ``root_dir``. A difficulty is
    any file whose name ends with ``.osu``, at any depth below ``root_dir``.

    Args:
        root_dir: Dataset root to scan. Defaults to ``"dataset/beatmaps"``.

    Returns:
        A ``(beatmaps_count, difficulties_count)`` tuple. Both are 0 when
        ``root_dir`` does not exist, because ``os.walk`` then yields nothing.
    """
    beatmaps_count = 0
    difficulties_count = 0
    for depth_index, (_, dirs, files) in enumerate(os.walk(root_dir)):
        # os.walk is top-down by default, so the first entry is root_dir itself.
        # Only its direct children are beatmaps; folders nested inside a beatmap
        # (e.g. storyboard assets) must not be counted as additional beatmaps.
        if depth_index == 0:
            beatmaps_count = len(dirs)
        # .osu files are counted at every depth.
        difficulties_count += sum(1 for name in files if name.endswith(".osu"))
    return beatmaps_count, difficulties_count

# Tally the dataset once and report both totals.
beatmaps_count, difficulties_count = count_beatmaps()
print(f"Beatmaps:  {beatmaps_count}")
print(f"Difficulties:  {difficulties_count}")

# Save path to beatmap .osu files in a list, respecting the folder structure
# e.g. "dataset/beatmaps/beatmap1/difficulty1.osu". Every beatmap gets a list of paths to its difficulties

def get_beatmap_paths(root_dir="dataset/beatmaps"):
    """Collect the ``.osu`` file paths of every beatmap folder below ``root_dir``.

    Each folder located directly inside ``root_dir`` is treated as one beatmap.
    For every such folder, all ``.osu`` files found at any depth inside it are
    gathered into one list.

    Args:
        root_dir: Dataset root to scan. Defaults to ``"dataset/beatmaps"``.

    Returns:
        A list with one entry per beatmap folder; each entry is a list of paths
        to that beatmap's ``.osu`` files (empty if the folder contains none).
        Returns an empty list when ``root_dir`` does not exist.
    """
    beatmap_paths = []

    # Only the first level below root_dir names beatmaps. Walking deeper here
    # would re-join nested sub-folder names onto root_dir and append a bogus
    # extra entry for each of them.
    try:
        _, beatmap_dirs, _ = next(os.walk(root_dir))
    except StopIteration:
        # os.walk yields nothing for a missing or unreadable root.
        return beatmap_paths

    for beatmap_dir in beatmap_dirs:
        difficulty_paths = []
        for folder, _, file_names in os.walk(os.path.join(root_dir, beatmap_dir)):
            for file_name in file_names:
                if file_name.endswith(".osu"):
                    difficulty_paths.append(os.path.join(folder, file_name))
        beatmap_paths.append(difficulty_paths)

    return beatmap_paths

# Nested list returned by get_beatmap_paths(): each inner list holds the .osu file paths collected for one folder.
beatmaps = get_beatmap_paths()

Beatmaps:  356
Difficulties:  1718


In [16]:
# Prepare the dataset
# Read metadata from .osu files

def read_metadata(beatmaps : list[list[str]]):
    """Read title, artist and difficulty name from a collection of ``.osu`` files.

    Args:
        beatmaps: One list of ``.osu`` file paths per beatmap.

    Returns:
        A dict mapping each title to ``{"artist": ..., "version": ...}``.
        ``artist`` / ``version`` are ``None`` when the file has no such line.
        Files without a ``Title:`` line are skipped. Because the dict is keyed
        by title, files sharing a title overwrite each other and only the last
        one read is kept.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # Line prefix in the .osu file -> field it fills. "TitleUnicode:" and
    # "ArtistUnicode:" do not match because the colon is part of the prefix.
    prefixes = {"Title:": "title", "Artist:": "artist", "Version:": "version"}

    metadata = {}
    for difficulty_paths in beatmaps:
        for file in difficulty_paths:
            # Reset per file so a missing key never inherits the value of the
            # previously read file (or raises on the very first file).
            fields = {"title": None, "artist": None, "version": None}
            with open(file, "r", encoding="utf-8") as f:
                for line in f:
                    for prefix, field in prefixes.items():
                        if line.startswith(prefix):
                            # Keep everything after the key so values that
                            # themselves contain ":" are not truncated.
                            fields[field] = line[len(prefix):].strip()
                            break
            if fields["title"] is None:
                continue
            metadata[fields["title"]] = {"artist": fields["artist"], "version": fields["version"]}
    return metadata

# Smoke test: load the first collected .osu file and print a few values computed by rosu_pp_py.
# NOTE(review): the name `map` shadows Python's built-in map() from here on — consider renaming.
map = osu.Beatmap(path = beatmaps[0][0])
perf = osu.Performance()
attr = perf.calculate(map)
# The result of the first calculate() call is passed back into calculate();
# presumably this reuses the already computed attributes — TODO confirm against the rosu_pp_py docs.
max_attrs = perf.calculate(attr)

print("AR", max_attrs.difficulty.ar)
print("Star Rating", max_attrs.difficulty.stars)
print("PP", max_attrs.pp)
print("BPM", map.bpm)
print("Song", beatmaps[0][0])

AR 9.800000190734863
Star Rating 8.077242505421664
PP 834.5441923076489
BPM 190.0002850004275
Song dataset/beatmaps/1003201 Ata - Euphoria (evilxmaniac)/Ata - Euphoria (evilxmaniac) [Ciyus Miapah's Ultimate Power].osu


In [1]:
# Create a pandas DataFrame
# Each song has multiple difficulties

# First, define a function which extracts relative time and space distances as well as direction changes between each hit object
# Differentiate between hit objects and sliders by checking the number of coordinates in the line; also give sliders an identifier (boolean)

from tracemalloc import start


def extract_and_convert_hit_objects(beatmapPath : str):
    time_diffs = []
    space_diffs = []
    direction_diffs = []
    # Types: jump, slider, spinner
    type = []
    
    # Timing points
    # 0: offset, 1: milliseconds per beat, 2: meter, 3: sample set, 4: sample index, 5: volume, 6: inherited, 7: kiai
    # Convert this to: start time, end time, bpm, slider velocity (True/False)
    timing_points = []
    
    if beatmapPath is None or beatmapPath == "" or not os.path.exists(beatmapPath) or not beatmapPath.endswith(".osu"): raise ValueError("Invalid beatmap path")

    def hit_type_encoder(hit_type : int):
        """Map the integer type field of a hit object to a small category code.

        The masks 1, 2 and 8 are tested in that order and the first one that is
        set decides the result: 0 (hit circle), 1 (slider) or 2 (spinner).
        When none of these masks is set the function falls through and
        implicitly returns ``None``.
        """
        # 0: hit circle, 1: slider, 2: spinner
        if hit_type & 1: return 0
        if hit_type & 2: return 1
        if hit_type & 8: return 2
    
    with open(beatmapPath, "r", encoding="utf-8") as f:
        lines = f.readlines()
        for line in lines:
            # Extract timing points
            if line.startswith("[TimingPoints]"):
                timing_points = lines[lines.index(line) + 1:]
                # Cut off lines after timing points (until beginning of hit objects)
                timing_points = timing_points[:timing_points.index("[HitObjects]\n")]
                
                for i, timing_point in enumerate(timing_points):
                    if timing_point == "\n":
                        continue
                    timing_point = timing_point.split(",")
                    start_time = int(timing_point[0])
                    
                    if float(timing_point[1]) < 0:
                        # Inherited timing point / slider velocity
                        # Take the previous timing point's bpm (there should be at least one, since the first timing point is always a non-inherited one)
                        bpm = timing_points[-1][1]
                        slider_velocity = True
                        
                        if timing_points[i + 1] == "\n":
                            # This is the last timing point (last line is always empty. Second last line is the last hit object)
                            end_time = lines[-2].split(",")[2]
                        
            
            # Extract hit objects
            if line.startswith("[HitObjects]"):
                hit_objects = lines[lines.index(line) + 1:]
                
                # Initialize variables
                # Determine the relation between hit objects. The hit type of the previous object determines whether the move is a jump or a slider-hold
                x_prev = int(hit_objects[0].split(",")[0])
                y_prev = int(hit_objects[0].split(",")[1])
                time_prev = 0
                hit_type_prev = 0
                
                for hit_object in hit_objects:
                    if hit_object == "\n":
                        continue
                    hit_object = hit_object.split(",")
                    x = int(hit_object[0])
                    y = int(hit_object[1])
                    time = int(hit_object[2])
                    hit_type = hit_type_encoder(int(hit_object[3]))
                    
                    if hit_type == 0:
                        # Current object is a hit circle
                        time_diffs.append(time - time_prev)
                        space_diffs.append(np.sqrt((x - x_prev) ** 2 + (y - y_prev) ** 2))
                        
                        # Calculate direction change, but check for division by zero (if hit objects are at the same position)
                        if space_diffs[-1] == 0:
                            direction_diffs.append(0)
                        else:
                            direction_diffs.append(np.arctan2(y - y_prev, x - x_prev))
                        # Jump
                        type.append(hit_type_prev)
                    elif hit_type == 1:
                        # Current object is a slider
                        # Two calculations are needed. The first move is a jump (either from a slider-end or a hit circle)
                        # The second move is the slider movement itself
                        # For linear sliders, one additional point is given
                        # For perfect circles, the slider has two additional points
                        # For Bezier curves, the number of points can be arbitrary
                        # FIRST ROUGH IMPLEMENTATION: Approximate each slider part as a straight line
                        
                        time_diffs.append(time - time_prev)
                        space_diffs.append(np.sqrt((x - x_prev) ** 2 + (y - y_prev) ** 2))
                        
                        # Calculate direction change, but check for division by zero (if hit objects are at the same position)
                        if space_diffs[-1] == 0:
                            direction_diffs.append(0)
                        else:
                            direction_diffs.append(np.arctan2(y - y_prev, x - x_prev))
                        # Jump (from slider-end or hit circle)
                        type.append(hit_type_prev)
                        
                        # Now calculate the slider movement(s)
                        type_of_slider = hit_object[5][0]
                        if type_of_slider == "L":
                            # Linear slider
                            # Format: ...,L|x1:y1,...
                            # The slider has one additional point
                            coordinates = hit_object[5].split("|")[1]
                            x_end = int(coordinates.split(":")[0])
                            y_end = int(coordinates.split(":")[1])
                            time_end = int(hit_object[6])
                            time_diffs.append(time_end - time)
                            space_diffs.append(np.sqrt((x_end - x) ** 2 + (y_end - y) ** 2))
                            if space_diffs[-1] == 0:
                                direction_diffs.append(0)
                            else:
                                direction_diffs.append(np.arctan2(y_end - y, x_end - x))
                            # Slider
                            type.append(1)
                    
                    x_prev = x
                    y_prev = y
                    time_prev = time
                    hit_type_prev = hit_type
    