In [None]:
from datasets import load_dataset

In [None]:
#dataset = load_dataset('Simon-Kotchou/lichess-puzzles')
#dataset

In [None]:
from datasets import Dataset, Features, ClassLabel, Sequence, Value
import requests
import zstandard as zstd
import chess.pgn
import io

# Function to read URLs from a text file
def read_urls_from_file(file_path):
    """Collect the non-blank lines of a text file.

    Args:
        file_path: Path of the text file to read (one URL per line).

    Returns:
        A list with every line that is not empty after stripping, in file
        order, with leading and trailing whitespace removed.
    """
    collected = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            # Blank or whitespace-only lines carry no URL, so skip them.
            if not cleaned:
                continue
            collected.append(cleaned)
    return collected

def pgn_data_generator(urls):
    """Stream zstd-compressed PGN archives and yield one record per game.

    Each URL is fetched with a streaming HTTP GET, decompressed on the fly,
    decoded as UTF-8 and parsed game by game, so only the game currently
    being parsed has to be held in memory.

    Args:
        urls: Iterable of URLs pointing at zstd-compressed PGN files.

    Yields:
        dict: The headers 'Event', 'Site', 'Date', 'Round', 'White', 'Black',
        'Result', 'UTCDate' and 'UTCTime' (the string 'Unknown' when a header
        is absent), plus 'Moves', the main line rendered in standard
        algebraic notation from the game's starting position.

    Raises:
        requests.HTTPError: If a URL answers with an HTTP error status.
    """
    header_keys = ('Event', 'Site', 'Date', 'Round', 'White', 'Black',
                   'Result', 'UTCDate', 'UTCTime')
    dctx = zstd.ZstdDecompressor()
    for url in urls:
        with requests.get(url, stream=True) as response:
            response.raise_for_status()
            with dctx.stream_reader(response.raw) as reader:
                # Hand the decoding wrapper straight to the parser: read_game
                # only needs a text-mode handle, so there is no reason to
                # materialise the whole decompressed archive in a StringIO.
                pgn_file = io.TextIOWrapper(reader, encoding='utf-8')
                while True:
                    game = chess.pgn.read_game(pgn_file)
                    if game is None:
                        # read_game returns None once the stream is exhausted.
                        break
                    # Providing default values for missing fields
                    game_info = {key: game.headers.get(key, 'Unknown')
                                 for key in header_keys}
                    game_info["Moves"] = game.board().variation_san(game.mainline_moves())
                    yield game_info

# Function to create a Hugging Face dataset from the generator
def create_pgn_dataset(file_path, url_index=5):
    """Build a Hugging Face ``Dataset`` of games from PGN archive URLs.

    Args:
        file_path: Path of the text file listing the archive URLs, one per
            line (read with ``read_urls_from_file``).
        url_index: Position in the URL list of the single archive to
            process. Defaults to 5, the index this notebook has always used.
            Pass ``None`` to process every URL in the file.

    Returns:
        A ``datasets.Dataset`` whose rows are the records yielded by
        ``pgn_data_generator``; every column is a string.

    Raises:
        IndexError: If ``url_index`` is outside the range of the URL list.
    """
    all_urls = read_urls_from_file(file_path)
    if url_index is None:
        urls = all_urls
    else:
        try:
            urls = [all_urls[url_index]]
        except IndexError:
            raise IndexError(
                f"url_index {url_index} is out of range: "
                f"{file_path!r} contains {len(all_urls)} URL(s)"
            ) from None

    # Define the dataset features
    features = Features({
        'Event': Value('string'),
        'Site': Value('string'),
        'Date': Value('string'),
        'Round': Value('string'),
        'White': Value('string'),
        'Black': Value('string'),
        'Result': Value('string'),
        'Moves': Value('string'),
        'UTCDate': Value('string'),
        'UTCTime': Value('string')
    })

    # Create the dataset from the generator. The URLs are passed through
    # gen_kwargs rather than captured in a closure, so the generator's
    # inputs are explicit to `datasets`.
    return Dataset.from_generator(
        generator=pgn_data_generator,
        features=features,
        gen_kwargs={'urls': urls},
    )

In [None]:
# Path to the text file that lists the archive URLs, one per line.
file_path = '../data/960_urls.txt'  # Relative to the working directory; update this path as necessary

# Build the Hugging Face dataset from the URLs in that file.
# This downloads and parses PGN data over the network, so the cell may take a while.
dataset = create_pgn_dataset(file_path)

# Sanity check: fetch the first record of the dataset and print it.
first_item = dataset[0]
print(first_item)

In [None]:
# Upload the dataset built above to the Hub repository 'Simon-Kotchou/Lichess-960'
# under the split name 'part_202309'.
# NOTE(review): the split name presumably identifies the archive that was processed —
# confirm it matches the URL selected in create_pgn_dataset before re-running.
# NOTE(review): presumably requires being logged in to the Hugging Face Hub — confirm.
dataset.push_to_hub('Simon-Kotchou/Lichess-960', split='part_202309')