In [47]:
!sudo apt-get update
!sudo apt-get install stockfish

Hit:1 https://packages.microsoft.com/repos/microsoft-ubuntu-focal-prod focal InRelease
Hit:2 https://dl.yarnpkg.com/debian stable InRelease                           
Hit:3 https://repo.anaconda.com/pkgs/misc/debrepo/conda stable InRelease       
Hit:4 http://security.ubuntu.com/ubuntu focal-security InRelease               
Hit:5 http://archive.ubuntu.com/ubuntu focal InRelease                         
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [128 kB] 
Hit:7 http://archive.ubuntu.com/ubuntu focal-backports InRelease               
Hit:8 https://packagecloud.io/github/git-lfs/ubuntu focal InRelease
Fetched 128 kB in 1s (122 kB/s)
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
stockfish is already the newest version (11-1build1).
0 upgraded, 0 newly installed, 0 to remove and 85 not upgraded.


In [1]:
!pip install --upgrade pip
!pip install chess zstandard



Edit pgn file to remove unnecessary details

In [49]:
import re

def reinstate_move_numbers(moves):
    tokens = moves.strip().split()
    result = []
    for i in range(0, len(tokens), 2):
        move_number = i // 2 + 1
        if i + 1 < len(tokens):
            result.append(f"{move_number}. {tokens[i]} {tokens[i+1]}")
        else:
            result.append(f"{move_number}. {tokens[i]}")
    return " ".join(result)

def clean_and_reformat_pgn(pgn_text):
    games = re.split(r'\n\s*\n', pgn_text.strip())

    cleaned_games = []

    for game in games:
        # Trenne Header und Züge
        header_lines = []
        move_lines = []

        for line in game.strip().splitlines():
            if line.startswith('['):
                header_lines.append(line)
            else:
                move_lines.append(line)

        header = "\n".join(header_lines)
        moves_raw = " ".join(move_lines)

        # Remove unnecessary symbols (information)
        moves = re.sub(r"\{[^}]*\}", "", moves_raw)
        
        moves = re.sub(r"\d+\.\.\.", "", moves)
        # Entferne alle Zugnummern (z. B. 1.)
        moves = re.sub(r"\d+\.", "", moves)
        # Reduziere Leerzeichen
        moves = re.sub(r"\s+", " ", moves).strip()
        # Setze Zugnummern korrekt neu
        moves_numbered = reinstate_move_numbers(moves)

        cleaned_game = f"{header}\n\n{moves_numbered}"
        cleaned_games.append(cleaned_game)

    return "\n\n".join(cleaned_games)

# Datei laden
with open("/workspaces/Chess_Stockfish/lichess_sample_10000 (1).pgn", "r", encoding="utf-8") as file:
    raw_pgn = file.read()

# Bereinigen und umformatieren
cleaned_pgn = clean_and_reformat_pgn(raw_pgn)

# In Datei schreiben
with open("/workspaces/Chess_Stockfish/cleaned.pgn", "w", encoding="utf-8") as out_file:
    out_file.write(cleaned_pgn)



Reduce empty lines in pgn

In [50]:
def reduce_blank_lines_in_pgn(input_path, output_path):
    with open(input_path, "r", encoding="utf-8") as infile:
        lines = infile.readlines()

    cleaned_lines = []
    current_game = []
    blank_line_count = 0

    def flush_game(game_lines):
        """Schreibe ein einzelnes Spiel mit genau einer Leerzeile nach dem Header."""
        cleaned = []
        header_ended = False
        blank_line_count = 0
        for line in game_lines:
            stripped = line.strip()
            if stripped.startswith("["):  
                cleaned.append(line)
                header_ended = False
                blank_line_count = 0
            elif stripped == "":
                if not header_ended:
                    if blank_line_count == 0:
                        cleaned.append("\n")
                    blank_line_count += 1
                    header_ended = True
            else:
                cleaned.append(line)
                blank_line_count = 0
        cleaned.append("\n")  # <== Spiel-Ende: genau eine Leerzeile nach dem Spiel
        return cleaned

    for line in lines:
        if line.strip() == "" and not current_game:
            # Leere Zeile vor einem Spielanfang -> überspringen
            continue
        current_game.append(line)
        if line.strip().startswith("[Event") and len(current_game) > 1:
            # Neues Spiel beginnt -> vorheriges Spiel verarbeiten
            cleaned_lines.extend(flush_game(current_game[:-1]))
            current_game = [line]

    if current_game:
        cleaned_lines.extend(flush_game(current_game))

    with open(output_path, "w", encoding="utf-8") as outfile:
        outfile.writelines(cleaned_lines)
reduce_blank_lines_in_pgn("/workspaces/Chess_Stockfish/cleaned.pgn", "cleaned_one_line.pgn")

Funktion to analyse every game 

In [13]:
import chess
import chess.pgn

#Elo scales, 1978 FIDE 
def get_elo_bucket(elo):
    if elo < 1000:
        return "beginner" #Novices 
    elif elo < 1400:
        return "intermediate" #Class E and D
    elif elo < 1800:
        return "club_player" #Class C and B
    elif elo < 2200:
        return "advanced" #Class A and Experts
    else:
        return "expert" #Masters
    
def result_to_label(result, player):
    if result == "1-0":
        return "win" if player == "white" else "loss"
    elif result == "0-1":
        return "win" if player == "black" else "loss"
    elif result == "1/2-1/2":
        return "draw"
    else:
        return "unknown"    

def analyze_game(game, engine, game_nr, depth=19):
    board = game.board()
    data = []

    white_elo = int(game.headers.get("WhiteElo", 0))
    black_elo = int(game.headers.get("BlackElo", 0))
    result = game.headers.get("Result", "*")  #"1-0", "0-1", "1/2-1/2"

    for ply, move in enumerate(game.mainline_moves()):
        san_move = board.san(move)

        # Bewertung vor dem Zug
        info_before = engine.analyse(board, chess.engine.Limit(depth=depth))
        score_before = info_before["score"].white().score(mate_score=10000) if not info_before["score"].is_mate() else 10000

        # Engine-Zug berechnen
        best_move = info_before["pv"][0] if "pv" in info_before else None
        if best_move:
            best_board = board.copy()
            best_board.push(best_move)
            info_best = engine.analyse(best_board, chess.engine.Limit(depth=depth))
            score_best = info_best["score"].white().score(mate_score=10000) if not info_best["score"].is_mate() else 10000
        else:
            score_best = None

        # Tatsächlichen Zug spielen
        board.push(move)
        info_after = engine.analyse(board, chess.engine.Limit(depth=depth))
        score_after = info_after["score"].white().score(mate_score=10000) if not info_after["score"].is_mate() else 10000

        # Bewertung
        if best_move == move:
            move_quality = 0
            if ply % 2 == 0:
                player = "white"
                elo_level = get_elo_bucket(white_elo)
            else:
                player = "black" 
                elo_level = get_elo_bucket(black_elo)  
        elif ply % 2 == 0:
            player = "white"
            elo_level = get_elo_bucket(white_elo)
            move_quality = score_after - score_best if score_best is not None else None
        else:
            player = "black"
            elo_level = get_elo_bucket(black_elo)
            move_quality = score_best - score_after if score_best is not None else None


        total_plies = len(list(game.mainline_moves()))
        if ply < total_plies * 0.3:
            phase = 'opening'
        elif ply < total_plies * 0.7:
            phase = 'middlegame'
        else:
            phase = 'endgame'

        data.append({
            "game_nr": game_nr,
            "ply": ply,
            "move": san_move,
            "move_quality": move_quality,
            "player": player,
            "elo_level": elo_level,
            "phase": phase,
            "result": result_to_label(result, player)
        })

    return data


In [12]:
import json
import os
import chess
import chess.engine
import chess.pgn

DATA_FILE_2 = "chess_data_all_phases.json"
PGN_FILE = "/workspaces/Chess_Stockfish/cleaned_one_line.pgn"
ENGINE_PATH = "/usr/games/stockfish"
MAX_GAMES = 1000

def save_data(new_data):
    if os.path.exists(DATA_FILE_2):
        with open(DATA_FILE_2, "r") as f:
            existing_data = json.load(f)
    else:
        existing_data = []
    existing_data.extend(new_data)
    with open(DATA_FILE_2, "w") as f:
        json.dump(existing_data, f, indent=2)

def load_data():
    if os.path.exists(DATA_FILE_2):
        with open(DATA_FILE_2, "r") as f:
            return json.load(f)
    else:
        return []

# === Main ===
engine = chess.engine.SimpleEngine.popen_uci(ENGINE_PATH)
all_data = load_data()

# Determine last analyzed game number
if all_data:
    last_game_nr = max(d["game_nr"] for d in all_data)
else:
    last_game_nr = 0

with open(PGN_FILE) as pgn:
    # Skip previously analyzed games
    for _ in range(last_game_nr):
        if chess.pgn.read_game(pgn) is None:
            break  # End of file reached

    game_count = 0
    while game_count < MAX_GAMES:
        game = chess.pgn.read_game(pgn)
        if game is None:
            break  # No more games

        current_game_nr = last_game_nr + game_count + 1
        print(f"Analysiere Spiel {current_game_nr}...")

        try:
            new_game_data = analyze_game(game, engine, game_nr=current_game_nr)

            expected_plies = len(list(game.mainline_moves()))
            if len(new_game_data) == expected_plies:
                save_data(new_game_data)
                print(f"Spiel {current_game_nr} gespeichert.")
            else:
                print(f"Spiel {current_game_nr} NICHT gespeichert – Analyse unvollständig.")

            game_count += 1
        except KeyboardInterrupt:
            print("Analyse abgebrochen. Spiel wird nicht gespeichert.")
            break

engine.quit()


Analysiere Spiel 1...
Spiel 1 gespeichert.
Analysiere Spiel 2...
Analyse abgebrochen. Spiel wird nicht gespeichert.


Game Opening Function

In [6]:
#Opening
import chess
import chess.pgn

#Elo scales, 1978 FIDE 
def get_elo_bucket(elo):
    if elo < 1000:
        return "beginner" #Novices 
    elif elo < 1400:
        return "intermediate" #Class E and D
    elif elo < 1800:
        return "club_player" #Class C and B
    elif elo < 2200:
        return "advanced" #Class A and Experts
    else:
        return "expert" #Masters
    
def result_to_label(result, player):
    if result == "1-0":
        return "win" if player == "white" else "loss"
    elif result == "0-1":
        return "win" if player == "black" else "loss"
    elif result == "1/2-1/2":
        return "draw"
    else:
        return "unknown"    


def analyze_opening(game, engine, game_nr, depth=20):
    board = game.board()
    moves = []

    white_elo = int(game.headers.get("WhiteElo", 0))
    black_elo = int(game.headers.get("BlackElo", 0))
    result = game.headers.get("Result", "*")

    total_plies = len(list(game.mainline_moves()))
    opening_limit = int(total_plies * 0.3)
    sequence = []
    engine_sequence = []

    for ply, move in enumerate(game.mainline_moves()):
        if ply >= opening_limit:
            break

        san_move = board.san(move)
        sequence.append(san_move)

        # Bewertung vor dem Zug
        info_before = engine.analyse(board, chess.engine.Limit(depth=depth))
        best_move = info_before["pv"][0] if "pv" in info_before else None

        if best_move:
            best_board = board.copy()
            best_board.push(best_move)
            info_best = engine.analyse(best_board, chess.engine.Limit(depth=depth))
            score_best = info_best["score"].white().score(mate_score=10000) if not info_best["score"].is_mate() else 10000
            engine_sequence.append(board.san(best_move))
        else:
            score_best = None
            engine_sequence.append(None)

        board.push(move)
        info_after = engine.analyse(board, chess.engine.Limit(depth=depth))
        score_after = info_after["score"].white().score(mate_score=10000) if not info_after["score"].is_mate() else 10000
        
        if best_move == move:
            move_quality = 0
            if ply % 2 == 0:
                player = "white"
            else:
                player = "black"   
        elif ply % 2 == 0:
            player = "white"
            move_quality = score_after - score_best if score_best is not None else None
        else:
            player = "black"
            move_quality = score_best - score_after if score_best is not None else None

        moves.append({
            "ply": ply,
            "move": san_move,
            "move_quality": move_quality,
            "player": player
        })

    return {
        "game_nr": game_nr,
        "white_elo": white_elo,
        "black_elo": black_elo,
        "elo_level_white": get_elo_bucket(white_elo),
        "elo_level_black": get_elo_bucket(black_elo),
        "result": result,
        "human_opening_sequence": " ".join(sequence),
        "engine_opening_sequence": " ".join(engine_sequence),
        "moves": moves
    }



In [34]:
import json
import os
import chess
import chess.engine
import chess.pgn
from io import StringIO

DATA_FILE = "chess_data_opening.json"

def save_data(new_data):
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, "r") as f:
            existing_data = json.load(f)
    else:
        existing_data = []

    existing_data.extend(new_data)

    with open(DATA_FILE, "w") as f:
        json.dump(existing_data, f, indent=2)

def load_data():
    if os.path.exists(DATA_FILE):
        with open(DATA_FILE, "r") as f:
            return json.load(f)
    else:
        return []

# === Hauptprogramm ===
PGN_FILE = "/workspaces/Chess_Stockfish/cleaned_one_line.pgn"
engine = chess.engine.SimpleEngine.popen_uci("/usr/games/stockfish")

MAX_GAMES = 10000
game_count = 0

# Bestehende Daten laden
all_data = load_data()

# Letzter gespeicherter game_nr ermitteln
if all_data:
    last_game_nr = max(d["game_nr"] for d in all_data)
else:
    last_game_nr = 0
#
current_game_nr = last_game_nr
#

with open(PGN_FILE) as pgn:
#
    for _ in range(current_game_nr):
        chess.pgn.read_game(pgn)

#

    while game_count < MAX_GAMES:
        game = chess.pgn.read_game(pgn)
        if game is None:
            break

        game_count += 1
        current_game_nr = last_game_nr + game_count
        print(f"Analysiere Spiel {current_game_nr}...")

        try:
            new_game_data = analyze_opening(game, engine, game_nr=current_game_nr)

            # Expected plies in opening phase (like in analyze_opening)
            total_plies = len(list(game.mainline_moves()))
            opening_limit = int(total_plies * 0.3)

            # Check if all opening moves were analyzed (length of moves list inside new_game_data)
            if len(new_game_data["moves"]) == opening_limit:
                save_data([new_game_data])  # Save as a list of one item
                print(f"Spiel {current_game_nr} gespeichert.")
            else:
                print(f"Spiel {current_game_nr} NICHT gespeichert – Analyse unvollständig.")
        except KeyboardInterrupt:
            print("Analyse abgebrochen. Spiel wird nicht gespeichert.")
            break

engine.quit()


Analysiere Spiel 342...
Spiel 342 gespeichert.
Analysiere Spiel 343...
Spiel 343 gespeichert.
Analysiere Spiel 344...
Spiel 344 gespeichert.
Analysiere Spiel 345...
Spiel 345 gespeichert.
Analysiere Spiel 346...
Spiel 346 gespeichert.
Analysiere Spiel 347...
Spiel 347 gespeichert.
Analysiere Spiel 348...
Spiel 348 gespeichert.
Analysiere Spiel 349...
Spiel 349 gespeichert.
Analysiere Spiel 350...
Spiel 350 gespeichert.
Analysiere Spiel 351...
Spiel 351 gespeichert.
Analysiere Spiel 352...
Spiel 352 gespeichert.
Analysiere Spiel 353...
Spiel 353 gespeichert.
Analysiere Spiel 354...
Spiel 354 gespeichert.
Analysiere Spiel 355...
Spiel 355 gespeichert.
Analysiere Spiel 356...
Spiel 356 gespeichert.
Analysiere Spiel 357...
Spiel 357 gespeichert.
Analysiere Spiel 358...
Spiel 358 gespeichert.
Analysiere Spiel 359...
Spiel 359 gespeichert.
Analysiere Spiel 360...
Spiel 360 gespeichert.
Analysiere Spiel 361...
Spiel 361 gespeichert.
Analysiere Spiel 362...
Spiel 362 gespeichert.
Analysiere Sp

In [7]:
import json

DATA_FILE_2 = "chess_data_all_phases.json"

# Load the data
with open(DATA_FILE_2, "r") as f2:
    all_data = json.load(f2)

•Opening Theory Compliance – Do players follow book openings correctly, or do they deviate early?•Strategic Conformance – How well do players adhere to established positional and tactical guidelines?
•Endgame Fundamentals – Are winning endgame techniques executed correctly, or do deviations impact results?•Skill-Based Differences – Do higher-rated players exhibit greater conformance, and how does deviation affect performance

opening phase already analyzed avg move_quality

percentage of different elo levels beeing black and white

percentage of players range certain deviations eg difference more than 100, 100-90...: how does avg move_quality affect win/loss regardless of elo_level

compare all move_qualities of div elo levels, dif phases and whole game to percentage games won by different elo levels

Because of stockfish the later the game phase the better move_quality??

Whole game 

In [29]:
move_qualities = [entry["move_quality"] for entry in all_data if entry["move_quality"] is not None]
avg_move_quality = sum(move_qualities) / len(move_qualities)
print(f"Average move_quality whole game all elo levels: {avg_move_quality}")

move_qualities_white = [entry["move_quality"] for entry in all_data if entry["move_quality"] is not None and entry["player"] == 'white']
avg_move_quality_white = sum(move_qualities_white) / len(move_qualities_white)
print(f"Average move_quality whole game all elo levels but only white players: {avg_move_quality_white}")

move_qualities_black = [entry["move_quality"] for entry in all_data if entry["move_quality"] is not None and entry["player"] == 'black']
avg_move_quality_black = sum(move_qualities_black) / len(move_qualities_black)
print(f"Average move_quality whole game all elo levels but only black players: {avg_move_quality_black}")

Average move_quality whole game all elo levels: -62.39818016115266
Average move_quality whole game all elo levels but only white players: -39.97016082711085
Average move_quality whole game all elo levels but only black players: -85.16157052496575


In [12]:
total_white = len([entry["game_nr"] for entry in all_data if entry["ply"] == 0])
elo_levels = ["beginner", "intermediate", "club_player", "advanced", "expert"]
for level in elo_levels:
    number_games_elo_level_white = len([entry["game_nr"] for entry in all_data if entry["elo_level"] == level and entry["ply"] == 0])
    number_games_elo_level =  len([entry["game_nr"] for entry in all_data if entry["elo_level"] == level and entry["ply"] < 2])
    percentage_elo_level_white = number_games_elo_level_white / number_games_elo_level
    print(f"Percentage of players with elo_level {level} being player white: {percentage_elo_level_white}")    

Percentage of players with elo_level beginner being player white: 0.5037037037037037
Percentage of players with elo_level intermediate being player white: 0.5040214477211796
Percentage of players with elo_level club_player being player white: 0.4931506849315068
Percentage of players with elo_level advanced being player white: 0.5053272450532724
Percentage of players with elo_level expert being player white: 0.5185185185185185


Percentage of games won by players with different move quality in their first turn

In [13]:
thresholds = [-100, -75, -50, -25, -10]
for threshold in thresholds:
    number_games_win = len([
        entry["game_nr"] for entry in all_data
        if entry['result'] == 'win' and entry["ply"] < 2 and entry['move_quality'] > threshold
    ])
    total_games = len([
        entry["game_nr"] for entry in all_data
        if entry["ply"] < 2 and entry['move_quality'] > threshold
    ])
    percentage_games_won = number_games_win / total_games if total_games > 0 else 0
    print(f"Number of games won all elo levels with move_quality first turn over {threshold}: {number_games_win}")
    print(f"Percentage of games won all elo levels with move_quality first turn over {threshold}: {percentage_games_won}")
    print('')


Number of games won all elo levels with move_quality first turn over -100: 958
Percentage of games won all elo levels with move_quality first turn over -100: 0.4823766364551863

Number of games won all elo levels with move_quality first turn over -75: 920
Percentage of games won all elo levels with move_quality first turn over -75: 0.48092002090956615

Number of games won all elo levels with move_quality first turn over -50: 791
Percentage of games won all elo levels with move_quality first turn over -50: 0.47393648891551826

Number of games won all elo levels with move_quality first turn over -25: 566
Percentage of games won all elo levels with move_quality first turn over -25: 0.4502784407319014

Number of games won all elo levels with move_quality first turn over -10: 396
Percentage of games won all elo levels with move_quality first turn over -10: 0.47653429602888087




Average move_qualities of players with different elo levels for all phases of the game

In [14]:
elo_levels = ["beginner", "intermediate", "club_player", "advanced", "expert"]

for level in elo_levels:
    move_qualities = [ entry["move_quality"] for entry in all_data
        if entry["move_quality"] is not None and entry['elo_level'] == level
    ]
    avg_move_quality = sum(move_qualities) / len(move_qualities) if move_qualities else 0
    print(f"Average move_quality whole game {level}: {avg_move_quality}")

Average move_quality whole game beginner: -3.4766780432309443
Average move_quality whole game intermediate: -28.022786121180737
Average move_quality whole game club_player: -85.4731924741782
Average move_quality whole game advanced: -63.99712368168744
Average move_quality whole game expert: -92.48602106969206


Percentage of games won by players of different elo levels

In [8]:
elo_levels = ['beginner', 'intermediate', 'club_player', 'advanced', 'expert']
#white
for level in elo_levels:
    games = {entry["game_nr"] for entry in all_data if entry["ply"] < 1 and entry["elo_level"] == level}
    wins = {entry["game_nr"] for entry in all_data if entry["ply"] < 1 and entry["elo_level"] == level and entry['result'] == 'win'}
    
    num_games = len(games)
    num_wins = len(wins)
    percentage = num_wins / num_games if num_games > 0 else 0

    print(f"Number of games won {level}: {num_wins}")
    print(f"Percentage of games won {level}: {percentage}")
    print('')


Number of games won beginner: 0
Percentage of games won beginner: 0

Number of games won intermediate: 0
Percentage of games won intermediate: 0

Number of games won club_player: 1
Percentage of games won club_player: 0.5

Number of games won advanced: 0
Percentage of games won advanced: 0.0

Number of games won expert: 0
Percentage of games won expert: 0



Phase Opening

In [16]:
move_qualities_opening = [entry["move_quality"] for entry in all_data if entry["move_quality"] is not None and entry["phase"] == 'opening']
avg_move_quality_opening = sum(move_qualities_opening) / len(move_qualities_opening)
print(f"Average move_quality opening all elo levels: {avg_move_quality_opening}")

Average move_quality opening all elo levels: -46.518992120039634


In [17]:
elo_levels = ['beginner', 'intermediate', 'club_player', 'advanced', 'expert']

for level in elo_levels:
    move_qualities_opening = [
        entry["move_quality"]
        for entry in all_data
        if entry["move_quality"] is not None and
           entry["elo_level"] == level and
           entry["phase"] == "opening"
    ]

    if move_qualities_opening:  # schützt vor Division durch 0
        avg_move_quality = sum(move_qualities_opening) / len(move_qualities_opening)
        print(f"Average move_quality opening {level}: {avg_move_quality}")
    else:
        print(f"No data for opening phase in elo level {level}.")

Average move_quality opening beginner: -55.858823529411765
Average move_quality opening intermediate: -44.457758412006775
Average move_quality opening club_player: -45.939876957494405
Average move_quality opening advanced: -47.36063708759954
Average move_quality opening expert: -42.590250329380765


Phase Middlegame

In [18]:
move_qualities_middle = [entry["move_quality"] for entry in all_data if entry["move_quality"] is not None and entry["phase"] == 'middlegame']
avg_move_quality_middle = sum(move_qualities_middle) / len(move_qualities_middle)
print(f"Average move_quality middlegame all elo levels: {avg_move_quality_middle}")

Average move_quality middlegame all elo levels: -94.85671976017626


In [19]:
elo_levels = ['beginner', 'intermediate', 'club_player', 'advanced', 'expert']

for level in elo_levels:
    move_qualities_middlegame = [
        entry["move_quality"]
        for entry in all_data
        if entry["move_quality"] is not None and
           entry["elo_level"] == level and
           entry["phase"] == "middlegame"
    ]

    if move_qualities_middlegame:  # schützt vor Division durch 0
        avg_move_quality = sum(move_qualities_middlegame) / len(move_qualities_middlegame)
        print(f"Average move_quality middlegame {level}: {avg_move_quality}")
    else:
        print(f"No data for middlegame phase in elo level {level}.")

Average move_quality middlegame beginner: -107.48307515777395
Average move_quality middlegame intermediate: -113.97585698488759
Average move_quality middlegame club_player: -88.68519508284339
Average move_quality middlegame advanced: -87.92431491953023
Average move_quality middlegame expert: -92.68937468225724


Endgame

In [20]:
move_qualities_endgame = [entry["move_quality"] for entry in all_data if entry["move_quality"] is not None and entry["phase"] == 'endgame']
avg_move_quality_endgame = sum(move_qualities_endgame) / len(move_qualities_endgame)
print(f"Average move_quality endgame all elo levels: {avg_move_quality_endgame}")

Average move_quality endgame all elo levels: -34.6334765644293


In [25]:
elo_levels = ['beginner', 'intermediate', 'club_player', 'advanced', 'expert']

for level in elo_levels:
    move_qualities_endgame = [
        entry["move_quality"]
        for entry in all_data
        if entry["move_quality"] is not None and
           entry["elo_level"] == level and
           entry["phase"] == "endgame"
    ]

    if move_qualities_endgame:  # schützt vor Division durch 0
        avg_move_quality = sum(move_qualities_endgame) / len(move_qualities_endgame)
        print(f"Average move_quality endgame {level}: {avg_move_quality}")
    else:
        print(f"No data for endgame phase in elo level {level}.")

Average move_quality endgame beginner: 191.97445820433435
Average move_quality endgame intermediate: 106.8949494949495
Average move_quality endgame club_player: -122.4925285672429
Average move_quality endgame advanced: -48.65823161655255
Average move_quality endgame expert: -144.41006202618883


Percentages of games won with different ranges of avg move_quality

In [24]:
number = [-10000,-200, -150, -100, -50, 0]
number2  = [-200, -150, -100, -50, 0, 10000]

count = 0
count2 = 0
for n in range(6):
    for game_nr in range(10001):
        if game_nr == 0:
            continue
        move_qualities_dif_game_nr = [
        entry["move_quality"]
        for entry in all_data
        if entry["game_nr"] == game_nr
        and entry["player"] == 'white'
        and entry['result'] == 'win'
        ]
        #avg_move_quality_game_nr = None
        if move_qualities_dif_game_nr:
            avg_move_quality_game_nr = sum(move_qualities_dif_game_nr) / len(move_qualities_dif_game_nr)
        if avg_move_quality_game_nr <= number2[n] and avg_move_quality_game_nr > number[n]:
           count += 1

        move_qualities_dif_game_nr = [
        entry["move_quality"]
        for entry in all_data
        if entry["game_nr"] == game_nr
        and entry["player"] == 'white'
        ]
        #avg_move_quality_game_nr = None
        if move_qualities_dif_game_nr:
            avg_move_quality_game_nr = sum(move_qualities_dif_game_nr) / len(move_qualities_dif_game_nr)
        if avg_move_quality_game_nr <= number2[n] and avg_move_quality_game_nr > number[n]:
           count2 += 1
    percentage = count/count2 if count2 > 0 else 0
    print(f"Percentage of games won with average move_quality - {number[n]} < move_quality =< {number2[n]}: {percentage} ")



Percentage of games won with average move_quality - -10000 < move_quality =< -200: 0.9310344827586207 
Percentage of games won with average move_quality - -200 < move_quality =< -150: 0.9298245614035088 
Percentage of games won with average move_quality - -150 < move_quality =< -100: 0.9915730337078652 
Percentage of games won with average move_quality - -100 < move_quality =< -50: 1.0082236842105263 
Percentage of games won with average move_quality - -50 < move_quality =< 0: 1.0002042483660132 
Percentage of games won with average move_quality - 0 < move_quality =< 10000: 1.0 


•Opening Theory Compliance – Do players follow book openings correctly, or do they deviate early?

•Strategic Conformance – How well do players adhere to established positional and tactical guidelines?

•Endgame Fundamentals – Are winning endgame techniques executed correctly, or do deviations impact results?

•Skill-Based Differences – Do higher-rated players exhibit greater conformance, and how does deviation affect performance

In [1]:
import json

DATA_FILE = "chess_data_opening.json"

# Load the data
with open(DATA_FILE, "r") as f:
    data = json.load(f)

In [28]:
# Initialize counters
total_quality = 0
count = 0
# Iterate through games
for game in data:
    moves = game.get("moves", [])
    for move in moves:
        mq = move.get("move_quality")
        if mq is not None:
            total_quality += mq
            count += 1

# Calculate and print average
if count > 0:
    avg_quality = total_quality / count
    print(f"Average move quality across all games: {avg_quality:.2f}")
else:
    print("No move_quality data found.")

Average move quality across all games: -38.30


In [29]:
elo_levels = ["beginner", "intermediate", "club_player", "advanced", "expert"]

for level in elo_levels:
    total_quality = 0
    count = 0

    # Iterate through games
    for game in data:
        if game.get("elo_level_white") == level:
            moves = game.get("moves", [])
            for move in moves:
                if move.get("player") == "white":
                    mq = move.get("move_quality")
                    if mq is not None:
                        total_quality += mq
                        count += 1

    # Calculate and print average
    if count > 0:
        avg_quality_white = total_quality / count
        print(f"White - {level}: Average move quality = {avg_quality_white:.2f}")
    else:
        print(f"White - {level}: No move_quality data found.")


White - beginner: Average move quality = -59.60
White - intermediate: Average move quality = -38.34
White - club_player: Average move quality = -37.55
White - advanced: Average move quality = -33.14
White - expert: Average move quality = -40.21


In [30]:
elo_levels = ["beginner", "intermediate", "club_player", "advanced", "expert"]

for level in elo_levels:
    total_quality = 0
    count = 0

    # Iterate through games
    for game in data:
        if game.get("elo_level_black") == level:
            moves = game.get("moves", [])
            for move in moves:
                if move.get("player") == "black":
                    mq = move.get("move_quality")
                    if mq is not None:
                        total_quality += mq
                        count += 1

    # Calculate and print average
    if count > 0:
        avg_quality = total_quality / count
        print(f"Black - {level}: Average move quality = {avg_quality:.2f}")
    else:
        print(f"Black - {level}: No move_quality data found.")


Black - beginner: Average move quality = -35.67
Black - intermediate: Average move quality = -32.83
Black - club_player: Average move quality = -40.60
Black - advanced: Average move quality = -41.06
Black - expert: Average move quality = -41.97


In [31]:
elo_levels = ["beginner", "intermediate", "club_player", "advanced", "expert"]

for level in elo_levels:
    total_quality = 0
    count = 0

    for game in data:
        # For white player
        if game.get("elo_level_white") == level:
            for move in game.get("moves", []):
                if move.get("player") == "white":
                    mq = move.get("move_quality")
                    if mq is not None:
                        total_quality += mq
                        count += 1

        # For black player
        if game.get("elo_level_black") == level:
            for move in game.get("moves", []):
                if move.get("player") == "black":
                    mq = move.get("move_quality")
                    if mq is not None:
                        total_quality += mq
                        count += 1

    # Output combined result
    if count > 0:
        avg_quality = total_quality / count
        print(f"{level}: Combined average move quality (white + black) = {avg_quality:.2f}")
    else:
        print(f"{level}: No move_quality data found.")


beginner: Combined average move quality (white + black) = -46.73
intermediate: Combined average move quality (white + black) = -35.58
club_player: Combined average move quality (white + black) = -39.03
advanced: Combined average move quality (white + black) = -36.96
expert: Combined average move quality (white + black) = -41.01


In [32]:
elo_levels = ["beginner", "intermediate", "club_player", "advanced", "expert"]

# Initialize counters
wins_by_level = {level: 0 for level in elo_levels}
games_by_level = {level: 0 for level in elo_levels}

for game in data:
    result = game.get("result")
    white_level = game.get("elo_level_white")
    black_level = game.get("elo_level_black")

    # Count white player's result
    if white_level in elo_levels:
        games_by_level[white_level] += 1
        if result == "1-0":  # white wins
            wins_by_level[white_level] += 1

    # Count black player's result
    if black_level in elo_levels:
        games_by_level[black_level] += 1
        if result == "0-1":  # black wins
            wins_by_level[black_level] += 1

# Output results
for level in elo_levels:
    wins = wins_by_level[level]
    total = games_by_level[level]
    percentage = wins / total if total > 0 else 0
    print(f"Number of games won by {level}: {wins}")
    print(f"Percentage of games won by {level}: {percentage:.2%}\n")


Number of games won by beginner: 18
Percentage of games won by beginner: 46.15%

Number of games won by intermediate: 63
Percentage of games won by intermediate: 48.09%

Number of games won by club_player: 124
Percentage of games won by club_player: 49.21%

Number of games won by advanced: 98
Percentage of games won by advanced: 47.80%

Number of games won by expert: 24
Percentage of games won by expert: 43.64%



In [43]:
elo_levels = ["beginner", "intermediate", "club_player", "advanced", "expert"]

# Initialize counters: player_vs_opponent[player_level][opponent_level] = {"wins": 0, "total": 0}
player_vs_opponent = {
    pl: {op: {"wins": 0, "total": 0} for op in elo_levels}
    for pl in elo_levels
}

for game in data:
    result = game.get("result")
    white_level = game.get("elo_level_white")
    black_level = game.get("elo_level_black")

    # White player perspective
    if white_level in elo_levels and black_level in elo_levels:
        player_vs_opponent[white_level][black_level]["total"] += 1
        if result == "1-0":
            player_vs_opponent[white_level][black_level]["wins"] += 1

    # Black player perspective
    if black_level in elo_levels and white_level in elo_levels:
        player_vs_opponent[black_level][white_level]["total"] += 1
        if result == "0-1":
            player_vs_opponent[black_level][white_level]["wins"] += 1

# Print matrix
print("Win Rate Matrix (Player vs Opponent):\n")
header = "Player \\ Opponent".ljust(18) + "".join(f"{op.ljust(14)}" for op in elo_levels)
print(header)
print("-" * len(header))
for pl in elo_levels:
    row = pl.ljust(18)
    for op in elo_levels:
        matchup = player_vs_opponent[pl][op]
        total = matchup["total"]
        wins = matchup["wins"]
        win_rate = (wins / total * 100) if total > 0 else 0
        row += f"{win_rate:6.1f}%".ljust(14)
    print(row)


Win Rate Matrix (Player vs Opponent):

Player \ Opponent beginner      intermediate  club_player   advanced      expert        
----------------------------------------------------------------------------------------
beginner            46.9%         50.0%         33.3%          0.0%          0.0%       
intermediate        50.0%         49.2%         47.1%          0.0%          0.0%       
club_player         66.7%         52.9%         49.2%         44.4%          0.0%       
advanced             0.0%        100.0%         55.6%         46.8%         50.0%       
expert               0.0%        100.0%          0.0%         41.7%         43.5%       


In [13]:
import numpy as np
from collections import defaultdict

# Define move quality bins (low to high)
bins = [-10000, -200, -150, -100, -50, 0, 10000]

# Stats for bin ranges
bin_stats_white = defaultdict(lambda: {"wins": 0, "total": 0})
bin_stats_black = defaultdict(lambda: {"wins": 0, "total": 0})

for game in data:
    moves = game.get("moves", [])
    result = game.get("result", "*")

    # Separate white and black move_qualities
    white_qualities = [m["move_quality"] for m in moves if m["player"] == "white" and m.get("move_quality") is not None]
    black_qualities = [m["move_quality"] for m in moves if m["player"] == "black" and m.get("move_quality") is not None]

    if not white_qualities or not black_qualities:
        continue

    avg_white = np.mean(white_qualities)
    avg_black = np.mean(black_qualities)

    # Bin index
    w_bin = np.digitize(avg_white, bins) - 1
    b_bin = np.digitize(avg_black, bins) - 1
    w_range = f"{bins[w_bin]} < mq <= {bins[w_bin + 1]}"
    b_range = f"{bins[b_bin]} < mq <= {bins[b_bin + 1]}"

    # Update stats
    bin_stats_white[w_range]["total"] += 1
    bin_stats_black[b_range]["total"] += 1

    if result == "1-0":
        bin_stats_white[w_range]["wins"] += 1
    elif result == "0-1":
        bin_stats_black[b_range]["wins"] += 1

# Print white results
print("White player win percentage by avg move_quality:")
for rng, stats in sorted(bin_stats_white.items()):
    pct = stats["wins"] / stats["total"] if stats["total"] > 0 else 0
    print(f"  {rng}: {pct:.2%} ({stats['wins']} wins / {stats['total']} games)")

# Print black results
print("\nBlack player win percentage by avg move_quality:")
for rng, stats in sorted(bin_stats_black.items()):
    pct = stats["wins"] / stats["total"] if stats["total"] > 0 else 0
    print(f"  {rng}: {pct:.2%} ({stats['wins']} wins / {stats['total']} games)")


White player win percentage by avg move_quality:
  -50 < mq <= 0: 50.00% (1 wins / 2 games)

Black player win percentage by avg move_quality:
  -50 < mq <= 0: 50.00% (1 wins / 2 games)


Opening Sequence

In [36]:
from collections import Counter

sequence_counts = Counter()

for game in data:
    seq = game.get("human_opening_sequence", "")
    moves = seq.strip().split()
    #first_5_moves = moves[:10]  # first 5 full moves = 10 plies
    first_5_moves = moves[:5]  # first 5 half moves
    seq_key = " ".join(first_5_moves)
    sequence_counts[seq_key] += 1

# Print sequences and how many times they appeared
for seq, count in sequence_counts.most_common():
    if count > 4:
        print(f"Sequence: {seq}\nCount: {count}\n")


Sequence: e4 e5 Nf3 Nc6 Bc4
Count: 11

Sequence: e4 d5 exd5 Qxd5 Nc3
Count: 9

Sequence: e4 e5 Nf3 Nc6 d4
Count: 8

Sequence: 
Count: 8

Sequence: e4 e5 Nf3 Nf6 Bc4
Count: 5

Sequence: e4 e5 Nf3 d6 Bc4
Count: 5

Sequence: e4 e5 Nf3 Nc6 Bb5
Count: 5



In [42]:
import json
import pandas as pd
from collections import Counter

# === Extract records with first 5 half-moves and ELO info ===
records = []

for game in data:
    seq = game.get("human_opening_sequence", "")
    moves = seq.strip().split()

    if len(moves) < 5:
        continue  # Skip if fewer than 5 plies

    first_5_half_moves = moves[:5]
    seq_key = " ".join(first_5_half_moves)

    records.append({
        "sequence": seq_key,
        "white_elo_level": game.get("elo_level_white", "unknown"),
        "black_elo_level": game.get("elo_level_black", "unknown"),
    })



In [3]:
records

[{'sequence': 'e3 c6 f4 d6 Nf3',
  'white_elo_level': 'club_player',
  'black_elo_level': 'club_player'},
 {'sequence': 'e4 c5 Nf3 d6 c3',
  'white_elo_level': 'advanced',
  'black_elo_level': 'club_player'},
 {'sequence': 'e4 d6 d4 e6 Nf3',
  'white_elo_level': 'club_player',
  'black_elo_level': 'club_player'},
 {'sequence': 'd4 f5 Nc3 g6 e4',
  'white_elo_level': 'advanced',
  'black_elo_level': 'advanced'},
 {'sequence': 'd4 d6 Nf3 g6 Bg5',
  'white_elo_level': 'club_player',
  'black_elo_level': 'club_player'},
 {'sequence': 'e4 c6 d4 Qc7 c4',
  'white_elo_level': 'club_player',
  'black_elo_level': 'club_player'},
 {'sequence': 'e4 d5 Nf3 d4 c3',
  'white_elo_level': 'beginner',
  'black_elo_level': 'beginner'},
 {'sequence': 'e4 c5 Nf3 Nc6 d4',
  'white_elo_level': 'advanced',
  'black_elo_level': 'club_player'},
 {'sequence': 'd4 d5 Bf4 Bf5 e3',
  'white_elo_level': 'intermediate',
  'black_elo_level': 'intermediate'},
 {'sequence': 'e4 d5 exd5 Nf6 Nc3',
  'white_elo_level': 'a

In [4]:
import glob
import pandas as pd
import re

# === Step 1: Load ECO openings from TSV files (a.tsv to e.tsv) ===
eco_openings = []

def pgn_to_moves(pgn_str):
    # Remove move numbers and dots, split by spaces
    # Example: "1. d4 Nf6 2. c4 e6 3. g3" -> ["d4", "Nf6", "c4", "e6", "g3"]
    cleaned = re.sub(r'\d+\.', '', pgn_str).strip()
    moves = cleaned.split()
    return moves

for file_path in glob.glob("/workspaces/Chess_Stockfish/[a-e].tsv"):
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) < 3:
                continue
            eco_code, name, pgn = parts
            moves = pgn_to_moves(pgn)
            eco_openings.append({
                "code": eco_code,
                "name": name,
                "moves": moves
            })

# === Step 2: Match each record sequence to ECO openings ===
def match_opening(sequence, eco_openings):
    seq_moves = sequence.split()
    best_match = None
    best_length = 0
    
    for eco in eco_openings:
        eco_moves = eco["moves"]
        if len(seq_moves) >= len(eco_moves) and seq_moves[:len(eco_moves)] == eco_moves:
            # Check if this match is longer (more specific) than previous matches
            if len(eco_moves) > best_length:
                best_length = len(eco_moves)
                best_match = eco["name"]
    return best_match if best_match else "Unknown"


# === Step 3: Add opening names to records ===
for rec in records:
    opening_name = match_opening(rec['sequence'], eco_openings)
    rec['opening_name'] = opening_name if opening_name else "Unknown"

# === Optional: Convert to DataFrame for easier analysis ===
df = pd.DataFrame(records)
print(df)


               sequence white_elo_level black_elo_level  \
0       e3 c6 f4 d6 Nf3     club_player     club_player   
1       e4 c5 Nf3 d6 c3        advanced     club_player   
2       e4 d6 d4 e6 Nf3     club_player     club_player   
3       d4 f5 Nc3 g6 e4        advanced        advanced   
4      d4 d6 Nf3 g6 Bg5     club_player     club_player   
..                  ...             ...             ...   
353   e4 e5 Nf3 Nc6 Bb5     club_player     club_player   
354   f4 e5 e4 exf4 Nf3    intermediate    intermediate   
355  f4 e5 e3 exf4 exf4     club_player     club_player   
356    e4 c5 Nc3 e6 Nf3        advanced        advanced   
357      e4 e6 d4 c5 d5     club_player     club_player   

                                          opening_name  
0                                 Van't Kruijs Opening  
1    Sicilian Defense: Delayed Alapin Variation, wi...  
2                                         Pirc Defense  
3                     Dutch Defense: Raphael Variation  
4     

In [25]:
df_filtered = df[
    (df['white_elo_level'] == 'beginner') &
    (df['opening_count'] > 4)
]

print(df_filtered[['sequence', 'opening_name']])


df_filtered = df[
    (df['white_elo_level'] == 'intermediate') &
    (df['opening_count'] > 10)
]

print(df_filtered[['sequence', 'opening_name']])

              sequence      opening_name
106   d4 Nf6 Bf4 d5 e3    Indian Defense
122  e4 d6 Nf3 Be6 Nc3      Pirc Defense
147   e4 e5 Nf3 d6 Bc4  Philidor Defense
163  e4 e5 Nf3 Nc6 Bc4      Italian Game
230   e4 d6 d4 Nf6 Nd2      Pirc Defense
253    e4 e5 Bc4 d6 d3  Bishop's Opening
                sequence                                   opening_name
8       d4 d5 Bf4 Bf5 e3   Queen's Pawn Game: Accelerated London System
218  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation
233     d4 d5 Bf4 e6 Nc3   Queen's Pawn Game: Accelerated London System
270    e4 e5 Nf3 Nc6 Bc4                                   Italian Game
313     e4 d6 d4 Nf6 Nc3                                   Pirc Defense


In [29]:
elo_levels = ["beginner", "intermediate", "club_player", "advanced", "expert"]

df_filtered = df[
    (df['white_elo_level'] == 'club_player') &
    (df['opening_count'] > 11)
]

print(df_filtered[['sequence', 'opening_name']])


                sequence                                   opening_name
26      d4 d5 Bf4 Nc6 e3   Queen's Pawn Game: Accelerated London System
65    e4 d5 exd5 Qxd5 c4  Scandinavian Defense: Mieses-Kotroc Variation
86   e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation
117  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation
197   e4 d5 exd5 Qxd5 d4  Scandinavian Defense: Mieses-Kotroc Variation
236    d4 d5 Bf4 Nc6 Nf3   Queen's Pawn Game: Accelerated London System
237  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation
239     d4 d5 Bf4 Bf5 e3   Queen's Pawn Game: Accelerated London System
252  e4 d5 exd5 Qxd5 Nf3  Scandinavian Defense: Mieses-Kotroc Variation
267      d4 d5 Bf4 h6 e3   Queen's Pawn Game: Accelerated London System
282    d4 d5 Bf4 Nc6 Nf3   Queen's Pawn Game: Accelerated London System
300  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation
345  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Va

In [34]:
df_filtered = df[
    (df['white_elo_level'] == 'advanced') &
    (df['opening_count'] > 11)
]

print(df_filtered[['sequence', 'opening_name']])

df_filtered = df[
    (df['white_elo_level'] == 'expert') &
    (df['opening_count'] > 5)
]

print(df_filtered[['sequence', 'opening_name']])

                sequence                                   opening_name
30     d4 d5 Bf4 Bf5 Nf3   Queen's Pawn Game: Accelerated London System
47     d4 d5 Bf4 Bf5 Nf3   Queen's Pawn Game: Accelerated London System
126  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation
179     d4 d5 Bf4 Nf6 e3   Queen's Pawn Game: Accelerated London System
265     d4 d5 Bf4 Nf6 e3   Queen's Pawn Game: Accelerated London System
287  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation
315  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation
             sequence                                  opening_name
140  d4 d5 Bf4 Nf6 e3  Queen's Pawn Game: Accelerated London System
177  e3 d5 Nc3 Nf6 d4                          Van't Kruijs Opening
195  d4 Nf6 Nf3 d5 g3             Indian Defense: Knights Variation
290  d4 c6 Nc3 d5 Bf4                             Queen's Pawn Game
307    d4 e6 c4 b6 e4                               English Defense
321  e4 e5 Nf3 N

In [37]:
# Assuming df has columns: 'sequence', 'opening_name'
# Create a DataFrame from sequence_counts for counts per sequence
import pandas as pd

# Convert sequence_counts (a dict or Counter) to DataFrame
seq_counts_df = pd.DataFrame(sequence_counts.items(), columns=['sequence', 'sequence_count'])

# Merge with your df to get opening_name for each sequence
# Drop duplicates so each sequence-opening pair is unique
seq_opening_df = df[['sequence', 'opening_name']].drop_duplicates()

# Merge counts with openings
summary_df = pd.merge(seq_opening_df, seq_counts_df, on='sequence', how='left')

# Filter sequences that occur more than 5 times
summary_df = summary_df[summary_df['sequence_count'] > 4]

# Sort by count descending
summary_df = summary_df.sort_values(by='sequence_count', ascending=False).reset_index(drop=True)

print(summary_df)


              sequence                                   opening_name  \
0    e4 e5 Nf3 Nc6 Bc4                                   Italian Game   
1  e4 d5 exd5 Qxd5 Nc3  Scandinavian Defense: Mieses-Kotroc Variation   
2     e4 e5 Nf3 Nc6 d4                                    Scotch Game   
3    e4 e5 Nf3 Nf6 Bc4            Petrov's Defense: Italian Variation   
4     e4 e5 Nf3 d6 Bc4                               Philidor Defense   
5    e4 e5 Nf3 Nc6 Bb5                                      Ruy Lopez   

   sequence_count  
0              11  
1               9  
2               8  
3               5  
4               5  
5               5  


In [1]:
import chess.pgn

def get_elo_bucket(elo):
    if elo < 1000:
        return "beginner"
    elif elo < 1400:
        return "intermediate"
    elif elo < 1800:
        return "club_player"
    elif elo < 2200:
        return "advanced"
    else:
        return "expert"

# Path to PGN file
PGN_FILE = "/workspaces/Chess_Stockfish/cleaned_one_line.pgn"

results = []

with open(PGN_FILE, encoding="utf-8") as pgn:
    while True:
        game = chess.pgn.read_game(pgn)
        if game is None:
            break

        white_elo = int(game.headers.get("WhiteElo", 0))
        black_elo = int(game.headers.get("BlackElo", 0))
        opening = game.headers.get("Opening", "Unknown")

        white_level = get_elo_bucket(white_elo)
        black_level = get_elo_bucket(black_elo)

        results.append({
            "white_level": white_level,
            "black_level": black_level,
            "opening": opening
        })

# Optional: Print or export
for r in results:
    print(r)


{'white_level': 'club_player', 'black_level': 'club_player', 'opening': 'Ware Opening: Crab Variation'}
{'white_level': 'advanced', 'black_level': 'club_player', 'opening': 'Caro-Kann Defense: Panov Attack'}
{'white_level': 'club_player', 'black_level': 'club_player', 'opening': "Zukertort Opening: Queen's Gambit Invitation"}
{'white_level': 'advanced', 'black_level': 'advanced', 'opening': "Queen's Gambit Declined: Semi-Tarrasch Defense"}
{'white_level': 'club_player', 'black_level': 'club_player', 'opening': 'Modern Defense: Bishop Attack'}
{'white_level': 'club_player', 'black_level': 'club_player', 'opening': "Queen's Gambit Accepted: Rosenthal Variation"}
{'white_level': 'beginner', 'black_level': 'beginner', 'opening': 'Goldsmith Defense'}
{'white_level': 'advanced', 'black_level': 'club_player', 'opening': 'Ruy Lopez: Steinitz Defense'}
{'white_level': 'intermediate', 'black_level': 'intermediate', 'opening': "Queen's Pawn Game: Accelerated London System"}
{'white_level': 'advan

In [3]:
from collections import Counter

# Step 1: Filter beginner-white games from `results`
beginner_white = [r for r in results if r["white_level"] == "beginner"]

# Step 2: Count openings in that filtered list
opening_counts = Counter(r["opening"] for r in beginner_white)

# Step 3: Filter games where opening appears > 4 times
filtered_results = [
    r for r in beginner_white
    if opening_counts[r["opening"]] > 10
]

# Step 4: Display
print(f"Total beginner-white games: {len(beginner_white)}")
print("Beginner-white games with common openings (>10 times):")
for r in filtered_results:
    print(f"{r['opening']} (white_level={r['white_level']}, black_level={r['black_level']})")


Total beginner-white games: 282
Beginner-white games with common openings (>10 times):
Queen's Pawn Game (white_level=beginner, black_level=intermediate)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)
Queen's Pawn Game (white_level=beginner, black_level=beginner)


In [None]:
from collections import Counter

# Step 1: Filter beginner-white games
beginner = [r for r in results if r["white_level"] == "beginner" and r["black_level"] == "beginner"]

# Step 2: Count openings
opening_counts = Counter(r["opening"] for r in beginner)

# Step 3: Print only those with >5 occurrences
print("Openings with a frequency of more than 5 only beginners:")
for opening, count in opening_counts.items():
    if count > 5:
        print(opening)


Openings with more than 5 beginner-white games:
Philidor Defense
Scandinavian Defense
Four Knights Game: Italian Variation
Queen's Pawn Game


In [15]:
# Step 1: Filter intermediate-white games
intermediate = [r for r in results if r["white_level"] == "intermediate" and r["black_level"] == "intermediate"]

# Step 2: Count openings
opening_counts = Counter(r["opening"] for r in intermediate)

# Step 3: Print only those with >5 occurrences
print("Openings with a frequency of more than 17 only intermediate:")
for opening, count in opening_counts.items():
    if count > 17:
        print(opening)

Openings with a frequency of more than 17 only intermediate:
Queen's Pawn Game: Accelerated London System
Queen's Pawn Game
French Defense: Knight Variation
Van't Kruijs Opening
Philidor Defense
