# Merge Scraped Data

In [None]:
import os

# Define the directory containing the PGN files
input_directory = r'C:\Users\edulo\OneDrive\Documentos\Loyalist\AIP\Week 4\PGN\Magnus Games PGN data\Magnus PGN data'
output_file = r'C:\Users\edulo\OneDrive\Documentos\Loyalist\AIP\Week 4\PGN\Magnus Games PGN data\Magnus_PGN_merged.pgn'

# Get a list of all PGN files in the directory
pgn_files = [f for f in os.listdir(input_directory) if f.endswith('.pgn')]

# Initialize a list to store the content of all PGN files
all_games = []

# Loop through each PGN file and read its content
for pgn_file in pgn_files:
    with open(os.path.join(input_directory, pgn_file), 'r', encoding='utf-8') as file:
        all_games.append(file.read())

# Write the combined content to the output file
with open(output_file, 'w', encoding='utf-8') as file:
    for game in all_games:
        file.write(game + '\n\n')  # Ensure each game is separated by a blank line

print(f'Merged {len(pgn_files)} PGN files into {output_file}')

# Filter Games with timestamp

In [None]:
import re

# Paths to the input merged PGN file and the output filtered PGN file
merged_pgn_file = r'C:\Users\edulo\OneDrive\Documentos\Loyalist\AIP\Week 4\PGN\Magnus Games PGN data\Magnus_PGN_merged.pgn'
filtered_pgn_file = r'C:\Users\edulo\OneDrive\Documentos\Loyalist\AIP\Week 4\PGN\Magnus Games PGN data\Magnus_PGN_filtered.pgn'

# Function to check if a game has timestamps
def has_timestamps(game):
    return re.search(r'\[%clk \d{1,2}:\d{2}(:\d{2})?\]', game) is not None

# Read the merged PGN file
with open(merged_pgn_file, 'r', encoding='utf-8') as file:
    merged_pgn_content = file.read()

# Split the content into individual games based on '[Event ' tag
games = re.split(r'(\[Event .+?\n)', merged_pgn_content)
# This will create a list where every event and game are separate elements. We need to combine them correctly.

# Combine event tags with their corresponding games
combined_games = []
for i in range(0, len(games) - 1, 2):
    combined_game = games[i] + games[i + 1]
    combined_games.append(combined_game)

# Filter games that contain timestamps
filtered_games = [game for game in combined_games if has_timestamps(game)]

# Write the filtered games to the new PGN file
with open(filtered_pgn_file, 'w', encoding='utf-8') as file:
    for game in filtered_games:
        file.write(game.strip() + '\n\n')  # Ensure each game is separated by a blank line

print(f'Filtered games with timestamps have been saved to {filtered_pgn_file}')


# Feature eng

Creating Data Frame

In [1]:
import chess
import chess.pgn
import pandas as pd
import numpy as np
import re

# Function to parse PGN file and extract required data
def parse_pgn_file(pgn_file):
    df_data = []

    with open(pgn_file, 'r') as file:
        while True:
            game = chess.pgn.read_game(file)
            if game is None:
                break

            # Extract time control from headers
            time_control = game.headers.get("TimeControl", None)
            if time_control:
                initial_time = time_control.split('+')[0]
                initial_time = int(initial_time) if initial_time.isdigit() else None
            else:
                initial_time = None

            white_move_count = 1
            black_move_count = 1

            board = game.board()
            prev_time_white = f"0:{initial_time // 60:02}:{initial_time % 60:02}" if initial_time else None
            prev_time_black = f"0:{initial_time // 60:02}:{initial_time % 60:02}" if initial_time else None

            for node in game.mainline():
                move = node.move
                fen = board.fen()
                color = int(board.turn)
                move_san = str(board.piece_at(move.from_square)).upper() + \
                           chess.SQUARE_NAMES[move.from_square] + \
                           chess.SQUARE_NAMES[move.to_square]
                move_count = white_move_count if color == 0 else black_move_count

                # Extract timestamps from comment
                comment = node.comment
                time_left_white = prev_time_white
                time_left_black = prev_time_black
                if comment:
                    times = re.findall(r'\[%clk ([\d:]+)\]', comment)
                    if len(times) == 1:
                        if color == 0:
                            time_left_black = times[0]
                        else:
                            time_left_white = times[0]
                    elif len(times) == 2:
                        time_left_white = times[0]
                        time_left_black = times[1]

                df_data.append((fen, move_san, color, move_count, time_left_white, time_left_black))

                if color == 0:
                    white_move_count += 1
                    prev_time_black = time_left_black
                else:
                    black_move_count += 1
                    prev_time_white = time_left_white

                board.push(move)

    return df_data

# Path to the filtered PGN file
filtered_pgn_file = r'C:\Users\edulo\OneDrive\Documentos\Loyalist\AIP\Week 4\PGN\Magnus Games PGN data\Magnus_PGN_filtered.pgn'

# Parse the PGN file and get DataFrame data
df_data = parse_pgn_file(filtered_pgn_file)

# Create DataFrame
df = pd.DataFrame(df_data, columns=['Previous_FEN', 'Next_Move', 'Color', 'Move_Count', 'Time_Left_White', 'Time_Left_Black'])

In [2]:
# Convert time left columns to seconds
df['Time_Left_White'] = df['Time_Left_White'].apply(lambda x: None if x is None else sum(int(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
df['Time_Left_Black'] = df['Time_Left_Black'].apply(lambda x: None if x is None else sum(int(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))

In [3]:
# Display DataFrame
print(df)

                                            Previous_FEN Next_Move  Color  \
0      rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w ...     Pd2d4      1   
1      rnbqkbnr/pppppppp/8/8/3P4/8/PPP1PPPP/RNBQKBNR ...     Ng8f6      0   
2      rnbqkb1r/pppppppp/5n2/8/3P4/8/PPP1PPPP/RNBQKBN...     Pc2c4      1   
3      rnbqkb1r/pppppppp/5n2/8/2PP4/8/PP2PPPP/RNBQKBN...     Pe7e6      0   
4      rnbqkb1r/pppp1ppp/4pn2/8/2PP4/8/PP2PPPP/RNBQKB...     Ng1f3      1   
...                                                  ...       ...    ...   
91003       2r5/6p1/5p2/2K2Pk1/1P5p/7P/5RP1/8 w - - 5 51     Kc5d6      1   
91004       2r5/6p1/3K1p2/5Pk1/1P5p/7P/5RP1/8 b - - 6 51     Rc8b8      0   
91005       1r6/6p1/3K1p2/5Pk1/1P5p/7P/5RP1/8 w - - 7 52     Rf2b2      1   
91006      1r6/6p1/3K1p2/5Pk1/1P5p/7P/1R4P1/8 b - - 8 52     Kg5f5      0   
91007       1r6/6p1/3K1p2/5k2/1P5p/7P/1R4P1/8 w - - 0 53     Pb4b5      1   

       Move_Count  Time_Left_White  Time_Left_Black  
0               1    

In [4]:
# Function to calculate the time taken for each move
def calculate_time_taken(df, increment):
    time_taken = []
    prev_time_white = None
    prev_time_black = None

    for index, row in df.iterrows():
        if row['Color'] == 1:  # White move
            if prev_time_white is None or row['Time_Left_White'] is None:
                time_taken.append(None)
            else:
                time_taken.append(prev_time_white + increment - row['Time_Left_White'])
            prev_time_white = row['Time_Left_White']
        else:  # Black move
            if prev_time_black is None or row['Time_Left_Black'] is None:
                time_taken.append(None)
            else:
                time_taken.append(prev_time_black + increment - row['Time_Left_Black'])
            prev_time_black = row['Time_Left_Black']

    df['Time_Taken'] = time_taken
    return df

# Extract the increment part from the time control, default to 0 if not present or invalid
time_control = chess.pgn.read_headers(open(filtered_pgn_file, 'r')).get("TimeControl", "0+0")
increment = int(time_control.split('+')[1]) if '+' in time_control and time_control.split('+')[1].isdigit() else 0

# Calculate the time taken and add the new column to the DataFrame
df = calculate_time_taken(df, increment)

# Set Time_Taken to 1 second for all rows where Move_Count is 1
df.loc[df['Move_Count'] == 1, 'Time_Taken'] = 1

# Display the updated DataFrame
print(df.head())

                                        Previous_FEN Next_Move  Color  \
0  rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w ...     Pd2d4      1   
1  rnbqkbnr/pppppppp/8/8/3P4/8/PPP1PPPP/RNBQKBNR ...     Ng8f6      0   
2  rnbqkb1r/pppppppp/5n2/8/3P4/8/PPP1PPPP/RNBQKBN...     Pc2c4      1   
3  rnbqkb1r/pppppppp/5n2/8/2PP4/8/PP2PPPP/RNBQKBN...     Pe7e6      0   
4  rnbqkb1r/pppp1ppp/4pn2/8/2PP4/8/PP2PPPP/RNBQKB...     Ng1f3      1   

   Move_Count  Time_Left_White  Time_Left_Black  Time_Taken  
0           1             1519           1500.0         1.0  
1           1             1519           1518.0         1.0  
2           2             1527           1518.0         2.0  
3           2             1527           1526.0         2.0  
4           3             1535           1526.0         2.0  


Drop rows where Time_Taken is below 0

In [6]:
# Drop rows where Time_Taken is below 0
df = df.drop(df[df['Time_Taken'] < 0].index)

# Display the updated DataFrame
print(df.head())

                                        Previous_FEN Next_Move  Color  \
0  rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w ...     Pd2d4      1   
1  rnbqkbnr/pppppppp/8/8/3P4/8/PPP1PPPP/RNBQKBNR ...     Ng8f6      0   
2  rnbqkb1r/pppppppp/5n2/8/3P4/8/PPP1PPPP/RNBQKBN...     Pc2c4      1   
3  rnbqkb1r/pppppppp/5n2/8/2PP4/8/PP2PPPP/RNBQKBN...     Pe7e6      0   
4  rnbqkb1r/pppp1ppp/4pn2/8/2PP4/8/PP2PPPP/RNBQKB...     Ng1f3      1   

   Move_Count  Time_Left_White  Time_Left_Black  Time_Taken  
0           1             1519           1500.0         1.0  
1           1             1519           1518.0         1.0  
2           2             1527           1518.0         2.0  
3           2             1527           1526.0         2.0  
4           3             1535           1526.0         2.0  


In [7]:
# Function to convert move to label
def move_to_label(move):
    from_square = chess.parse_square(move[1:3])
    to_square = chess.parse_square(move[3:])
    return from_square * 64 + to_square

# Convert Next_Move column to labels
df['Next_Move'] = df['Next_Move'].apply(move_to_label)

In [8]:
# Function to convert board state to one-hot encoding
def board_to_one_hot(fen):
    board = chess.Board(fen)
    one_hot = np.zeros((8, 8, 12), dtype=np.int8)
    piece_map = {chess.PAWN: 0, chess.KNIGHT: 1, chess.BISHOP: 2,
                 chess.ROOK: 3, chess.QUEEN: 4, chess.KING: 5}
    for square in chess.scan_reversed(chess.BB_ALL):
        piece = board.piece_at(square)
        if piece is not None:
            piece_index = piece_map[piece.piece_type] + (6 if piece.color else 0)
            one_hot[chess.square_rank(square), chess.square_file(square), piece_index] = 1
    return one_hot

# Convert Previous_FEN column to one-hot encoding
df['Previous_FEN'] = df['Previous_FEN'].apply(board_to_one_hot)

In [9]:
# Display DataFrame
print(df)

                                            Previous_FEN  Next_Move  Color  \
0      [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0,...        731      1   
1      [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0,...       4013      0   
2      [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0,...        666      1   
3      [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0,...       3372      0   
4      [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0], [0, 0,...        405      1   
...                                                  ...        ...    ...   
91003  [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0,...       2219      1   
91004  [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0,...       3769      0   
91005  [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0,...        841      1   
91006  [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0,...       2469      0   
91007  [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0,...       1633      1   

       Move_Count  Time_Left_White  Time_Left_Black  Time_Taken

Using only target player information

In [16]:
# Drop rows where Color column is equal to 0
df = df[df['Color'] != 0]

Check for null values

In [22]:
# Check for null values
null_values = df.isnull().sum()
print(null_values)

Previous_FEN        0
Next_Move           0
Color               0
Move_Count          0
Time_Left_White     0
Time_Left_Black    79
Time_Taken          0
dtype: int64


In [23]:
# Print statistical information from dataset
print(df.describe())

          Next_Move    Color    Move_Count  Time_Left_White  Time_Left_Black  \
count  44592.000000  44592.0  44592.000000     44592.000000     44513.000000   
mean    1254.113361      1.0     27.308508       407.443936       369.718936   
std      929.930095      0.0     19.265612       937.915071       860.394275   
min        1.000000      1.0      1.000000         0.000000         0.000000   
25%      454.000000      1.0     12.000000        51.000000        29.000000   
50%     1080.000000      1.0     24.000000       141.000000       104.000000   
75%     1828.000000      1.0     38.000000       344.000000       289.000000   
max     4094.000000      1.0    135.000000     70758.000000      8261.000000   

         Time_Taken  
count  44592.000000  
mean      28.433060  
std      347.362455  
min        0.000000  
25%       10.000000  
50%       11.000000  
75%       16.000000  
max    70763.000000  


In [24]:
#Drop null values
df = df.dropna()

In [25]:
# Check for null values
null_values = df.isnull().sum()
print(null_values)

Previous_FEN       0
Next_Move          0
Color              0
Move_Count         0
Time_Left_White    0
Time_Left_Black    0
Time_Taken         0
dtype: int64


In [26]:
# Print statistical information from dataset
print(df.describe())

          Next_Move    Color    Move_Count  Time_Left_White  Time_Left_Black  \
count  44513.000000  44513.0  44513.000000     44513.000000     44513.000000   
mean    1255.004223      1.0     27.355200       403.896435       369.718936   
std      930.508341      0.0     19.250765       929.802600       860.394275   
min        1.000000      1.0      1.000000         0.000000         0.000000   
25%      454.000000      1.0     12.000000        50.000000        29.000000   
50%     1096.000000      1.0     24.000000       140.000000       104.000000   
75%     1828.000000      1.0     38.000000       341.000000       289.000000   
max     4094.000000      1.0    135.000000     70758.000000      8261.000000   

         Time_Taken  
count  44513.000000  
mean      28.481747  
std      347.668644  
min        0.000000  
25%       10.000000  
50%       11.000000  
75%       16.000000  
max    70763.000000  


In [33]:
# Path to save the CSV file
csv_file_path = r'C:\Users\edulo\OneDrive\Documentos\Loyalist\AIP\Week 4\PGN\Magnus Games PGN data\Magnus_PGN_filtered.csv'

# Export DataFrame to CSV
df.to_csv(csv_file_path, index=False)

# Model Building

In [51]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dense, Concatenate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Convert Previous_FEN to numpy array
X_fen = np.array(df['Previous_FEN'].tolist())

# Normalize numeric features
scaler = StandardScaler()
X_numeric = df[['Move_Count', 'Time_Left_White', 'Time_Left_Black']].values
X_numeric = scaler.fit_transform(X_numeric)

# Combine inputs
input_fen = Input(shape=(8, 8, 12))
input_numeric = Input(shape=(3,))

# CNN layers for board state
conv1 = Conv2D(32, (3, 3), activation='relu')(input_fen)
conv2 = Conv2D(64, (3, 3), activation='relu')(conv1)
flatten = Flatten()(conv2)

# Concatenate CNN output with numeric inputs
concatenated = Concatenate()([flatten, input_numeric])

# Dense layers for regression
dense1 = Dense(128, activation='relu')(concatenated)
output = Dense(1)(dense1)  # Regression output for Time_Taken

# Define model
model = Model(inputs=[input_fen, input_numeric], outputs=output)

# Compile model with MSE as the metric
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse'])

# Train model
history = model.fit([X_train_fen, X_train_numeric], y_train, epochs=10, batch_size=32, 
                    validation_data=([X_test_fen, X_test_numeric], y_test))
# Train model
model.fit([X_train_fen, X_train_numeric], y_train, epochs=10, batch_size=32, validation_data=([X_test_fen, X_test_numeric], y_test))

Epoch 1/10
[1m1113/1113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 9ms/step - loss: 4.9581e-04 - mse: 4.9581e-04 - val_loss: 4.7556e-06 - val_mse: 4.7663e-06
Epoch 2/10
[1m1113/1113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 7ms/step - loss: 1.6181e-04 - mse: 1.6181e-04 - val_loss: 1.7121e-04 - val_mse: 1.7168e-04
Epoch 3/10
[1m1113/1113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - loss: 9.5898e-05 - mse: 9.5898e-05 - val_loss: 1.1816e-05 - val_mse: 1.1823e-05
Epoch 4/10
[1m1113/1113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - loss: 9.5093e-05 - mse: 9.5093e-05 - val_loss: 1.8673e-05 - val_mse: 1.8689e-05
Epoch 5/10
[1m1113/1113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - loss: 0.0028 - mse: 0.0028 - val_loss: 6.4059e-06 - val_mse: 6.4207e-06
Epoch 6/10
[1m1113/1113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - loss: 5.4635e-04 - mse: 5.4635e-04 - val_loss: 1.9917e-04 - val_

<keras.src.callbacks.history.History at 0x1b15f7d1250>

In [52]:
# Evaluate model on test data
loss, mse = model.evaluate([X_test_fen, X_test_numeric], y_test)
print(f'Test MSE: {mse}')


[1m279/279[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 4.1401e-06 - mse: 4.1402e-06
Test MSE: 4.786091267305892e-06
