In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import category_encoders as ce

### This notebook acts as a hub: it holds all the functions shared across multiple models and files.  
Data loading, cleaning, and feature engineering are defined here and then imported by the other files.


In [2]:
def load_data(path=r"C:\Users\lopke\Downloads\games.csv", show_preview=True):
    """Read the games CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. The default is the original author's local
        path and is kept only for backward compatibility; pass an explicit
        path when running on any other machine.
    show_preview : bool, optional
        If True (default), show the first rows with `display`. `display` is
        not imported in this notebook, so it is presumably the one the
        Jupyter/IPython session injects; pass False when calling this
        function outside a notebook.

    Returns
    -------
    pd.DataFrame
        The full contents of the CSV, exactly as parsed by `pd.read_csv`.
    """
    data = pd.read_csv(path)
    if show_preview:
        display(data.head())  # Display only the first few rows
    return data

# Load the games table once; later cells read it through the notebook-level name `data`.
data = load_data()


Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4
2,mIICvQHh,True,1504130000000.0,1504130000000.0,61,mate,white,5+10,ischia,1496,a-00,1500,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,C20,King's Pawn Game: Leonardis Variation,3
3,kWKvrqYL,True,1504110000000.0,1504110000000.0,61,mate,white,20+0,daniamurashov,1439,adivanov2009,1454,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,D02,Queen's Pawn Game: Zukertort Variation,3
4,9tXo1AUZ,True,1504030000000.0,1504030000000.0,95,mate,white,30+3,nik221107,1523,adivanov2009,1469,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,C41,Philidor Defense,5


In [3]:
# A preprocessing class is needed to make the data numeric and ready for a model.
# The dataset has no missing values, so there is no need to handle them.
class preprocess_data:
    """Convert the raw games table into numeric features for modelling.

    The instance only stores the DataFrame it is given; every method works on
    a fresh frame and never modifies `self.data`.
    """

    def __init__(self, data):
        self.data = data

    @staticmethod
    def parse_time_control(tc):
        """Split a time-control code such as ``"15+2"`` into two integers.

        Parameters
        ----------
        tc : str
            Time control written as ``"<base>+<increment>"``.

        Returns
        -------
        tuple
            ``(base, increment)`` as ints, or ``(None, None)`` when `tc` is
            not a string or does not have exactly that shape.
        """
        try:
            base, incr = tc.split('+')
            return int(base), int(incr)
        except (AttributeError, TypeError, ValueError):
            # AttributeError/TypeError: tc is not a string (e.g. None or NaN).
            # ValueError: wrong number of '+'-separated parts or non-integer text.
            return None, None

    def make_num(self, feature_engineering=False):
        """
        This function processes the data to make it suitable for machine learning models.
        It converts categorical variables to numerical format, extracts time control features,
        and optionally adds feature engineering steps.

        Parameters:
        feature_engineering (bool): If True, `rating_diff`, `game_duration_seconds` and
            `avg_time_per_turn` are added and the raw `created_at` / `last_move_at`
            columns are removed. If False, the two raw timestamp columns are left in
            the result unchanged.

        Returns:
        pd.DataFrame: A new DataFrame; `self.data` is not modified.
        """
        # Identifier and free-text columns are not used as features.
        processed_data_num = self.data.drop(
            columns=['id', 'white_id', 'black_id', 'moves', 'opening_eco']
        )

        processed_data_num["win_num"] = processed_data_num["winner"].map(
            {"white": 1, "black": 0, "draw": 0.5}
        )
        processed_data_num["rated_num"] = processed_data_num["rated"].map({True: 1, False: 0})
        processed_data_num = processed_data_num.drop(columns=['winner', 'rated'])

        # NOTE(review): the previous version built a numeric `victory_status` from the
        # `winner` column (wrong source column) and dropped it on the very next line,
        # so it never reached the output. The dead computation is removed and the
        # column is still dropped, which keeps the output schema unchanged. Confirm
        # whether an encoded victory status is actually wanted as a feature.
        processed_data_num = processed_data_num.drop(columns=['victory_status'], errors='ignore')

        # Parse every time-control code in one pass instead of building one
        # pd.Series per row.
        parsed_controls = [
            preprocess_data.parse_time_control(code)
            for code in processed_data_num['increment_code']
        ]
        processed_data_num[['base_time', 'increment']] = pd.DataFrame(
            parsed_controls,
            index=processed_data_num.index,
            columns=['base_time', 'increment'],
        )
        processed_data_num = processed_data_num.drop(columns=['opening_name', 'increment_code'])

        # the choice to have feature engineering!
        if feature_engineering:
            processed_data_num['rating_diff'] = (
                processed_data_num['white_rating'] - processed_data_num['black_rating']
            )

            # Interpret the timestamps as epoch milliseconds first; if that fails,
            # fall back to letting pandas infer the format.
            try:
                created_at = pd.to_datetime(processed_data_num['created_at'], unit='ms')
                last_move_at = pd.to_datetime(processed_data_num['last_move_at'], unit='ms')
            except Exception:
                created_at = pd.to_datetime(processed_data_num['created_at'])
                last_move_at = pd.to_datetime(processed_data_num['last_move_at'])

            # Calculate game duration in seconds
            processed_data_num['game_duration_seconds'] = (
                last_move_at - created_at
            ).dt.total_seconds().fillna(0)

            # Avoid division by zero for turns
            processed_data_num['avg_time_per_turn'] = (
                processed_data_num['game_duration_seconds']
                / processed_data_num['turns'].replace(0, 1)
            ).replace([float('inf'), -float('inf')], 0).fillna(0)

            processed_data_num = processed_data_num.drop(columns=['created_at', 'last_move_at'])

        return processed_data_num

    def removing_the_time(self):
        """
        Remove the time-related columns from the DataFrame.

        For simple models the time columns can add unneeded noise, because a large
        share of the derived durations are 0.

        Only the columns that are actually present are removed. Previously every
        listed column had to exist, which no frame produced by this class satisfies
        (the raw timestamps and the derived duration columns never coexist), so the
        call always raised KeyError.

        Returns:
        pd.DataFrame: A new DataFrame without `created_at`, `last_move_at`,
            `game_duration_seconds` and `avg_time_per_turn`.
        """
        return self.data.drop(
            columns=['created_at', 'last_move_at', 'game_duration_seconds', 'avg_time_per_turn'],
            errors='ignore',
        )

# Smoke test: run the full preprocessing (with the engineered features) on the
# loaded games and preview the first rows of the numeric result.
data1 = preprocess_data(data).make_num(feature_engineering=True)
data1.head()
    

Unnamed: 0,turns,white_rating,black_rating,opening_ply,win_num,rated_num,base_time,increment,rating_diff,game_duration_seconds,avg_time_per_turn
0,13,1500,1191,5,1.0,0,15,2,309,0.0,0.0
1,16,1322,1261,4,0.0,1,5,10,61,0.0,0.0
2,61,1496,1500,3,1.0,1,5,10,-4,0.0,0.0
3,61,1439,1454,3,1.0,1,20,0,-15,0.0,0.0
4,95,1523,1469,5,1.0,1,30,3,54,0.0,0.0


In [None]:
# Check how often each avg_time_per_turn value occurs. A large share is exactly 0 --
# presumably because created_at and last_move_at are identical for those games in the
# CSV (as in the previewed rows); TODO confirm against the raw data.
data1['avg_time_per_turn'].value_counts()

avg_time_per_turn
0.000000      8548
156.250000      16
161.290323      14
178.571429      14
185.185185      12
              ... 
12.517156        1
30.053528        1
87.320526        1
9.387690         1
7.002222         1
Name: count, Length: 10506, dtype: int64

In [24]:
# Let's make a function that turns the moves into numbers, numbering the squares left to right (files a-h) and then bottom to top (ranks 1-8).
import re
import pandas as pd

# def test_run(data):
#     # Create mapping a1=1, b1=2, ..., h8=64
#     square_map = {}
#     counter = 1
#     for rank in range(1, 9):
#         for file in "abcdefgh":
#             square_map[f"{file}{rank}"] = counter
#             counter += 1

#     # Debug: print the mapping so you can check it
#     print("Square mapping:")
#     for square, num in square_map.items():
#         print(f"{square}: {num}")
    
#     def convert_moves(row):
#         # If the row isn't a string (e.g., NaN), return empty list
#         if not isinstance(row, str):
#             return []
#         # Extract all board squares like e4, h7
#         squares = re.findall(r'[a-h][1-8]', row)
#         # Convert each to its numeric code
#         return [square_map[sq] for sq in squares]
    
#     return data.apply(convert_moves)

# #data1['moves_num'] = test_run(data1['moves'])
# # works!

In [23]:
# now without the print statement.
def making_moves_num(data):
    """
    Encode every game's move text as a list of square numbers.

    Squares are numbered 1-64, walking the files a-h within a rank and the
    ranks from 1 up to 8:
        a1 = 1, b1 = 2, ..., h1 = 8, a2 = 9, ..., h8 = 64.

    Only board coordinates (a file letter a-h followed by a rank digit 1-8)
    are picked out of the text. Everything else -- piece letters, 'x', '+',
    '#', and castling (O-O, O-O-O), which contains no coordinate -- produces
    no output.

    Parameters
    ----------
    data : pandas.Series
        One string of moves per game. Elements that are not strings
        (for example missing values) are encoded as an empty list.

    Returns
    -------
    pandas.Series
        For each game, the list of square numbers in the order the squares
        appear in its move text.
    """
    files = "abcdefgh"
    # Same numbering as counting upward from 1 while looping ranks, then files.
    square_codes = {
        f"{file_letter}{rank}": (rank - 1) * len(files) + position
        for rank in range(1, 9)
        for position, file_letter in enumerate(files, start=1)
    }
    coordinate = re.compile(r'[a-h][1-8]')

    def encode_game(move_text):
        if isinstance(move_text, str):
            return [square_codes[name] for name in coordinate.findall(move_text)]
        return []

    return data.apply(encode_game)

# `make_num` drops the 'moves' column, so the move text has to come from the raw
# `data` frame. `data1` keeps the raw frame's index, so the result lines up row by row.
data1['moves_num'] = making_moves_num(data['moves'])
