## Getting game history dataset and creating the progressive move df

In [1]:
import pandas as pd

In [2]:
# Load the raw game records from the local CSV and preview the first two rows.
df = pd.read_csv("./data/games.csv")
df.head(2)

Unnamed: 0,id,rated,created_at,last_move_at,turns,victory_status,winner,increment_code,white_id,white_rating,black_id,black_rating,moves,opening_eco,opening_name,opening_ply
0,TZJHLljE,False,1504210000000.0,1504210000000.0,13,outoftime,white,15+2,bourgris,1500,a-00,1191,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,D10,Slav Defense: Exchange Variation,5
1,l1NXvwaE,True,1504130000000.0,1504130000000.0,16,resign,black,5+10,a-00,1322,skinnerua,1261,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,B00,Nimzowitsch Defense: Kennedy Variation,4


In [3]:
df.shape

(20058, 16)

In [4]:
df.columns

Index(['id', 'rated', 'created_at', 'last_move_at', 'turns', 'victory_status',
       'winner', 'increment_code', 'white_id', 'white_rating', 'black_id',
       'black_rating', 'moves', 'opening_eco', 'opening_name', 'opening_ply'],
      dtype='object')

In [5]:
# Keep only the move list and the opening label. The explicit .copy() makes this an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning or depend on view-vs-copy behavior.
df = df[["moves", "opening_name"]].copy()
df.head()

Unnamed: 0,moves,opening_name
0,d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5...,Slav Defense: Exchange Variation
1,d4 Nc6 e4 e5 f4 f6 dxe5 fxe5 fxe5 Nxe5 Qd4 Nc6...,Nimzowitsch Defense: Kennedy Variation
2,e4 e5 d3 d6 Be3 c6 Be2 b5 Nd2 a5 a4 c5 axb5 Nc...,King's Pawn Game: Leonardis Variation
3,d4 d5 Nf3 Bf5 Nc3 Nf6 Bf4 Ng4 e3 Nc6 Be2 Qd7 O...,Queen's Pawn Game: Zukertort Variation
4,e4 e5 Nf3 d6 d4 Nc6 d5 Nb4 a3 Na6 Nc3 Be7 b4 N...,Philidor Defense


In [6]:
# For every game, build the list of cumulative move prefixes: "m1", "m1 m2", ...
# Slice ends are exclusive, so tokens[:end] with end = 1..len(tokens) keeps the
# first `end` moves.
df["progressive_moves"] = df["moves"].str.split().apply(
    lambda tokens: [" ".join(tokens[:end]) for end in range(1, len(tokens) + 1)]
)

In [7]:
df.iloc[0]["progressive_moves"]

['d4',
 'd4 d5',
 'd4 d5 c4',
 'd4 d5 c4 c6',
 'd4 d5 c4 c6 cxd5',
 'd4 d5 c4 c6 cxd5 e6',
 'd4 d5 c4 c6 cxd5 e6 dxe6',
 'd4 d5 c4 c6 cxd5 e6 dxe6 fxe6',
 'd4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3',
 'd4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+',
 'd4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3',
 'd4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5',
 'd4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4']

# We will first test with only the first 5000 entries

In [8]:
# One row per move prefix; keep the first 5000 rows and renumber them from 0.
target = (
    df.explode("progressive_moves")
    .head(5000)
    .reset_index(drop=True)
)

In [9]:
target.shape

(5000, 3)

In [10]:
# Split each prefix once from the right: everything before the final move goes to
# prev_moves, the final move to last_move. A single-move prefix has nothing to split,
# so its only move lands in prev_moves and last_move is left missing.
split_moves = target["progressive_moves"].str.rsplit(n=1, expand=True)
target[["prev_moves", "last_move"]] = split_moves
target = target.drop(columns=["progressive_moves", "moves"])
target.head()

Unnamed: 0,opening_name,prev_moves,last_move
0,Slav Defense: Exchange Variation,d4,
1,Slav Defense: Exchange Variation,d4,d5
2,Slav Defense: Exchange Variation,d4 d5,c4
3,Slav Defense: Exchange Variation,d4 d5 c4,c6
4,Slav Defense: Exchange Variation,d4 d5 c4 c6,cxd5


In [11]:
target.shape

(5000, 3)

In [12]:
def fix_prev_last_move(row):
    """Return the corrected (prev_moves, last_move) pair for one row.

    When ``last_move`` is missing, the single move of the row was stored in
    ``prev_moves``; it is moved into the last-move slot and the previous-moves
    slot is filled with the literal string ``"None"``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``"prev_moves"`` and ``"last_move"``.

    Returns
    -------
    pd.Series
        Two elements: the previous-moves value, then the last-move value.
    """
    # pd.isna treats None, NaN and pd.NA alike; `== None` only matches None and
    # makes `if` raise when the value is pd.NA.
    if pd.isna(row["last_move"]):
        return pd.Series(["None", row["prev_moves"]])
    return pd.Series([row["prev_moves"], row["last_move"]])

In [13]:
# Run the row-level fix over the whole frame; each returned two-element Series
# supplies the new prev_moves / last_move values for that row.
fixed_moves = target.apply(fix_prev_last_move, axis=1)
target[["prev_moves", "last_move"]] = fixed_moves
target.head()

Unnamed: 0,opening_name,prev_moves,last_move
0,Slav Defense: Exchange Variation,,d4
1,Slav Defense: Exchange Variation,d4,d5
2,Slav Defense: Exchange Variation,d4 d5,c4
3,Slav Defense: Exchange Variation,d4 d5 c4,c6
4,Slav Defense: Exchange Variation,d4 d5 c4 c6,cxd5


In [14]:
def split_dataframe(df: pd.DataFrame, num_files: int, rows_per_file: int, output_dir: str = "./data"):
    """Write consecutive row chunks of ``df`` to numbered CSV files.

    File ``i`` (0-based) receives rows ``i * rows_per_file`` up to, but not
    including, ``(i + 1) * rows_per_file``, clamped to the end of the frame.
    The final chunk may therefore be shorter than the rest. If ``num_files``
    asks for more chunks than the frame holds, the extra files contain only
    the header row.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to split; it is not modified.
    num_files : int
        Number of files to write, named ``data_00.csv``, ``data_01.csv``, ...
    rows_per_file : int
        Maximum number of rows per file; must be positive.
    output_dir : str, default "./data"
        Directory that receives the files; created if it does not exist.

    Raises
    ------
    ValueError
        If ``rows_per_file`` is not positive.
    """
    from pathlib import Path

    if rows_per_file <= 0:
        raise ValueError("rows_per_file must be a positive integer")

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    total_rows = df.shape[0]
    for i in range(num_files):
        start_row = i * rows_per_file
        # min() alone handles a short final chunk. Do not special-case the last
        # file with `total_rows % rows_per_file`: that remainder is 0 when the
        # rows divide evenly, which would leave the last file empty.
        end_row = min(start_row + rows_per_file, total_rows)

        df.iloc[start_row:end_row].to_csv(out_dir / f"data_{i:02d}.csv", index=False)


In [15]:
# Request 20 output files with 500 rows per file for the sampled frame.
split_dataframe(target, num_files=20, rows_per_file=500)