# RNN Test

### Preprocess

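Set up the data for the RNN: each game's move text is grouped into move pairs, and every distinct move pair is mapped to an integer id through a dictionary, so each game becomes a list of numbers.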

In [19]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split 

def MostCommonWords(data):
    """Count how often each item occurs in ``data``.

    Args:
        data: Iterable of hashable samples (the callers in this file pass a
            flat list of move strings).

    Returns:
        An NLTK ``FreqDist`` mapping each distinct sample to its count.
    """
    return FreqDist(data)

def processGames(games, truncate_ply, opening_cats):
    """Group each game's moves into move pairs and count them per category.

    The input frame is not modified. Results are built as new columns rather
    than written back through ``iterrows()`` rows, because pandas does not
    guarantee that such writes reach the underlying frame.

    Args:
        games: DataFrame with the columns 'moves' (move text, tokenized with
            ``word_tokenize``), 'opening_eco' (opening code; its first
            character is looked up in ``opening_cats``) and 'opening_ply'
            (number of plies to keep when ``truncate_ply`` is true).
        truncate_ply: If true, keep only the first 'opening_ply' plies of each
            game, rounded up to an even number so the last pair is complete.
            If false, keep the first 14 plies of every game.
        opening_cats: Mapping from the first character of the opening code to
            a category id.

    Returns:
        A tuple ``(data, mcw)``:

        * ``data`` is a DataFrame indexed like ``games`` with a 'moves' column
          (list of strings, each holding two consecutive plies joined by a
          space; the last entry holds a single ply when the kept ply count is
          odd) and an 'opening' column (category id).
        * ``mcw`` is a list with one frequency distribution per key of
          ``opening_cats``, in key order, counting the move pairs of all
          games in that category.

    Raises:
        KeyError: If the first character of an opening code is not a key of
            ``opening_cats``.
    """
    maxPly = 14  # ply cap used when truncate_ply is false

    openings = []
    grouped_moves = []
    columns = zip(games['moves'], games['opening_eco'], games['opening_ply'])
    for move_text, eco, opening_ply in columns:
        openings.append(opening_cats[eco[0]])

        if truncate_ply:
            # Round an odd ply count up so the final pair includes the reply.
            limit = opening_ply + 1 if opening_ply % 2 != 0 else opening_ply
        else:
            limit = maxPly
        # A non-positive limit keeps no moves at all.
        limit = max(int(limit), 0)

        # NOTE(review): word_tokenize is an English-text tokenizer; confirm it
        # keeps move suffixes such as '#' attached, otherwise str.split() may
        # be the safer choice here.
        plies = word_tokenize(move_text)[:limit]

        # Join plies two at a time: "white black". To prefix move numbers
        # (i.e. "1.d4 e5"), prepend str(i // 2 + 1) + '.' to each pair; to
        # separate moves by single ply, append ``plies`` itself instead.
        grouped_moves.append(
            [' '.join(plies[i:i + 2]) for i in range(0, len(plies), 2)]
        )

    data = pd.DataFrame(
        {
            'moves': pd.Series(grouped_moves, index=games.index, dtype=object),
            'opening': pd.Series(openings, index=games.index, dtype=object),
        },
        index=games.index,
    )

    # Collect every move pair per category in a single pass over the games.
    moves_by_category = {}
    for category, pairs in zip(openings, grouped_moves):
        moves_by_category.setdefault(category, []).extend(pairs)

    mcw = [
        MostCommonWords(moves_by_category.get(opening_cats[key], []))
        for key in opening_cats
    ]
    return data, mcw

# Rows of games.csv to skip while loading, which limits the amount of data
# read in (leave the list empty to load all data).
to_exclude = list(range(2, 19800))
games = pd.read_csv('games.csv', header=0, encoding='latin-1', skiprows=to_exclude)
opening_cats = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}

# Label every game with the category of its opening code's first character.
labels = [opening_cats[eco[0]] for eco in games['opening_eco']]
games = pd.concat([games, pd.DataFrame({'label': labels})], axis=1)
headers = list(games.columns.values)

# NOTE(review): no random_state is passed, so the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(games.to_numpy(), labels, test_size=0.33)
# train_test_split received a plain array, so rebuild frames with the headers.
X_train = pd.DataFrame(data=X_train, columns=headers)
X_test = pd.DataFrame(data=X_test, columns=headers)

games, mcw = processGames(X_train, True, opening_cats)
test, mcw_test = processGames(X_test, False, opening_cats)

# Vocabulary: every distinct move string seen in the training games.
dictionary = set()
for frqdist in mcw:
    dictionary.update(frqdist.keys())

# Sort before numbering so the move -> id mapping is the same on every run;
# iteration order of a set of strings changes between interpreter sessions.
# Ids start at 1 (presumably leaving 0 free for padding; confirm downstream).
move2id = {move: move_id for move_id, move in enumerate(sorted(dictionary), start=1)}

# Replace each game's move strings with their ids. Assign a new column rather
# than writing through iterrows() rows, which may be copies of the data.
games['moves'] = games['moves'].map(
    lambda moves: [move2id[move] for move in moves]
)

print(games.iloc[0]['moves'])

[113, 78, 132, 3, 76, 70, 66, 142]
