# RNN Test

### Preprocess

Setting up data to be used in RNN (turning moves into list of numbers from dictionary mapping)

In [12]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split 

def MostCommonWords(data):
	data_dist = FreqDist(data)
	return data_dist

def processGames(games, truncate_ply, opening_cats):
    data = [games['moves'], games['opening_eco']]
    ply = games['opening_ply']
    headers = ['moves', 'opening']
    data = pd.concat(data, axis=1, keys=headers)
    maxPly = 14

    for index, row in data.iterrows():
        row['opening'] = opening_cats[row['opening'][0]]
        ply[index] = ply[index]+1 if (ply[index] % 2 != 0) else ply[index]
        moveCount = 0
        moves = word_tokenize(row['moves'])
        formattedMoves = []
        for move in moves:
            if(truncate_ply): 
                if (moveCount >= (ply[index])):
                    break
            elif (moveCount >= maxPly):
                break
            if(moveCount%2==0):
                # uncomment this top line to use moveCount number in the moves list i.e. 1. d4 e5, 2. ...
                # formattedMoves.append(str(int(moveCount/2)+1) + '.' + str(move))
                formattedMoves.append(str(move))
            else:
                formattedMoves[int(moveCount/2)] += ' ' + str(move)
            moveCount += 1
        row['moves'] = formattedMoves
#         row['moves'] = moves

    mcw = []
    for key in opening_cats:
        rows = data.loc[data['opening'] == opening_cats[key]]
        # print(key, rows)
        moves = []
        for index, row in rows.iterrows():
            moves += row['moves']
        mcw.append(MostCommonWords(moves))
    return data, mcw

to_exclude = [i for i in range(2, 19800)]# specifies the amount of data to load in (leave list empty to load all data)
games = pd.read_csv('games.csv', header=0, encoding='latin-1', skiprows=to_exclude)
opening_cats = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
labels = []
for index, row in games.iterrows():
    labels.append(opening_cats[row['opening_eco'][0]])
games = pd.concat([games, pd.DataFrame({'label': labels})], axis=1)
headers = list(games.columns.values)

X_train, X_test, y_train, y_test = train_test_split(games.to_numpy(), labels, test_size=0.33)
X_train = pd.DataFrame(data=X_train, columns=headers)
X_test = pd.DataFrame(data=X_test, columns=headers)

games, mcw = processGames(X_train, True, opening_cats)
test, mcw_test = processGames(X_test, False, opening_cats)

dictionary = set()
for frqdist in mcw:
    dictionary = dictionary.union(set(frqdist.keys()))
    
move2id = dict()
moveCounter = 1
for move in dictionary:
    move2id[move] = moveCounter
    moveCounter += 1
print(move2id)

{'Kxc7': 1, 'Re4': 2, 'Ra2': 3, 'fxg4': 4, 'Ra7+': 5, 'd2': 6, 'Qb7+': 7, 'f5+': 8, 'Rexa6': 9, 'Rh7': 10, 'Ra3': 11, 'Qxg2+': 12, 'g1=Q+': 13, 'Rcd5': 14, 'O-O-O': 15, 'Nxd1': 16, 'Bc2+': 17, 'Kf7': 18, 'g3+': 19, 'Bxd3+': 20, 'Ng6+': 21, 'Qe5': 22, 'Qxc8': 23, 'Kg5': 24, 'Be2': 25, 'Qxb6': 26, 'Ngxe5': 27, 'Rag8': 28, 'Bxf7': 29, 'Qc5': 30, 'Nf6': 31, 'Kxh6': 32, 'cxd6': 33, 'Qh7': 34, 'Nxc6': 35, 'Kh1': 36, 'Nxd6': 37, 'dxe4': 38, 'Rh6': 39, 'Nxg2+': 40, 'Rfe8': 41, 'Qxe7+': 42, 'Qh6+': 43, 'g6': 44, 'Qxf3': 45, 'O-O-O+': 46, 'Bxd1': 47, 'Kb6': 48, 'Nxd2': 49, 'Nf2': 50, 'Rxg8+': 51, 'Qxe2': 52, 'Rxf3+': 53, 'Rgxg2': 54, 'Rhe8': 55, 'Ba4+': 56, 'Qxg1': 57, 'exd4': 58, 'Nxg4': 59, 'Be4': 60, 'Bxd4': 61, 'Bd4+': 62, 'Qf8': 63, 'Ncd5': 64, 'Qxa8+': 65, 'Bxe7': 66, 'R8h3+': 67, 'Rb8+': 68, 'Rcd1': 69, 'Qxa1': 70, 'Bc3+': 71, 'Qc7+': 72, 'Rxa1': 73, 'Rxa5': 74, 'exf4': 75, 'g4': 76, 'Nf3': 77, 'Rxd8+': 78, 'Bg6': 79, 'Kd8': 80, 'Qxe3': 81, 'Rfb8': 82, 'Ndf6': 83, 'Kxg7': 84, 'Qc3': 85, '