### Step 1: Data Processing Functions
To import and process the data from our CSV file, we need functions that load the data, extract the relevant information, convert that information into the desired labels, and numericize the move text into data the model can interpret.

In [103]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

def loadData(filename):
    """Read the CSV at `filename` and return its contents as a pandas DataFrame."""
    return pd.read_csv(filename)

def openType(openStr):
    """Collapse an opening code into one of 13 opening subtypes.

    `openStr` is a letter followed by a number (for example 'B33'). The result
    is the letter plus the lower bound of the numeric range the number falls in:

        A: 00, 40, 45, 50, 80
        B: 00, 20
        C: 00, 20
        D: 00, 70
        E: 00, 60

    A code whose letter is not A-E is returned as the bare letter.
    """
    family = openStr[0]
    code = int(openStr[1:])

    #lower bounds of every range above '00', listed from highest to lowest
    cutoffs = {
        'A': (80, 50, 45, 40),
        'B': (20,),
        'C': (20,),
        'D': (70,),
        'E': (60,),
    }

    if family not in cutoffs:
        return family

    #the first bound the code reaches identifies its range
    for bound in cutoffs[family]:
        if code >= bound:
            return family + str(bound)
    return family + '00'

def dataExtraction(baseData):
    """Build the move dataset and the label set from a games DataFrame.

    Columns are read by position: column 12 holds the whitespace-separated
    move text, column 13 the opening code, and column 15 the opening ply
    (the number of moves that make up the opening).

    Returns:
        data:   one list per game with its first (opening ply) moves
        labels: one opening subtype per game, as produced by openType
    """
    #select whole columns with .iloc; integer keys on a label-indexed Series (row[12])
    #are deprecated as positional lookups in recent pandas releases
    moveTexts = baseData.iloc[:, 12]
    openCodes = baseData.iloc[:, 13]
    plyCounts = baseData.iloc[:, 15]

    data = []
    labels = []
    for moveText, openCode, ply in zip(moveTexts, openCodes, plyCounts):
        #slicing keeps whatever moves exist when the ply count exceeds the recorded
        #moves instead of raising IndexError; max() keeps a negative count empty
        data.append(moveText.split()[:max(int(ply), 0)])
        labels.append(openType(openCode))

    return data, labels

def numericize(textData):
    """Convert text data to integer ids.

    `textData` is either a flat sequence of labels or a list of lists of moves;
    the type of its first element decides which. Ids are assigned in order of
    first appearance.

    Returns:
        numericDic: dictionary mapping each distinct text item to its id
        conversion: `textData` with every item replaced by its id (same nesting)

    Flat labels are numbered from 0. Moves inside nested lists are numbered from 1
    so that 0 stays free as the padding value. An empty input yields ({}, []).
    """
    numericDic = {}
    conversion = []

    #nothing to inspect or convert; avoids an IndexError on textData[0]
    if len(textData) == 0:
        return numericDic, conversion

    if isinstance(textData[0], list):
        for text in textData:
            #len(numericDic) + 1 is evaluated before insertion, so ids start at 1
            conversion.append([numericDic.setdefault(move, len(numericDic) + 1) for move in text])
    else:
        for label in textData:
            conversion.append(numericDic.setdefault(label, len(numericDic)))

    return numericDic, conversion

#tells the user that this cell's imports and function definitions have finished executing
print("Definitions and imports have finished loading.")

Definitions and imports have finished loading.


### Step 2: Preprocessing the Data
Using the functions defined above, the data is preprocessed into its desired format. In addition to the steps already mentioned, each move sequence is padded to the length of the longest opening (the maximum opening ply), and the data is divided into training and testing subsets with an 80-20 split.

In [104]:
#imports preprocessing libraries
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split

#fixed seed so the train/test split is the same on every run
SPLIT_SEED = 42

#loads games.csv and numericizes it with the functions defined in the previous cell
data = loadData("games.csv")
maxMoves = int(data.iloc[:, 15].max())   #largest opening ply (column 15); used as the padded sequence length
moves, labels = dataExtraction(data)
moveDic, numMoves = numericize(moves)
labelDic, numLabels = numericize(labels)
numLabels = np.array(numLabels)   #reformats the label data into a numpy array for consistent data structures

#pads every move sequence with 0's up to maxMoves so all inputs have the same size
#(move ids start at 1, so 0 never collides with a real move)
numMoves = sequence.pad_sequences(numMoves, maxlen=maxMoves)

#builds stratified training and testing subsets at an 80-20 split
X_train, X_test, y_train, y_test = train_test_split(
    numMoves,
    numLabels,
    test_size=0.2,
    stratify=numLabels,
    random_state=SPLIT_SEED,
)

#tells the user that this cell has finished executing
print("Data has been loaded and preprocessed.")

Data has been loaded and preprocessed.


### Step 3: Creating and compiling the RNN model
Here we build the recurrent neural network model using our imported tools, compile the model, and train it on our data.

In [105]:
#imports the necessary libraries for model creation
from keras import Sequential
from keras.utils import to_categorical
from keras.layers import Embedding, LSTM, Dense, Dropout

#number of output classes, taken from the label dictionary instead of being hard-coded
num_classes = len(labelDic)

#one-hot encodes the integer labels; num_classes is passed explicitly so the train and test
#matrices always have the same width, even if a class is missing from one subset
y_train_bin = to_categorical(y_train, num_classes=num_classes)
y_test_bin = to_categorical(y_test, num_classes=num_classes)

#builds our model layer by layer
embedding_size = 300   #it is challenging to lock down a perfect value, research says 100-300 is standard
model = Sequential()   #initializes the model
#maps each move id to a learned dense vector; the vocabulary size is len(moveDic)+1
#because move ids start at 1 and id 0 is the padding value
model.add(Embedding(len(moveDic)+1, embedding_size, input_length=maxMoves))
model.add(LSTM(100))   #adds a long short-term memory layer with 100 output units
#one output per label; 'softmax' turns the outputs into a probability distribution over the labels
model.add(Dense(num_classes, activation='softmax'))

#compiles the model using categorical crossentropy for multi-class (one label per game) classification
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#trains the model over the specified number of epochs, holding out 20% of the training data for validation
batch_size = 64 #potentially trivial
num_epochs = 5
#keeps the training history for later inspection instead of echoing its repr as the cell output
history = model.fit(X_train, y_train_bin, batch_size=batch_size, epochs=num_epochs, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x14f7971b850>

### Step 4: Scoring the model
Note - multiple trials may require rerunning the program from step 2.

In [106]:
#evaluates the trained model on the held-out test subset; with the model compiled using
#metrics=['accuracy'], the result holds the loss followed by the accuracy
scores = model.evaluate(X_test, y_test_bin, verbose=0)
test_accuracy = scores[1]
print('Test accuracy:', test_accuracy)

Test accuracy: 0.9947656989097595
