# Naive Bayes Classifier for Chess Openings

### Code:

In [6]:
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 


def LoadData(num_to_exclude, truncate_ply, move_tokenizer):
    to_exclude = [i for i in range(1, num_to_exclude)]# specifies the amount of data to load in (leave list empty to load all data)
    games = pd.read_csv('games.csv', header=0, encoding='latin-1', skiprows=to_exclude)
    opening_cats = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
    labels = []
    for index, row in games.iterrows():
        labels.append(opening_cats[row['opening_eco'][0]])
    games = pd.concat([games, pd.DataFrame({'label': labels})], axis=1)
    headers = list(games.columns.values)

    X_train, X_test, y_train, y_test = train_test_split(games.to_numpy(), labels, test_size=0.33)
    X_train = pd.DataFrame(data=X_train, columns=headers)
    X_test = pd.DataFrame(data=X_test, columns=headers)

    # dictionary for how to tokenize moves into a list
    # by ply: split by  each move of white or black
    # by turn: split by each turn i.e. one white move and one black move
    # by turn with number: split by turn and add the number of the turn to the beginning of the string (psuedo-dependency)
    move_tokenizer_options = {'by ply': 0, 'by turn': 1, 'by turn with number': 2}

    games, mcw = processGames(X_train, truncate_ply[0], move_tokenizer_options[move_tokenizer], opening_cats)
    test, mcw_test = processGames(X_test, truncate_ply[1], move_tokenizer_options[move_tokenizer], opening_cats)
    return games, test, mcw

def processGames(games, truncate_ply, move_tokenizer, opening_cats):
    data = [games['moves'], games['opening_eco']]
    ply = games['opening_ply']
    headers = ['moves', 'opening']
    data = pd.concat(data, axis=1, keys=headers)
    maxPly = 14

    for index, row in data.iterrows():
        row['opening'] = opening_cats[row['opening'][0]]
        ply[index] = ply[index]+1 if (ply[index] % 2 != 0) else ply[index]
        moveCount = 0
        moves = word_tokenize(row['moves'])

        if (move_tokenizer == 0):
            if(truncate_ply):
                row['moves'] = moves[0:ply[index]+1]
            else:
                row['moves'] = moves[0:maxPly+1]
        else:
            formattedMoves = []
            for move in moves:
                if(truncate_ply): 
                    if (moveCount >= (ply[index])):
                        break
                elif (moveCount >= maxPly):
                    break

                if(move_tokenizer == 1):
                    if(moveCount%2==0):
                        formattedMoves.append(str(move))
                    else:
                        formattedMoves[int(moveCount/2)] += ' ' + str(move)
                if(move_tokenizer == 2):
                    if(moveCount%2==0):
                        formattedMoves.append(str(int(moveCount/2)+1) + '.' + str(move))
                    else:
                        formattedMoves[int(moveCount/2)] += ' ' + str(move)	

                moveCount += 1
            row['moves'] = formattedMoves

    mcw = []
    for key in opening_cats:
        rows = data.loc[data['opening'] == opening_cats[key]]
        # print(key, rows)
        moves = []
        for index, row in rows.iterrows():
            moves += row['moves']
        mcw.append(MostCommonWords(moves))
    return data, mcw


# This function calculates the requency of words using NLTK
# Input: data in string format
# Output: data_dist is a data dictionary like NLTK object
def MostCommonWords(data):
    data_dist = FreqDist(data)
    return data_dist

def Prob_Word_GivenY(word, train_data, numWords, alpha, y):
    sum = 0
    count_y = 0
    for i, row in train_data.iterrows():
        if(row['feature_list'].get(word)):
            if(row['opening']==y and row['feature_list'].get(word)>0):
                sum += 1
                count_y += 1
    return (sum + alpha) / (count_y + numWords*alpha)

def Classify2(moves, p_category, train_splits, numWords, alpha, categories):
    p_cat_given_moves = [x for x in p_category]

    for move in moves:
        for key, value in categories.items():
            p_cat_given_moves[value] *= Prob_Word_GivenY(move, train_splits[value], numWords, alpha, value)
    return p_cat_given_moves.index(max(p_cat_given_moves))

def Training2(train_data, train_wc, categories, test_data):
    dictionary = set()
    for frqdist in train_wc:
        dictionary = dictionary.union(set(frqdist.keys()))

    m = [len(x[1]) for x in train_data.groupby('opening')]

    alpha = 1

    p = [(m_cat + 1) / (sum(m) + len(categories)*alpha) for m_cat in m]

    num_words = [len(frqdist) for frqdist in train_wc]

    train_data['feature_list'] = ""
    for i, row in train_data.iterrows():
        word_map = {}
        for word in dictionary:
            word_map[word] = row['moves'].count(word)
        row['feature_list'] = word_map

    train_splits = [x[1] for x in train_data.groupby('opening')]

    correct = 0
    shape = np.zeros(shape=(len(categories), len(categories)))
    conf_matrix = pd.DataFrame(shape)
    for i, row in test_data.iterrows():
        prediction = Classify2(row['moves'], p, train_splits, sum(num_words), alpha, categories)
        conf_matrix.iat[prediction, row['opening']] += 1
        correct += 1 if prediction == row['opening'] else 0
        # print(prediction, row['opening'])
    print('ACCURACY: ', correct/len(test_data))
    print(conf_matrix)


# def main():
#     opening_cats = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
#     games, test,  mcw = LoadData()
#     Training2(games, mcw, opening_cats, test)

# main()

### Tests:

In [9]:
opening_cats = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
games, test, mcw = LoadData(19500, [True, False], 'by turn with number')
Training2(games, mcw, opening_cats, test)

ACCURACY:  0.8648648648648649
      0     1     2     3     4
0  28.0   0.0   0.0   1.0   2.0
1   2.0  44.0   0.0   1.0   1.0
2   6.0   6.0  62.0   4.0   0.0
3   0.0   0.0   0.0  14.0   0.0
4   1.0   1.0   0.0   0.0  12.0
