# Bi-LSTM Grade Classifier

In this notebook we train a Bi-LSTM to classify MoonBoard problems by grade. We also experiment with a Word2Vec-based hold embedding.

In [18]:
import json
import pandas as pd
import numpy as np
import re
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import datetime
from sklearn.model_selection import train_test_split

In [3]:
# Read data from file:
# JSON text is UTF-8 by specification; pin the encoding so that loading the
# file does not depend on the platform's default locale.
with open('problems.json', 'r', encoding='utf-8') as fp:
    problems_dict = json.load(fp)

In [8]:
# Process data
# orient='index' makes every top-level key of `problems_dict` a row label;
# the fields of each entry become the columns.
df = pd.DataFrame.from_dict(problems_dict, orient = 'index')
# Count the missing values per column to see which fields need cleaning.
df.isna().sum()

problem_name     0
info             0
url              0
moves           22
dtype: int64

In [9]:
# Discard every row that has a missing value in any column
# (the count above reports 22 rows without `moves`).
df = df.dropna()

In [13]:
def hold_index(d, board_width=11):
    """Convert a move to the flat, zero-based index of its hold on the board.

    The move's ``'Description'`` is read as one column letter followed by a
    row number (e.g. ``'A1'``, ``'k18'``). Column ``A`` maps to 0 and row 1
    maps to 0, and the result is ``board_width * row + column``.

    Args:
        d: Mapping with a ``'Description'`` string such as ``'F5'``.
            Surrounding whitespace and letter case are ignored.
        board_width: Number of columns on the board (11 by default).

    Returns:
        int: ``board_width * (row - 1) + (column_letter - 'A')``.

    Raises:
        ValueError: If the description does not start with exactly one
            letter followed by at least one digit.
    """
    description = d['Description'].strip()
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    match = re.match(r'([A-Za-z])(\d+)', description)
    if match is None:
        raise ValueError(
            "Cannot parse hold description {!r}; expected a column letter "
            "followed by a row number, e.g. 'A1'.".format(d['Description'])
        )

    letter, number = match.groups()
    # Both coordinates are shifted to be zero-based.
    w = ord(letter.upper()) - ord('A')
    h = int(number) - 1

    return board_width * h + w


def get_sequence(moves):
    """Return the hold index of every move in `moves`, keeping the move order."""
    return [hold_index(step) for step in moves]


# Encode each problem as its list of hold indices, store the result as a new
# column, and keep a plain Python list of those sequences for the models.
df['Move sequence'] = df['moves'].map(get_sequence)
problems = df['Move sequence'].tolist()
len(problems)

14902

In [16]:
# Process labels
# Grade labels in the order of their class ids: a label's id is its position.
grade_names = [
    '5+',
    '6A', '6A+', '6B', '6B+', '6C', '6C+',
    '7A', '7A+', '7B', '7B+', '7C', '7C+',
    '8A', '8A+', '8B', '8B+', '8C', '8C+',
]
grade_map = {name: class_id for class_id, name in enumerate(grade_names)}

# The third entry of each problem's `info` carries the grade; only its first
# whitespace-separated token is the label that appears in `grade_map`.
grades = [grade_map[info[2].split()[0]] for info in df['info']]
len(grades)

14902

In [19]:
# Hold out 20% of all problems as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    problems,
    grades,
    test_size=0.2,
    random_state=1,
)

# ... then take 25% of the remaining 80% for validation (a 60/20/20 split).
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=1,
)

## Define Model

In [24]:
torch.manual_seed(1)


class BiLSTM(nn.Module):
    """Bidirectional LSTM that scores one problem against the grade classes.

    A problem is a 1-D ``LongTensor`` of hold indices. The holds are embedded,
    run through a single-layer bidirectional LSTM (``hidden_dim // 2`` units
    per direction), and the final forward and backward hidden states are
    summed and projected to ``num_classes`` logits.

    Args:
        vocab_size: Number of distinct hold indices (rows of the embedding).
        embedding_dim: Size of each hold embedding.
        hidden_dim: Total LSTM width; each direction gets ``hidden_dim // 2``.
        num_classes: Number of grade classes to score (19 by default).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=19):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.num_classes = num_classes

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # The two directions are summed (not concatenated) before this layer,
        # hence the input width of hidden_dim // 2.
        self.hidden2tag = nn.Linear(hidden_dim // 2, num_classes)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Draw a fresh random (h_0, c_0) pair on the model's own device.

        The device is read from the embedding weights, so the class does not
        depend on a notebook-level ``device`` variable being defined first.
        """
        # NOTE(review): the random initial state makes evaluation
        # non-deterministic; zeros would be the conventional choice.
        target = self.word_embeds.weight.device
        shape = (2, 1, self.hidden_dim // 2)  # (directions, batch=1, units)
        return (torch.randn(*shape).to(target),
                torch.randn(*shape).to(target))

    def _get_lstm_features(self, sentence):
        """Return logits of shape (1, num_classes) for one hold sequence."""
        if len(sentence) == 0:
            raise ValueError("Cannot classify an empty hold sequence.")

        self.hidden = self.init_hidden()
        # nn.LSTM expects (seq_len, batch, features); the batch size is 1.
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        _, (h_n, _) = self.lstm(embeds, self.hidden)

        # h_n[-2] is the last forward state, h_n[-1] the last backward state.
        fbhn = h_n[-2, :, :] + h_n[-1, :, :]

        return self.hidden2tag(fbhn)

    def forward(self, sentence):
        """Score a 1-D LongTensor of hold indices; returns (1, num_classes)."""
        return self._get_lstm_features(sentence)

## Training

In [32]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One embedding row per hold index (11*18 of them), 20-dimensional embeddings,
# and 50 LSTM units in total.
model = BiLSTM(vocab_size=11*18, embedding_dim=20, hidden_dim=50).to(device)

# Plain SGD with a small L2 penalty on the weights.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    weight_decay=1e-4,
)

In [33]:
# Each epoch will take about 1-2 minutes

criterion = nn.CrossEntropyLoss()
for epoch in range(20):
    time1 = datetime.datetime.now()

    # ---- Training pass: one optimizer step per problem (batch size 1) ----
    train_loss = 0
    train_correct = 0
    model.train()
    for idxs, grade in zip(X_train, y_train):
        # PyTorch accumulates gradients, so clear them before each instance.
        model.zero_grad()

        # Turn the problem and its label into tensors on the model's device.
        problem_in = torch.tensor(idxs, dtype=torch.long).to(device)
        grade = torch.tensor([grade], dtype=torch.long).to(device)

        # Forward pass and accuracy bookkeeping.
        p = model(problem_in)
        _, pred = torch.max(p, 1)
        if pred == grade:
            train_correct += 1

        # Loss, gradients, parameter update.
        loss = criterion(p, grade)
        loss.backward()
        optimizer.step()

        # Losses are summed (not averaged) over the problems of the epoch.
        train_loss += loss.item()

    # ---- Validation pass: no parameter updates, so skip autograd tracking ----
    model.eval()
    val_loss = 0
    val_correct = 0
    with torch.no_grad():
        for idxs, grade in zip(X_val, y_val):
            problem_in = torch.tensor(idxs, dtype=torch.long).to(device)
            grade = torch.tensor([grade], dtype=torch.long).to(device)
            p = model(problem_in)
            _, pred = torch.max(p, 1)
            if pred == grade:
                val_correct += 1
            loss = criterion(p, grade)
            val_loss += loss.item()

    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, Training acc: %.2f, val loss: %.2f, val acc: %.2f, time: %.2fs" %(epoch+1, train_loss, train_correct/len(X_train), val_loss, val_correct/len(X_val), (time2-time1).total_seconds()))


Epoch:1, Training loss: 19784.21, Training acc: 0.26, val loss: 6200.07, val acc: 0.27, time: 47.90s
Epoch:2, Training loss: 18012.17, Training acc: 0.29, val loss: 5950.08, val acc: 0.29, time: 48.24s
Epoch:3, Training loss: 17387.26, Training acc: 0.31, val loss: 5861.43, val acc: 0.29, time: 47.99s
Epoch:4, Training loss: 17045.92, Training acc: 0.32, val loss: 5818.35, val acc: 0.30, time: 48.19s
Epoch:5, Training loss: 16802.72, Training acc: 0.32, val loss: 5793.82, val acc: 0.30, time: 48.03s
Epoch:6, Training loss: 16632.71, Training acc: 0.33, val loss: 5796.31, val acc: 0.30, time: 49.86s
Epoch:7, Training loss: 16465.19, Training acc: 0.33, val loss: 5789.23, val acc: 0.30, time: 48.09s
Epoch:8, Training loss: 16302.94, Training acc: 0.34, val loss: 5797.99, val acc: 0.30, time: 48.14s
Epoch:9, Training loss: 16149.53, Training acc: 0.35, val loss: 5807.84, val acc: 0.30, time: 48.43s
Epoch:10, Training loss: 16014.88, Training acc: 0.35, val loss: 5801.75, val acc: 0.31, ti

# Hold Embedding

We train a Word2Vec model to obtain a vector representation of each hold on the MoonBoard. We regard each problem as a sentence in which each word is a hold on the MoonBoard.

In [36]:
# Spell every hold index out as a string token, so that each problem becomes a
# "sentence" of hold "words".
problem_string = [list(map(str, seq)) for seq in problems]

In [37]:
# We will use a gensim word2vec model


from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec



# Callback class used for reporting loss during training process
class epochLoss(CallbackAny2Vec):
    """
    Callback that prints the training loss once per finished epoch.

    The value from `get_latest_training_loss()` is treated as a running
    total: the first epoch prints it as is, every later epoch prints the
    increase since the previous epoch.
    """
    def __init__(self):
        # 1-based number of the epoch that is about to finish.
        self.epoch = 1

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        # `loss_previous_step` only exists from the second epoch onwards.
        epoch_loss = loss if self.epoch == 1 else loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.loss_previous_step = loss
        self.epoch += 1



In [38]:
#train word embedding model
# Skip-gram (sg=1) Word2Vec over the hold "sentences": 20-dimensional vectors,
# a context window of 2, and 50 passes over the data. Tokens seen fewer than
# min_count=5 times are left out of the vocabulary. The epochLoss callback
# prints the loss after every pass.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` — confirm the installed version.
# NOTE(review): this rebinds `model`, which earlier held the BiLSTM classifier.
model = Word2Vec(sentences=problem_string, size=20, window=2, min_count=5, workers=4, sg=1, iter = 50, compute_loss = True, callbacks=[epochLoss()])

Loss after epoch 1: 102076.3203125
Loss after epoch 2: 89125.1953125
Loss after epoch 3: 88068.359375
Loss after epoch 4: 87429.28125
Loss after epoch 5: 86599.25
Loss after epoch 6: 87019.65625
Loss after epoch 7: 86874.4375
Loss after epoch 8: 116645.0625
Loss after epoch 9: 87252.1875
Loss after epoch 10: 87123.875
Loss after epoch 11: 87192.0
Loss after epoch 12: 87564.25
Loss after epoch 13: 87374.75
Loss after epoch 14: 88479.75
Loss after epoch 15: 87544.875
Loss after epoch 16: 87516.75
Loss after epoch 17: 87109.125
Loss after epoch 18: 87184.125
Loss after epoch 19: 87008.75
Loss after epoch 20: 116666.625
Loss after epoch 21: 86903.625
Loss after epoch 22: 86375.625
Loss after epoch 23: 86655.75
Loss after epoch 24: 84640.625
Loss after epoch 25: 84359.75
Loss after epoch 26: 85577.5
Loss after epoch 27: 84979.75
Loss after epoch 28: 84451.25
Loss after epoch 29: 84311.25
Loss after epoch 30: 84656.75
Loss after epoch 31: 84381.0
Loss after epoch 32: 83267.25
Loss after epoc

In [44]:
# Every index from 0 to 11*18 - 1, plus the same values as string tokens.
holds = np.arange(11*18)
holds_list = list(map(str, holds))


In [40]:

EMBEDDING_DIM = 20

# Row i holds the Word2Vec vector of hold token holds_list[i]. Tokens that are
# missing from the Word2Vec vocabulary keep an all-zero row.
# `model` is the gensim Word2Vec model at this point, not the BiLSTM.
embedding_matrix = np.zeros((len(holds_list), EMBEDDING_DIM), dtype=np.float32)
for row, hold in enumerate(holds_list):
    try:
        embedding_matrix[row] = model.wv[hold]
    except KeyError:
        # Only an unknown token is expected here; any other error (wrong
        # vector size, wrong `model` object, ...) should surface instead of
        # being silently replaced by zeros.
        continue
embedding_matrix.shape

(198, 20)

# Use hold embedding as initial weights for nn.Embedding

In [41]:
torch.manual_seed(1)


class BiLSTM(nn.Module):
    """Bidirectional LSTM grade classifier with pretrained hold embeddings.

    A problem is a 1-D ``LongTensor`` of hold indices. The holds are embedded
    (starting from pretrained vectors), run through a single-layer
    bidirectional LSTM (``hidden_dim // 2`` units per direction), and the
    final forward and backward hidden states are summed and projected to
    ``num_classes`` logits.

    Args:
        vocab_size: Number of distinct hold indices (rows of the embedding).
        embedding_dim: Size of each hold embedding.
        hidden_dim: Total LSTM width; each direction gets ``hidden_dim // 2``.
        num_classes: Number of grade classes to score (19 by default).
        pretrained_embeddings: Array-like of shape
            ``(vocab_size, embedding_dim)`` used as the initial embedding
            weights. When omitted, the notebook-level ``embedding_matrix``
            is used.

    Raises:
        ValueError: If the pretrained matrix does not have shape
            ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=19,
                 pretrained_embeddings=None):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.num_classes = num_classes

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)

        # Use the embedding matrix as the initial weights of nn.Embedding.
        if pretrained_embeddings is None:
            # Backward-compatible default: the matrix built from Word2Vec.
            pretrained_embeddings = embedding_matrix
        weights = torch.as_tensor(np.asarray(pretrained_embeddings),
                                  dtype=self.word_embeds.weight.dtype)
        if weights.shape != self.word_embeds.weight.shape:
            raise ValueError(
                "pretrained embeddings have shape {}, expected {}".format(
                    tuple(weights.shape), tuple(self.word_embeds.weight.shape)))
        with torch.no_grad():
            self.word_embeds.weight.copy_(weights)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # The two directions are summed (not concatenated) before this layer,
        # hence the input width of hidden_dim // 2.
        self.hidden2tag = nn.Linear(hidden_dim // 2, num_classes)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Draw a fresh random (h_0, c_0) pair on the model's own device.

        The device is read from the embedding weights, so the class does not
        depend on a notebook-level ``device`` variable being defined first.
        """
        # NOTE(review): the random initial state makes evaluation
        # non-deterministic; zeros would be the conventional choice.
        target = self.word_embeds.weight.device
        shape = (2, 1, self.hidden_dim // 2)  # (directions, batch=1, units)
        return (torch.randn(*shape).to(target),
                torch.randn(*shape).to(target))

    def _get_lstm_features(self, sentence):
        """Return logits of shape (1, num_classes) for one hold sequence."""
        if len(sentence) == 0:
            raise ValueError("Cannot classify an empty hold sequence.")

        self.hidden = self.init_hidden()
        # nn.LSTM expects (seq_len, batch, features); the batch size is 1.
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        _, (h_n, _) = self.lstm(embeds, self.hidden)

        # h_n[-2] is the last forward state, h_n[-1] the last backward state.
        fbhn = h_n[-2, :, :] + h_n[-1, :, :]

        return self.hidden2tag(fbhn)

    def forward(self, sentence):
        """Score a 1-D LongTensor of hold indices; returns (1, num_classes)."""
        return self._get_lstm_features(sentence)

In [42]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One embedding row per hold index (11*18 of them), 20-dimensional embeddings,
# and 50 LSTM units in total.
model = BiLSTM(vocab_size=11*18, embedding_dim=20, hidden_dim=50).to(device)

# Plain SGD with a small L2 penalty on the weights.
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    weight_decay=1e-4,
)

In [43]:
# Each epoch will take about 1-2 minutes

criterion = nn.CrossEntropyLoss()
for epoch in range(20):
    time1 = datetime.datetime.now()

    # ---- Training pass: one optimizer step per problem (batch size 1) ----
    train_loss = 0
    train_correct = 0
    model.train()
    for idxs, grade in zip(X_train, y_train):
        # PyTorch accumulates gradients, so clear them before each instance.
        model.zero_grad()

        # Turn the problem and its label into tensors on the model's device.
        problem_in = torch.tensor(idxs, dtype=torch.long).to(device)
        grade = torch.tensor([grade], dtype=torch.long).to(device)

        # Forward pass and accuracy bookkeeping.
        p = model(problem_in)
        _, pred = torch.max(p, 1)
        if pred == grade:
            train_correct += 1

        # Loss, gradients, parameter update.
        loss = criterion(p, grade)
        loss.backward()
        optimizer.step()

        # Losses are summed (not averaged) over the problems of the epoch.
        train_loss += loss.item()

    # ---- Validation pass: no parameter updates, so skip autograd tracking ----
    model.eval()
    val_loss = 0
    val_correct = 0
    with torch.no_grad():
        for idxs, grade in zip(X_val, y_val):
            problem_in = torch.tensor(idxs, dtype=torch.long).to(device)
            grade = torch.tensor([grade], dtype=torch.long).to(device)
            p = model(problem_in)
            _, pred = torch.max(p, 1)
            if pred == grade:
                val_correct += 1
            loss = criterion(p, grade)
            val_loss += loss.item()

    time2 = datetime.datetime.now()

    print("Epoch:%d, Training loss: %.2f, Training acc: %.2f, val loss: %.2f, val acc: %.2f, time: %.2fs" %(epoch+1, train_loss, train_correct/len(X_train), val_loss, val_correct/len(X_val), (time2-time1).total_seconds()))


Epoch:1, Training loss: 18888.57, Training acc: 0.29, val loss: 5879.84, val acc: 0.29, time: 48.39s
Epoch:2, Training loss: 17425.93, Training acc: 0.31, val loss: 5753.45, val acc: 0.30, time: 48.70s
Epoch:3, Training loss: 17040.05, Training acc: 0.32, val loss: 5704.93, val acc: 0.30, time: 48.17s
Epoch:4, Training loss: 16864.96, Training acc: 0.32, val loss: 5662.48, val acc: 0.31, time: 49.01s
Epoch:5, Training loss: 16708.41, Training acc: 0.33, val loss: 5655.26, val acc: 0.31, time: 50.81s
Epoch:6, Training loss: 16618.48, Training acc: 0.33, val loss: 5641.88, val acc: 0.31, time: 49.66s
Epoch:7, Training loss: 16519.97, Training acc: 0.33, val loss: 5629.88, val acc: 0.31, time: 48.70s
Epoch:8, Training loss: 16420.18, Training acc: 0.34, val loss: 5627.28, val acc: 0.31, time: 48.63s
Epoch:9, Training loss: 16336.50, Training acc: 0.34, val loss: 5613.63, val acc: 0.31, time: 50.24s
Epoch:10, Training loss: 16228.78, Training acc: 0.34, val loss: 5598.23, val acc: 0.31, ti