# Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import csv
import torch
from torch import nn
from d2l import torch as d2l
import torch.optim as optim
import torch.nn.functional as F

# Load Dataset

In [2]:
kaggle_data = pd.read_json('train.json')
kaggle_data.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [3]:
# Split the first 90% of the data as the train set
test_size = round(len(kaggle_data)*0.1) 
train_size = len(kaggle_data) - test_size

# train_set, test_set = train_test_split(kaggle_data, test_size=len(kaggle_data)-train_size, random_state = False)
train_set = kaggle_data.iloc[:train_size].copy()
test_set = kaggle_data.iloc[train_size:].copy()

test_size

681

In [4]:
train_set

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."
...,...,...,...,...,...
6121,21199,Reflection- Storytelling\n\nCHALLENGE AND SELE...,"[Reflection-, Storytelling, \n\n, CHALLENGE, A...","[True, False, False, True, True, False, False,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6122,21203,Example Reflection - Learning Launch\n\nChalle...,"[Example, Reflection, -, Learning, Launch, \n\...","[True, True, True, True, False, False, False, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6123,21204,Reflection - Storytelling\n\nChallenge:​ Worki...,"[Reflection, -, Storytelling, \n\n, Challenge:...","[True, True, False, False, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
6124,21205,Challenge and choice\n\nI worked in a packagin...,"[Challenge, and, choice, \n\n, I, worked, in, ...","[True, True, False, False, True, True, True, T...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [5]:
essays = kaggle_data['full_text'].tolist()
tokens = kaggle_data['tokens'].tolist()

# Word Embedding

In [6]:
glove_embeddings = {}
with open('glove_model/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glove_embeddings[word] = vector

In [7]:
embedded_dataset = []
for row in kaggle_data["tokens"]:
    embedded_essay = []
    for token in row:
        if token in glove_embeddings:
            embedded_essay.append(glove_embeddings[token])
        else:
            # Handle out-of-vocabulary tokens
            # For example, initialize their embedding randomly or use a special token
            embedded_essay.append(np.random.randn(200))  # 100 is the embedding dimension
    
    embedded_dataset.append(embedded_essay)

# Assign the embedded essays to each document
kaggle_data["embedded_text"] = embedded_dataset
kaggle_data.head(1)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,embedded_text
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...","[[0.07101683107229313, -0.6280221442458723, 0...."


In [8]:
embedded_dataset[0][0]

array([ 7.10168311e-02, -6.28022144e-01,  1.73851493e-01,  1.77204999e+00,
        5.63331059e-01, -1.00989706e+00,  2.29561850e+00, -1.95845441e-01,
       -6.87177654e-01,  1.28750875e+00, -1.61924013e+00, -1.68458653e+00,
       -5.73125371e-01, -7.19558884e-03,  5.05147699e-01, -1.93316333e+00,
        9.75283636e-01,  3.94675691e-02,  1.42435607e+00,  1.20307287e-01,
        9.29570746e-01, -4.63360819e-01, -2.63027792e+00,  1.29838464e+00,
        7.55092376e-01,  8.41792363e-01,  6.57323745e-01, -5.15819643e-01,
       -1.94542551e+00, -1.23025419e+00,  4.14002472e-01, -2.05754284e+00,
        1.02384414e+00, -6.08672009e-01,  9.53094709e-01, -1.13152883e+00,
        2.96452509e-01,  1.70068669e+00,  2.59738609e-01,  1.07972255e+00,
        1.14373777e+00,  4.23034957e-01, -4.90318705e-01, -1.16892405e+00,
       -1.55665573e+00, -6.41194070e-01, -1.30618059e+00,  2.25097887e-01,
        1.55961541e+00,  2.48917644e-01,  1.78605317e+00, -1.00290539e+00,
       -1.25257213e+00,  

### Sanity check to ensure that the tokens are the same length as the embedded representations

Here, we check to make sure that number of tokens and labels for each document is equal to the number of embedded tokens.

In [9]:
# Number of tokens in the first document
print(len(kaggle_data.iloc[1]['tokens']))

# Number of embeded tokens in the first document
print(len(kaggle_data.iloc[0]['embedded_text']))

# Number of labels in the first document
print(len(kaggle_data.iloc[0]['labels']))

563
753
753


# Define The Data Class

### Batching the training set

We then batch the training data set into batches of size 64. Batching data allows the torch framework 
to take advantage of  parralelization

In [10]:
# batch_size = 64
# batch_dataloader = DataLoader(kaggle_data, batch_size=batch_size, shuffle=True)
# batch_dataloader

### Instantiate our BiLSTM and its variables

In [11]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


training_data = []

for index, row in train_set.iterrows():
    tokens = row['tokens']
    tags = row['labels']
    training_data.append((tokens, tags))
word_to_ix = {}
# For each words-list (sentence) and tags-list in each tuple of training_data
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:  # word has not been assigned an index yet
            word_to_ix[word] = len(word_to_ix)  # Assign each word with a unique index
print(len(word_to_ix))

tag_to_ix = {
    "I-URL_PERSONAL": 0,
    "I-ID_NUM": 1,
    "B-STREET_ADDRESS": 2,
    "B-USERNAME": 3,
    "B-PHONE_NUM": 4,
    "I-PHONE_NUM": 5,
    "I-STREET_ADDRESS": 6,
    "B-EMAIL": 7,
    "B-ID_NUM": 8,
    "B-URL_PERSONAL": 9,
    "I-NAME_STUDENT": 10,
    "B-NAME_STUDENT": 11,
    "O": 12
}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 200
HIDDEN_DIM = 8

51162


In [12]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [13]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)
    # print(tag_scores)

for epoch in range(1):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_ix)
    tag_scores = model(inputs)

NameError: name 'scores_tensor' is not defined

In [14]:
tag_scores

tensor([[-9.2968e+00, -9.1190e+00, -9.6317e+00,  ..., -9.0901e+00,
         -8.8500e+00, -1.3543e-03],
        [-9.0199e+00, -8.9182e+00, -9.1781e+00,  ..., -9.1425e+00,
         -8.6935e+00, -1.5050e-03],
        [-9.4414e+00, -9.5800e+00, -9.3032e+00,  ..., -9.4669e+00,
         -9.0670e+00, -9.6108e-04],
        ...,
        [-9.0629e+00, -8.8353e+00, -9.3428e+00,  ..., -8.9915e+00,
         -8.7519e+00, -1.6096e-03],
        [-8.3668e+00, -7.8202e+00, -8.6253e+00,  ..., -8.3537e+00,
         -7.9248e+00, -3.5752e-03],
        [-9.9570e+00, -1.0078e+01, -1.0163e+01,  ..., -1.0016e+01,
         -1.0143e+01, -5.3117e-04]])

In [15]:
    max_indices = torch.argmax(tag_scores, dim=1)

    # Print the indices
    print(max_indices)

tensor([12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
        12, 12, 12, 12, 12, 12, 12, 12, 