# Import Libraries

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import csv
import torch
from torch import nn
from d2l import torch as d2l

# Load Dataset

In [2]:
kaggle_data = pd.read_json('train.json')
kaggle_data.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [3]:
# Split the first 90% of the data as the train set
test_size = round(len(kaggle_data)*0.1) 
train_size = len(kaggle_data) - test_size

# train_set, test_set = train_test_split(kaggle_data, test_size=len(kaggle_data)-train_size, random_state = False)
train_set = kaggle_data.iloc[:train_size].copy()
test_set = kaggle_data.iloc[train_size:].copy()

test_size

681

In [4]:
essays = kaggle_data['full_text'].tolist()
tokens = kaggle_data['tokens'].tolist()

# Word Embedding

In [5]:
glove_embeddings = {}
with open('glove_model/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.array(values[1:], dtype='float32')
        glove_embeddings[word] = vector

In [6]:
embedded_dataset = []
for row in kaggle_data["tokens"]:
    embedded_essay = []
    for token in row:
        if token in glove_embeddings:
            embedded_essay.append(glove_embeddings[token])
        else:
            # Handle out-of-vocabulary tokens
            # For example, initialize their embedding randomly or use a special token
            embedded_essay.append(np.random.randn(200))  # 100 is the embedding dimension
    embedded_dataset.append(embedded_essay)

In [7]:
embedded_dataset[0]

[array([-1.20084705e+00,  7.57399846e-01,  3.66765762e-01,  4.34886575e-02,
         9.44726944e-01,  1.13731499e+00,  1.74902711e+00,  1.98570033e+00,
        -1.02828351e+00,  1.25434593e-01, -1.52355155e-01, -8.23395525e-01,
         2.45672197e+00,  7.67175732e-01, -3.29470889e-01, -4.60046559e-01,
        -9.12810552e-01,  1.10602353e+00,  1.25699457e-01, -5.28250522e-01,
         5.55688890e-01,  1.37745255e+00,  2.53881208e-01, -8.42666911e-01,
         1.39756218e+00, -8.13217237e-01, -7.33756594e-02, -5.60851621e-03,
        -2.59008680e-01, -7.23769179e-01,  8.71575693e-01, -9.28264580e-01,
         1.03171606e+00,  2.07883159e-01, -8.61727887e-01,  1.50269144e-01,
         8.69201340e-01, -2.19222510e-01, -5.73733239e-01,  1.91024616e+00,
         6.77409154e-01, -1.07821913e+00, -4.17718773e-01, -5.43594675e-01,
        -1.05920840e+00, -8.52881621e-01, -9.05146795e-01,  6.22996983e-01,
         1.85257307e+00, -9.49452714e-01,  1.67392971e-01, -2.09104002e-01,
        -2.6

# Define The Data Class

In [8]:
class KaggleData(d2l.DataModule):
    """ Data for linear regressions """
    def __init__(self, features, targets, num_train, num_val, batch_size=32):
        super().__init__()
        self.save_hyperparameters()
        self.X = features
        self.y = targets

In [9]:
@d2l.add_to_class(KaggleData)  #@save
def get_dataloader(self, train):
    i = slice(0, self.num_train) if train else slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, i)

In [10]:
class BiRNNScratch(d2l.Module):
    def __init__(self, num_inputs, num_hiddens, sigma=0.01):
        super().__init__()
        self.save_hyperparameters()
        self.f_rnn = d2l.RNNScratch(num_inputs, num_hiddens, sigma)
        self.b_rnn = d2l.RNNScratch(num_inputs, num_hiddens, sigma)
        self.num_hiddens *= 2  # The output dimension will be doubled

In [11]:
@d2l.add_to_class(BiRNNScratch)
def forward(self, inputs, Hs=None):
    f_H, b_H = Hs if Hs is not None else (None, None)
    f_outputs, f_H = self.f_rnn(inputs, f_H)
    b_outputs, b_H = self.b_rnn(reversed(inputs), b_H)
    outputs = [torch.cat((f, b), -1) for f, b in zip(
        f_outputs, reversed(b_outputs))]
    return outputs, (f_H, b_H)

# BiLSTM

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class BiLSTMNER(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, num_classes, hidden_size):
        super(BiLSTMNER, self).__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, bidirectional=True)
        self.fc = nn.Linear(2*hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = embedded.permute(1, 0, 2)  # LSTM expects (seq_len, batch, features)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out.permute(1, 0, 2)  # Back to (batch, seq_len, features)
        logits = self.fc(lstm_out)
        return logits

# Example usage:
num_embeddings = 10000  # Example number of embeddings
embedding_dim = 100
num_classes = 5  # Number of classes (e.g., B-PER, I-PER, B-LOC, I-LOC, O)
hidden_size = 128
model = BiLSTMNER(num_embeddings, embedding_dim, num_classes, hidden_size)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop (assuming you have a DataLoader named train_loader)
for epoch in range(2):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, num_classes), labels.view(-1))
        loss.backward()
        optimizer.step() 

NameError: name 'train_loader' is not defined