In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from collections import defaultdict
from TorchCRF import CRF


# Configuration: every value a reader might want to tune lives in this cell.

# Reproducibility: weight initialisation, dropout and DataLoader shuffling all
# draw from torch's global RNG, so seed it (and NumPy's) before anything else
# runs. Without this, every Restart & Run All produces a different model.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Model / training hyper-parameters
MAX_SEQUENCE_LENGTH = 25  # Sequences are padded to this many tokens; adjust as needed
EMBEDDING_DIM = 100
VOCAB_SIZE = 8000         # Number of rows in the embedding table (includes pad=0 and unknown=1)
HIDDEN_DIM = 128
BATCH_SIZE = 15
EPOCHS = 10

# Labels and mapping (BIO scheme over Task / Date / Time entities).
# The position of a label in LABELS is its integer class id.
LABELS = ['O', 'B-Task', 'I-Task', 'B-Date', 'I-Date', 'B-Time', 'I-Time']
NUM_CLASSES = len(LABELS)
label2idx = {label: idx for idx, label in enumerate(LABELS)}
idx2label = {idx: label for label, idx in label2idx.items()}

In [43]:
# Read the annotated corpus. The preview below shows one sentence per row in
# 'Task' and its comma-separated BIO tags in 'Label'.
df = pd.read_csv(filepath_or_buffer='NER_Data.csv')
df.head(n=10)

Unnamed: 0,Task,Label
0,finalize Dror on her protest which is on the 1...,"B-Task,I-Task,O,O,B-Task,O,O,B-Date,I-Date,I-D..."
1,organize a taxi for the product launch on Wedn...,"B-Task,O,B-Task,I-Task,O,B-Task,I-Task,O,O,O,B..."
2,book the monthly performance review by the dea...,"B-Task,O,B-Task,I-Task,I-Task,O,O,O,O,B-Date,I..."
3,attend lyrics for the wedding on December 26th...,"B-Task,I-Task,I-Task,O,B-Task,O,B-Date,I-Date,..."
4,test Lia's observance gift before July 21st,"B-Task,I-Task,I-Task,I-Task,O,B-Date,I-Date"
5,plan tickets for the ritual on Jul 26th at 3:1...,"B-Task,I-Task,I-Task,O,B-Task,O,B-Date,I-Date,..."
6,Remind me to learn the math quiz report before...,"O,O,O,B-Task,O,B-Task,I-Task,I-Task,O,B-Date,O..."
7,I need to study chapter 3 of the biology textb...,"O,O,O,B-Task,I-Task,I-Task,O,O,O,O,O,B-Date"
8,reserve the keyboard before seven am tomorrow ...,"B-Task,I-Task,I-Task,O,B-Time,I-Time,B-Date,O,O,O"
9,deal with the monthly expense summary by the d...,"B-Task,I-Task,O,B-Task,I-Task,I-Task,O,O,O,O,B..."


In [44]:
# Turn the raw frame into fixed-length id sequences for the model.
task_examples = df['Task'].astype(str)
print(task_examples[:3])

# 'Label' holds one comma-separated tag per whitespace token. Strip each tag so
# that a stray space (e.g. "B-Task, I-Task") is not silently mapped to 'O' by
# the .get(label, 0) fallback further down.
# NOTE(review): token count and tag count per row are assumed to match; this
# cell does not verify it.
y_train = df['Label'].apply(lambda x: [label.strip() for label in x.split(",")])

# Vocabulary. Index 0 is reserved for padding and index 1 for unknown words, so
# real words start at 2. The words are sorted before enumeration because a bare
# set has no stable order across kernel restarts, which would make the
# word -> index mapping (and therefore any saved model) irreproducible.
vocabulary = sorted(set(" ".join(task_examples).split()))
if len(vocabulary) + 2 > VOCAB_SIZE:
    # Fail here with a clear message rather than with an index error inside
    # the embedding layer during training.
    raise ValueError(
        f"Vocabulary needs {len(vocabulary) + 2} embedding rows "
        f"but VOCAB_SIZE is {VOCAB_SIZE}; increase VOCAB_SIZE."
    )

# Tokenizer: known word -> index, anything else -> 1
tokenizer = defaultdict(lambda: 1)  # Unknown words map to index 1
tokenizer.update({word: idx + 2 for idx, word in enumerate(vocabulary)})

# Process data. Padding by a full MAX_SEQUENCE_LENGTH and then slicing both
# pads short sequences and truncates long ones, so every row has exactly
# MAX_SEQUENCE_LENGTH entries (a ragged list cannot be converted to a tensor).
X_train = [[tokenizer[word] for word in example.split()] for example in task_examples]
X_train_padded = [(seq + [0] * MAX_SEQUENCE_LENGTH)[:MAX_SEQUENCE_LENGTH] for seq in X_train]
y_train_indices = [[label2idx.get(label, 0) for label in sent] for sent in y_train]
y_train_padded = [(seq + [0] * MAX_SEQUENCE_LENGTH)[:MAX_SEQUENCE_LENGTH] for seq in y_train_indices]

0    finalize Dror on her protest which is on the 1...
1    organize a taxi for the product launch on Wedn...
2    book the monthly performance review by the dea...
Name: Task, dtype: object


In [45]:
# Dataset wrapper around the padded token-id / tag-id sequences
class TaskDataset(Dataset):
    """Index-aligned pairs of token-id and tag-id sequences.

    Args:
        X: nested sequence of token ids (one inner sequence per example).
        y: nested sequence of tag ids, aligned with ``X``.

    Both inputs are converted once, up front, to int64 tensors and stored as
    ``self.X`` and ``self.y``.
    """

    def __init__(self, X, y):
        token_ids = torch.tensor(X, dtype=torch.int64)
        tag_ids = torch.tensor(y, dtype=torch.int64)
        self.X = token_ids
        self.y = tag_ids

    def __len__(self):
        """Number of examples, taken from the first dimension of ``self.X``."""
        n_examples = len(self.X)
        return n_examples

    def __getitem__(self, idx):
        """Return the ``(tokens, tags)`` pair stored at position ``idx``."""
        tokens = self.X[idx]
        tags = self.y[idx]
        return tokens, tags

# Wrap the padded sequences in a Dataset and serve them in shuffled mini-batches.
dataset = TaskDataset(X=X_train_padded, y=y_train_padded)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

In [46]:
# Model definition
class BiLSTM_NER(nn.Module):
    """Sequence tagger: embedding -> BiLSTM -> dropout -> linear emissions -> CRF.

    Args:
        vocab_size: number of rows in the embedding table (id 0 is padding).
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_classes: number of tags scored by the linear layer and the CRF.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)  # BiLSTM doubles hidden size
        self.crf = CRF(num_classes)

    def forward(self, x, tags=None, mask=None):
        """Compute the CRF training loss or decode the best tag sequence.

        Args:
            x: integer token ids, batch-first (the LSTM is ``batch_first=True``).
            tags: gold tag ids aligned with ``x``. When given, the negated CRF
                score is returned; when omitted, tags are decoded instead.
            mask: boolean tensor, True at real (non-padding) positions. When
                omitted it is derived from ``x`` using the embedding's padding
                index, which is how the training loop and ``predict`` build it.

        Returns:
            With ``tags``: ``-self.crf(emissions, tags, mask=mask)``.
            NOTE(review): presumably one negative log-likelihood per sequence,
            not yet reduced to a scalar -- confirm against TorchCRF.
            Without ``tags``: the result of ``self.crf.viterbi_decode``.
        """
        if mask is None:
            # The signature allows mask=None, but the CRF layer needs a real
            # mask. Build it from the ids *before* x is replaced by embeddings.
            mask = x != self.embedding.padding_idx

        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.dropout(lstm_out)
        emissions = self.fc(lstm_out)

        if tags is not None:  # Training
            return -self.crf(emissions, tags, mask=mask)
        # Prediction
        return self.crf.viterbi_decode(emissions, mask=mask)

In [47]:
# Build the tagger from the configured sizes and attach an Adam optimizer to it.
model = BiLSTM_NER(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=NUM_CLASSES,
)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [48]:
def train_model(model, dataloader, epochs, optimizer=None):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Args:
        model: callable as ``model(X_batch, y_batch, mask)`` returning a loss
            tensor (scalar or one value per sequence).
        dataloader: iterable of ``(X_batch, y_batch)`` integer tensors in which
            token id 0 marks padding.
        epochs: number of passes over ``dataloader``.
        optimizer: optimizer that updates ``model``. When omitted, the
            notebook-level ``optimizer`` variable is used, which keeps
            existing three-argument calls working.

    Raises:
        NameError: if no optimizer is passed and no global ``optimizer`` exists.

    Prints the average batch loss after every epoch (``nan`` if the loader
    yielded no batches).
    """
    if optimizer is None:
        # Backward-compatible fallback to the global this function used to
        # read implicitly.
        optimizer = globals().get("optimizer")
        if optimizer is None:
            raise NameError(
                "train_model needs an optimizer: pass optimizer=... or define a global 'optimizer'"
            )

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0  # counted by hand so loaders without __len__ also work
        for X_batch, y_batch in dataloader:
            mask = (X_batch != 0)  # Mask for padded tokens; True where not padded
            optimizer.zero_grad()
            loss = model(X_batch, y_batch, mask)  # CRF loss
            # Ensure loss is a scalar
            if loss.dim() > 0:  # If loss is a tensor (e.g., one per sequence)
                loss = loss.mean()  # Reduce to scalar by averaging over batch
            loss.backward()  # Backpropagate
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Avoid ZeroDivisionError when the loader is empty.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}")
    print("Training complete!")

# Run the training loop on the full dataset for the configured number of epochs.
train_model(model=model, dataloader=dataloader, epochs=EPOCHS)

Epoch 1/10, Avg Loss: 1.5100
Epoch 2/10, Avg Loss: 0.0110
Epoch 3/10, Avg Loss: 0.0042
Epoch 4/10, Avg Loss: 0.0015
Epoch 5/10, Avg Loss: 0.0009
Epoch 6/10, Avg Loss: 0.0006
Epoch 7/10, Avg Loss: 0.0004
Epoch 8/10, Avg Loss: 0.0003
Epoch 9/10, Avg Loss: 0.0002
Epoch 10/10, Avg Loss: 0.0001
Training complete!


In [49]:
def predict(sentence):
    """Tag each whitespace-separated token of ``sentence`` with a BIO label.

    Uses the notebook-level ``model``, ``tokenizer``, ``idx2label`` and
    ``MAX_SEQUENCE_LENGTH``.

    Args:
        sentence: raw text; it is split on whitespace.

    Returns:
        A list of label strings, one per token. An empty or whitespace-only
        sentence returns ``[]`` without calling the model.
    """
    model.eval()
    # Use .get() rather than indexing: indexing the defaultdict tokenizer would
    # insert every unseen word into the vocabulary as a side effect of
    # inference. Index 1 is the unknown-word id.
    tokens = [tokenizer.get(word, 1) for word in sentence.split()]
    if not tokens:
        # Nothing to tag; an all-padding input would hand the CRF decoder an
        # all-False mask.
        return []

    # Right-pad with the padding id 0 (no-op when the sentence is already long enough).
    padded = tokens + [0] * (MAX_SEQUENCE_LENGTH - len(tokens))
    input_tensor = torch.tensor([padded], dtype=torch.long)
    mask = (input_tensor != 0)
    with torch.no_grad():
        preds = model(input_tensor, mask=mask)[0]  # CRF decode returns list
    # Keep only the positions that correspond to real tokens.
    return [idx2label[idx] for idx in preds[:len(tokens)]]

In [64]:
# Tag a few hand-written sentences and print each one next to its predicted labels.
test_sentences = [
    "Practice presentation slides for the client pitch on June 15th at two pm",
    "on Oct 22nd at 17 Review the project timeline document",
    "Book a rideshare for the airport which is on Thursday by tomorrow at 20:00"
]
# map() is lazy, so each sentence is predicted immediately before it is printed.
for sentence, pred in zip(test_sentences, map(predict, test_sentences)):
    print(f"\nSentence: {sentence}", f"Predicted: {pred}", sep="\n")


Sentence: Practice presentation slides for the client pitch on June 15th at two pm
Predicted: ['B-Task', 'B-Task', 'I-Task', 'I-Task', 'O', 'B-Task', 'I-Task', 'O', 'B-Date', 'I-Date', 'O', 'B-Time', 'I-Time']

Sentence: on Oct 22nd at 17 Review the project timeline document
Predicted: ['O', 'B-Date', 'I-Date', 'O', 'B-Time', 'I-Time', 'O', 'B-Task', 'I-Task', 'I-Task']

Sentence: Book a rideshare for the airport which is on Thursday by tomorrow at 20:00
Predicted: ['B-Task', 'O', 'B-Task', 'I-Task', 'O', 'B-Task', 'I-Task', 'O', 'O', 'O', 'O', 'B-Date', 'O', 'B-Time']


In [65]:
# Check CRF transition matrix after training.
# The printed parameter is 7x7: one row and one column per entry in LABELS,
# indexed in label2idx order.
# NOTE(review): whether rows are the "from" tag and columns the "to" tag is not
# visible in this notebook -- confirm against TorchCRF before interpreting.
print(model.crf.trans_matrix)

Parameter containing:
tensor([[-0.0427,  0.0094, -0.0598, -0.0056, -0.1978,  0.1632, -0.1330],
        [ 0.0079, -0.0740,  0.1055, -0.0958,  0.0435,  0.0277, -0.0970],
        [ 0.0749, -0.0800,  0.0173, -0.0834, -0.1235, -0.1250,  0.0029],
        [-0.0703, -0.0239, -0.0121, -0.1166,  0.1440,  0.0007, -0.0216],
        [ 0.0070, -0.1092, -0.0860, -0.1130, -0.0519,  0.0119,  0.0534],
        [-0.1515, -0.0805, -0.0239, -0.1142, -0.1024, -0.0490,  0.2186],
        [-0.0868, -0.0040, -0.0533,  0.1305,  0.0279, -0.0442, -0.0942]],
       requires_grad=True)
