In [65]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from collections import defaultdict
from TorchCRF import CRF


# Hyperparameters shared by preprocessing, the model and the training loop
MAX_SEQUENCE_LENGTH = 25  # length every sequence is padded to; adjust as needed
EMBEDDING_DIM = 100
VOCAB_SIZE = 8000
HIDDEN_DIM = 64
BATCH_SIZE = 16
EPOCHS = 10

# BIO tag inventory and the two lookup tables derived from it
LABELS = ['O', 'B-Task', 'I-Task', 'B-Date', 'I-Date', 'B-Time', 'I-Time']
NUM_CLASSES = len(LABELS)
idx2label = dict(enumerate(LABELS))  # position in LABELS -> tag string
label2idx = {label: idx for idx, label in idx2label.items()}  # tag string -> position

In [66]:
# Read the annotated sentences and preview the first ten rows
df = pd.read_csv(filepath_or_buffer='NER_Data.csv')
df.head(n=10)

Unnamed: 0,Task,Label
0,attend the keyboard before two pm today and th...,"B-Task,I-Task,I-Task,O,B-Time,I-Time,B-Date,O,O,O"
1,read carnival tickets today at 5:25 pm.,"B-Task,I-Task,I-Task,B-Date,O,B-Time,I-Time"
2,prepare a dentist appointment for next Thursda...,"B-Task,O,B-Task,I-Task,O,B-Date,I-Date,O,B-Tim..."
3,I have to plan the presentation for the client...,"O,O,O,B-Task,O,B-Task,I-Task,I-Task,I-Task,I-T..."
4,book mail confirmation today at 12:20,"B-Task,I-Task,I-Task,B-Date,O,B-Time"
5,deal with the monthly expense summary by the d...,"B-Task,I-Task,O,B-Task,I-Task,I-Task,O,O,O,O,B..."
6,test tickets for the carnival on Sep 31st at 1...,"B-Task,I-Task,I-Task,O,B-Task,O,B-Date,I-Date,..."
7,complete a new tooth brush by tomorrow at noon,"B-Task,I-Task,I-Task,I-Task,I-Task,O,B-Date,O,..."
8,I need to complete chapter eight of the biolog...,"O,O,O,B-Task,I-Task,I-Task,O,O,O,O,O,B-Date"
9,I'm having a exhibition with Shani at the cafe...,"O,O,O,B-Task,I-Task,I-Task,O,O,O,B-Date,O,B-Time"


In [67]:
# Example training data
task_examples = df['Task'].astype(str)
print(task_examples[:3])

# Split the comma-separated tag string. Whitespace around each tag is stripped so
# that a tag such as " I-Task" is not silently turned into 'O' by the
# label2idx.get(label, 0) fallback further down.
y_train = df['Label'].apply(lambda x: [label.strip() for label in x.split(",")])

# Tokenizer: id 0 is reserved for padding, id 1 for unknown words
tokenizer = defaultdict(lambda: 1)  # Unknown words map to index 1
# Sort the vocabulary before numbering it: iteration order of a set of strings
# is not stable across interpreter runs, so unsorted ids would differ after
# every kernel restart and results would not be reproducible.
vocabulary = sorted(set(" ".join(task_examples).split()))
tokenizer.update({word: idx + 2 for idx, word in enumerate(vocabulary)})  # Start from index 2


def pad_or_truncate(seq, length=MAX_SEQUENCE_LENGTH, pad_value=0):
    """Return a copy of `seq` cut or right-padded with `pad_value` to exactly `length` items."""
    clipped = list(seq[:length])
    return clipped + [pad_value] * (length - len(clipped))


# Process data
# Every row must end up with exactly MAX_SEQUENCE_LENGTH entries; rows that are
# longer are truncated, otherwise the nested lists are ragged and cannot be
# converted to a tensor.
X_train = [[tokenizer[word] for word in example.split()] for example in task_examples]
X_train_padded = [pad_or_truncate(seq) for seq in X_train]
y_train_indices = [[label2idx.get(label, 0) for label in sent] for sent in y_train]
y_train_padded = [pad_or_truncate(seq) for seq in y_train_indices]

0    attend the keyboard before two pm today and th...
1              read carnival tickets today at 5:25 pm.
2    prepare a dentist appointment for next Thursda...
Name: Task, dtype: object


In [68]:
# PyTorch Dataset class
class TaskDataset(Dataset):
    """Tensor-backed dataset pairing token-id sequences with tag-id sequences.

    Args:
        X: Nested sequence of token ids. All rows must have the same length so
            they can be converted into a single tensor.
        y: Nested sequence of tag ids, one row per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail here rather than with an indexing error in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of rows, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of (tokens, tags) pairs."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(token_ids, tag_ids)`` tensors stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the padded id sequences in a dataset and serve them as shuffled mini-batches
dataset = TaskDataset(X=X_train_padded, y=y_train_padded)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=BATCH_SIZE)

In [69]:
# Model definition
class BiLSTM_NER(nn.Module):
    """Sequence tagger: embedding -> bidirectional LSTM -> dropout -> linear -> CRF.

    Args:
        vocab_size: Number of rows in the embedding table; row 0 is the padding row.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Number of tags scored by the linear layer and the CRF.
        dropout: Dropout probability applied to the LSTM outputs.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, dropout=0.25):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)  # BiLSTM doubles hidden size
        self.crf = CRF(num_classes)

    def forward(self, x, tags=None, mask=None):
        """Compute the CRF training loss, or decode the best tag sequences.

        Args:
            x: Integer tensor of token ids (batch first).
            tags: Tag ids aligned with ``x``. When given, the loss is returned;
                when omitted, tag sequences are decoded instead.
            mask: Boolean tensor shaped like ``x``, True at positions to keep.
                When omitted, every position is treated as valid.

        Returns:
            The negated CRF output when ``tags`` is given, otherwise the result
            of the CRF's Viterbi decoding.
        """
        if mask is None:
            # The CRF layer needs an actual mask tensor; passing None through
            # would make the advertised default unusable.
            mask = torch.ones_like(x, dtype=torch.bool)

        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.dropout(x)
        emissions = self.fc(x)

        if tags is not None:  # Training
            loss = -self.crf(emissions, tags, mask=mask)
            return loss
        else:  # Prediction
            return self.crf.viterbi_decode(emissions, mask=mask)

In [70]:
# Instantiate model
# The embedding table must cover every id the tokenizer can emit. VOCAB_SIZE is
# kept as the minimum size, but the table grows when the data-derived vocabulary
# is larger; otherwise the first out-of-range token id would make the embedding
# lookup fail during training.
vocab_size = max(VOCAB_SIZE, max(tokenizer.values(), default=1) + 1)
model = BiLSTM_NER(vocab_size, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [71]:
def train_model(model, dataloader, epochs, optimizer=None):
    """Train `model` for `epochs` passes over `dataloader`, printing the mean loss per epoch.

    Args:
        model: Module called as ``model(X_batch, y_batch, mask)`` and returning the loss.
        dataloader: Iterable yielding ``(X_batch, y_batch)`` tensor pairs. Token id 0
            in ``X_batch`` is treated as padding when building the mask.
        epochs: Number of passes over ``dataloader``.
        optimizer: Optimizer to step. When omitted, the module-level ``optimizer``
            is used, which is what this function always relied on.

    Raises:
        NameError: If no optimizer is passed and no module-level ``optimizer`` exists.
    """
    if optimizer is None:
        # The parameter shadows the global of the same name, so look it up explicitly.
        try:
            optimizer = globals()["optimizer"]
        except KeyError:
            raise NameError(
                "train_model needs an optimizer: pass one or define a global 'optimizer'"
            ) from None

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0  # counted rather than len(dataloader) so an empty loader cannot divide by zero
        for X_batch, y_batch in dataloader:
            mask = (X_batch != 0)  # Mask for padded tokens; True where not padded
            optimizer.zero_grad()
            loss = model(X_batch, y_batch, mask)  # CRF loss
            # Ensure loss is a scalar
            if loss.dim() > 0:  # If loss is a tensor (e.g., one per sequence)
                loss = loss.mean()  # Reduce to scalar by averaging over batch
            loss.backward()  # Backpropagate
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}")
    print("Training complete!")

# Run the training loop on the prepared batches
train_model(model=model, dataloader=dataloader, epochs=EPOCHS)

Epoch 1/10, Avg Loss: 2.4767
Epoch 2/10, Avg Loss: 0.0321
Epoch 3/10, Avg Loss: 0.0108
Epoch 4/10, Avg Loss: 0.0053
Epoch 5/10, Avg Loss: 0.0031
Epoch 6/10, Avg Loss: 0.0019
Epoch 7/10, Avg Loss: 0.0013
Epoch 8/10, Avg Loss: 0.0009
Epoch 9/10, Avg Loss: 0.0007
Epoch 10/10, Avg Loss: 0.0006
Training complete!


In [72]:
def predict(sentence):
    """Tag every whitespace-separated token of `sentence` with the trained model.

    Args:
        sentence: Raw text; it is split on whitespace, exactly like the training data.

    Returns:
        A list with one label string per token, or an empty list when the
        sentence contains no tokens.
    """
    words = sentence.split()
    if not words:
        # Nothing to tag; skip running the model on a fully masked input.
        return []

    model.eval()
    # Use .get rather than tokenizer[word]: indexing the defaultdict would insert
    # every unseen word into the vocabulary as a side effect. 1 is the id the
    # tokenizer assigns to unknown words.
    tokens = [tokenizer.get(word, 1) for word in words]
    padded = tokens + [0] * (MAX_SEQUENCE_LENGTH - len(tokens))
    input_tensor = torch.tensor([padded], dtype=torch.long)
    mask = (input_tensor != 0)
    with torch.no_grad():
        preds = model(input_tensor, mask=mask)[0]  # CRF decode returns list
    return [idx2label[idx] for idx in preds[:len(tokens)]]

In [73]:
# Smoke-test the tagger on a few hand-written sentences
test_sentences = [
    "practice presentation slides for the client pitch on June 15th at two pm",
    "on Oct 22nd at 17 Review the project timeline document",
    "Book a rideshare for the airport which is on Thursday by tomorrow at 20:00",
]
# map() is lazy, so each sentence is still tagged right before it is printed
for sentence, pred in zip(test_sentences, map(predict, test_sentences)):
    print(f"\nSentence: {sentence}")
    print(f"Predicted: {pred}")


Sentence: practice presentation slides for the client pitch on June 15th at two pm
Predicted: ['B-Task', 'B-Task', 'I-Task', 'I-Task', 'O', 'B-Task', 'I-Task', 'O', 'B-Date', 'I-Date', 'O', 'B-Time', 'I-Time']

Sentence: on Oct 22nd at 17 Review the project timeline document
Predicted: ['O', 'B-Date', 'I-Date', 'O', 'B-Time', 'B-Task', 'O', 'B-Task', 'I-Task', 'I-Task']

Sentence: Book a rideshare for the airport which is on Thursday by tomorrow at 20:00
Predicted: ['B-Task', 'I-Task', 'I-Task', 'I-Task', 'O', 'B-Task', 'O', 'O', 'O', 'O', 'O', 'B-Date', 'O', 'B-Time']


In [74]:
# Show the CRF transition matrix as it stands after training
crf_transitions = model.crf.trans_matrix
print(crf_transitions)

Parameter containing:
tensor([[-0.0303,  0.1325, -0.1506,  0.0815, -0.2427,  0.1238, -0.2368],
        [-0.0589, -0.3052,  0.1377,  0.0316, -0.1144, -0.1309, -0.1067],
        [ 0.0556, -0.2759,  0.1901, -0.1957, -0.0267, -0.1743,  0.0105],
        [-0.0610, -0.1834, -0.0475, -0.3564,  0.1272, -0.1917, -0.1895],
        [ 0.1195, -0.1703, -0.0864, -0.0348,  0.0472, -0.1084, -0.0374],
        [-0.1965, -0.0495, -0.0035, -0.0940, -0.0237, -0.1229,  0.2966],
        [-0.1274, -0.0850, -0.0653,  0.2233, -0.0810, -0.0068, -0.2037]],
       requires_grad=True)
