In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from collections import defaultdict
from TorchCRF import CRF


# Hyper-parameters shared by the data pipeline, the model and the training loop
MAX_SEQUENCE_LENGTH = 25  # Adjust as needed
EMBEDDING_DIM = 100
VOCAB_SIZE = 8000
HIDDEN_DIM = 64
BATCH_SIZE = 16
EPOCHS = 10

# Tag inventory (BIO scheme over Task / Date / Time) and the two lookup tables
# that translate between tag strings and the integer class ids fed to the model.
LABELS = [
    'O',
    'B-Task', 'I-Task',
    'B-Date', 'I-Date',
    'B-Time', 'I-Time',
]
NUM_CLASSES = len(LABELS)
label2idx = dict(zip(LABELS, range(NUM_CLASSES)))  # tag string -> class id
idx2label = dict(enumerate(LABELS))                # class id -> tag string

In [2]:
# Load the annotated corpus. Later cells read two columns from it:
# 'Task' (the raw sentence) and 'Label' (a comma-separated tag string).
df = pd.read_csv('NER_Data.csv')
# Preview the first ten rows as the cell output.
df.head(10)

Unnamed: 0,Task,Label
0,I need to check a monitor for the team meeting...,"O,O,O,B-Task,I-Task,I-Task,I-Task,O,B-Task,I-T..."
1,pick up Grandpa's medical prescriptions on jan...,"B-Task,I-Task,I-Task,I-Task,I-Task,O,B-Date,I-..."
2,I need to learn a dishes for the project timel...,"O,O,O,B-Task,I-Task,I-Task,I-Task,O,B-Task,I-T..."
3,mow the report and mow it by Sunday at twelve am,"B-Task,O,B-Task,I-Task,I-Task,I-Task,O,B-Date,..."
4,Don't forget to test the protest tickets by to...,"O,O,O,B-Task,O,B-Task,I-Task,O,B-Date,O,B-Time"
5,Remember to watch with Mom about the panel dis...,"O,O,B-Task,I-Task,I-Task,I-Task,I-Task,I-Task,..."
6,book the ruler before 6 am tomorrow and then book,"B-Task,I-Task,I-Task,O,B-Time,I-Time,B-Date,O,O,O"
7,reserve the monthly performance review by the ...,"B-Task,O,B-Task,I-Task,I-Task,O,O,O,O,B-Date,I..."
8,prepare a table at Daniel's for Sunday at 1:50 pm,"B-Task,O,B-Task,I-Task,I-Task,O,B-Date,O,B-Tim..."
9,finalize haircut tickets tomorrow at 11:20 AM.,"B-Task,I-Task,I-Task,B-Date,O,B-Time,I-Time"


In [3]:
# Example training data
task_examples = df['Task'].astype(str)
print(task_examples[:3])

# Split each comma-separated tag string into a list of tags; surrounding
# whitespace is dropped so that " B-Task" is not silently treated as unknown.
y_train = df['Label'].apply(lambda x: [label.strip() for label in x.split(",")])

# Tokenizer: lower-cased word -> integer id.
#   0 = padding, 1 = unknown word, real words start at 2.
# The vocabulary is sorted so the ids are identical on every kernel restart
# (iterating a bare set of strings is not order-stable between Python processes).
tokenizer = defaultdict(lambda: 1)  # Unknown words map to index 1
vocabulary = sorted({word.lower() for example in task_examples for word in example.split()})
tokenizer.update({word: idx + 2 for idx, word in enumerate(vocabulary)})  # Start from index 2

# Every id must be a valid row of the embedding table built from VOCAB_SIZE.
assert len(vocabulary) + 2 <= VOCAB_SIZE, (
    f"vocabulary needs {len(vocabulary) + 2} ids but VOCAB_SIZE is {VOCAB_SIZE}"
)

# Process data
# Words are lower-cased before lookup so training sees the same ids that
# predict() produces; looking up the raw-case word would map every capitalised
# word to the unknown id and insert it into the defaultdict as a side effect.
X_train = [[tokenizer[word.lower()] for word in example.split()] for example in task_examples]
# Pad with 0 up to MAX_SEQUENCE_LENGTH and truncate anything longer, so every
# row has the same length and the lists can be turned into a rectangular tensor.
X_train_padded = [(seq + [0] * (MAX_SEQUENCE_LENGTH - len(seq)))[:MAX_SEQUENCE_LENGTH] for seq in X_train]
# Tags that are not in label2idx fall back to class 0 ('O').
y_train_indices = [[label2idx.get(label, 0) for label in sent] for sent in y_train]
# NOTE(review): the number of tags per row is assumed to equal the number of
# words in the matching sentence; this is not checked here.
y_train_padded = [(seq + [0] * (MAX_SEQUENCE_LENGTH - len(seq)))[:MAX_SEQUENCE_LENGTH] for seq in y_train_indices]

0    I need to check a monitor for the team meeting...
1    pick up Grandpa's medical prescriptions on jan...
2    I need to learn a dishes for the project timel...
Name: Task, dtype: object


In [4]:
# After creating the tokenizer
import pickle

# Save tokenizer as regular dictionary
# (the defaultdict's lambda default factory cannot be pickled, so only the
# word -> id entries are stored; whoever reloads this file gets a plain dict
# with no automatic fallback for unknown words).
with open('NERtokenizer.pkl', 'wb') as f:
    pickle.dump(dict(tokenizer), f)

# Save label mappings
# Both directions are stored under the keys 'label2idx' and 'idx2label'.
with open('label_mappings.pkl', 'wb') as f:
    pickle.dump({'label2idx': label2idx, 'idx2label': idx2label}, f)

In [5]:
# Dataset wrapper around the padded id sequences
class TaskDataset(Dataset):
    """Pairs of equally indexed token-id and tag-id rows, stored as long tensors.

    Args:
        X: Nested sequence of token ids (one row per sample).
        y: Nested sequence of tag ids (one row per sample).
    """

    def __init__(self, X, y):
        # Convert both inputs the same way; each must be rectangular for
        # torch.tensor to accept it.
        self.X, self.y = (torch.tensor(rows, dtype=torch.long) for rows in (X, y))

    def __len__(self):
        """Number of samples, taken from the first dimension of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(token_ids, tag_ids)`` pair stored at position ``idx``."""
        token_ids = self.X[idx]
        tag_ids = self.y[idx]
        return token_ids, tag_ids

# Wrap the padded token/tag id lists in a Dataset and serve them in shuffled
# mini-batches of BATCH_SIZE samples.
dataset = TaskDataset(X_train_padded, y_train_padded)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [6]:
# Model definition
class BiLSTM_NER(nn.Module):
    """Sequence tagger: embedding -> bidirectional LSTM -> dropout -> linear -> CRF.

    Args:
        vocab_size: Number of rows in the embedding table (ids must be < this).
        embedding_dim: Size of each word embedding.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of tag classes scored by the linear layer and the CRF.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        # Token id 0 is the padding id; its embedding is kept at zero.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.25)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)  # BiLSTM doubles hidden size
        self.crf = CRF(num_classes)

    def forward(self, x, tags=None, mask=None):
        """Compute the training loss or decode the best tag sequence.

        Args:
            x: Long tensor of token ids, batch first.
            tags: Optional long tensor of gold tag ids with the same layout as
                ``x``. When given, the negated CRF score is returned as the loss.
            mask: Optional boolean tensor, True at real tokens and False at
                padding. When omitted it is derived from ``x`` by comparing
                against the embedding's padding id.

        Returns:
            With ``tags``: the negated output of the CRF layer (presumably one
            log-likelihood per sequence — confirm against the TorchCRF docs).
            Without ``tags``: the result of ``crf.viterbi_decode``.
        """
        if mask is None:
            # The CRF layer is always handed a mask; build the same
            # "not padding" mask the rest of the notebook uses instead of
            # forwarding None.
            mask = x != self.embedding.padding_idx

        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.dropout(x)
        emissions = self.fc(x)  # per-token scores for every tag class

        if tags is not None:  # Training
            loss = -self.crf(emissions, tags, mask=mask)
            return loss
        else:  # Prediction
            return self.crf.viterbi_decode(emissions, mask=mask)

In [7]:
# Instantiate the tagger from the notebook-level hyper-parameters and create
# the Adam optimizer over all of its parameters. train_model() picks this
# `optimizer` up as a global, so this cell must run before training.
model = BiLSTM_NER(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [8]:
def train_model(model, dataloader, epochs):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Prints the average batch loss after every epoch and a final
    "Training complete!" line.

    Args:
        model: Module called as ``model(X_batch, y_batch, mask)`` that returns
            the loss, either as a scalar or as a tensor that is averaged here.
        dataloader: Iterable of ``(X_batch, y_batch)`` tensor pairs. Token id 0
            in ``X_batch`` is treated as padding and masked out.
        epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` yields no batches during an epoch.

    Note:
        Parameter updates go through the notebook-level ``optimizer`` global,
        which must already exist and be bound to ``model``'s parameters.
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches while iterating instead of calling len(dataloader), so
        # loaders without a length work and an empty loader is detected.
        num_batches = 0
        for X_batch, y_batch in dataloader:
            mask = (X_batch != 0)  # Mask for padded tokens; True where not padded
            optimizer.zero_grad()
            loss = model(X_batch, y_batch, mask)  # CRF loss
            # Ensure loss is a scalar
            if loss.dim() > 0:  # If loss is a tensor (e.g., one per sequence)
                loss = loss.mean()  # Reduce to scalar by averaging over batch
            loss.backward()  # Backpropagate
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("dataloader yielded no batches; nothing to train on")
        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}")
    print("Training complete!")

# Run the full training loop; the per-epoch average loss is printed below.
train_model(model, dataloader, EPOCHS)

Epoch 1/10, Avg Loss: 2.0112
Epoch 2/10, Avg Loss: 0.0216
Epoch 3/10, Avg Loss: 0.0073
Epoch 4/10, Avg Loss: 0.0037
Epoch 5/10, Avg Loss: 0.0021
Epoch 6/10, Avg Loss: 0.0013
Epoch 7/10, Avg Loss: 0.0008
Epoch 8/10, Avg Loss: 0.0006
Epoch 9/10, Avg Loss: 0.0004
Epoch 10/10, Avg Loss: 0.0003
Training complete!


In [9]:
def predict(sentence):
    """Tag every whitespace-separated word of ``sentence``.

    Uses the notebook-level ``model``, ``tokenizer``, ``idx2label`` and
    ``MAX_SEQUENCE_LENGTH``.

    Args:
        sentence: Raw input text. Words are lower-cased before lookup.

    Returns:
        A list with one tag string per word, in order. An empty or
        whitespace-only sentence yields an empty list.
    """
    words = sentence.split()
    if not words:
        # Nothing to tag; this also avoids handing the CRF an all-False mask.
        return []

    model.eval()
    # .get() rather than indexing: it never inserts unseen words into a
    # defaultdict tokenizer, and it also works with the plain dict that is
    # stored in NERtokenizer.pkl. Unknown words map to id 1.
    unknown_idx = 1
    tokens = [tokenizer.get(word.lower(), unknown_idx) for word in words]
    # Right-pad with 0 up to MAX_SEQUENCE_LENGTH (longer inputs are left as is).
    padded = tokens + [0] * (MAX_SEQUENCE_LENGTH - len(tokens))
    input_tensor = torch.tensor([padded], dtype=torch.long)
    mask = (input_tensor != 0)  # True at real tokens, False at padding
    with torch.no_grad():
        preds = model(input_tensor, mask=mask)[0]  # CRF decode returns list
    # Keep only the positions that correspond to real words.
    return [idx2label[idx] for idx in preds[:len(tokens)]]

In [10]:
# Test predictions
# Three hand-written sentences used as a quick smoke test of the trained model.
test_sentences = [
    "practice presentation slides for the client pitch on June 15th at two pm",
    "on Oct 22nd at 17 Review the project timeline document",
    "Book a rideshare for the airport which is on Thursday by tomorrow at 20:00"
]
# Print each sentence followed by its predicted tag sequence (one tag per word).
for sentence in test_sentences:
    pred = predict(sentence)
    print(f"\nSentence: {sentence}")
    print(f"Predicted: {pred}")


Sentence: practice presentation slides for the client pitch on June 15th at two pm
Predicted: ['O', 'B-Task', 'I-Task', 'I-Task', 'O', 'B-Task', 'I-Task', 'O', 'B-Date', 'I-Date', 'O', 'B-Time', 'I-Time']

Sentence: on Oct 22nd at 17 Review the project timeline document
Predicted: ['O', 'B-Date', 'I-Date', 'O', 'B-Time', 'B-Task', 'O', 'B-Task', 'I-Task', 'I-Task']

Sentence: Book a rideshare for the airport which is on Thursday by tomorrow at 20:00
Predicted: ['B-Task', 'I-Task', 'I-Task', 'I-Task', 'O', 'B-Task', 'O', 'O', 'O', 'O', 'O', 'B-Date', 'O', 'B-Time']


In [11]:
# Check CRF transition matrix after training
# The printed parameter is NUM_CLASSES x NUM_CLASSES (7 x 7 in the output below).
# NOTE(review): which axis is "from" and which is "to" is not visible here —
# confirm against the TorchCRF documentation before interpreting the values.
print(model.crf.trans_matrix)

Parameter containing:
tensor([[ 0.0182,  0.1741, -0.2277,  0.0909, -0.2753,  0.2018, -0.0731],
        [-0.0271, -0.3580,  0.1660,  0.0556, -0.1402, -0.0040, -0.0111],
        [ 0.1516, -0.1335,  0.0953, -0.0712, -0.1681, -0.0413,  0.0118],
        [-0.0988, -0.1597, -0.1339, -0.1748,  0.2863, -0.1039, -0.0546],
        [ 0.2485, -0.0532, -0.1223, -0.2037, -0.0126, -0.1946, -0.0937],
        [-0.2029,  0.0778, -0.0674, -0.1996, -0.1476, -0.2037,  0.1267],
        [ 0.0148, -0.1261, -0.0482,  0.0389, -0.1065,  0.0317, -0.0370]],
       requires_grad=True)


In [12]:
# Persist only the learned weights (state dict). Reloading them requires
# rebuilding BiLSTM_NER with the same constructor arguments first.
torch.save(model.state_dict(), 'NERModel.pth')

In [25]:
import dateparser as dp

# Scratch check: parse a time-only string and print the date part of the result.
# The recorded output is a full calendar date, presumably the day the cell was
# run — confirm.
# NOTE(review): dp.parse presumably returns None when it cannot parse the
# input, in which case .date() would raise AttributeError — confirm and guard
# before reusing this on user input.
print(dp.parse("19:00", languages=['en'], settings={'DATE_ORDER': 'DMY', 'PREFER_DATES_FROM': 'future'}).date())

2025-04-03
