In [86]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from collections import defaultdict
from TorchCRF import CRF


# --- Hyperparameters (tune here) ---
MAX_SEQUENCE_LENGTH = 25  # every sequence is padded to this many token ids
EMBEDDING_DIM = 100
VOCAB_SIZE = 8000
HIDDEN_DIM = 64
BATCH_SIZE = 16
EPOCHS = 10

# --- Tag set ---
# The position of a label in LABELS is its integer class id; 'O' is id 0.
LABELS = ['O', 'B-Task', 'I-Task', 'B-Date', 'I-Date', 'B-Time', 'I-Time']
NUM_CLASSES = len(LABELS)
label2idx = dict(zip(LABELS, range(NUM_CLASSES)))  # label string -> class id
idx2label = dict(enumerate(LABELS))                # class id -> label string

In [87]:
# Load the annotated sentences. The preview shows a 'Task' column (sentence text)
# and a 'Label' column (comma-separated tags).
df = pd.read_csv(filepath_or_buffer='NER_Data.csv')
df.head(n=10)

Unnamed: 0,Task,Label
0,tomorrow i have to test the trousers for the j...,"B-Date,O,O,O,B-Task,O,B-Task,I-Task,O,B-Task,I..."
1,I'm having a trip with Gal at the cafe tomorro...,"O,O,O,B-Task,I-Task,I-Task,O,O,O,B-Date,O,B-Time"
2,Remind me to attend the tech conference report...,"O,O,O,B-Task,O,B-Task,I-Task,I-Task,O,B-Date,O..."
3,Remind me to clean the board meeting report be...,"O,O,O,B-Task,O,B-Task,I-Task,I-Task,O,B-Date,O..."
4,review mail confirmation tomorrow at 9:10,"B-Task,I-Task,I-Task,B-Date,O,B-Time"
5,read gathering tickets today at 11:15 AM.,"B-Task,I-Task,I-Task,B-Date,O,B-Time,I-Time"
6,finish reading chapter 38 of physics project b...,"B-Task,I-Task,I-Task,I-Task,I-Task,I-Task,I-Ta..."
7,practice the report and practice it by Wednesd...,"B-Task,O,B-Task,I-Task,I-Task,I-Task,O,B-Date,..."
8,study the report and study it by Thursday at t...,"B-Task,O,B-Task,I-Task,I-Task,I-Task,O,B-Date,..."
9,buy Daniel about the trade show by next week,"B-Task,I-Task,I-Task,I-Task,I-Task,I-Task,O,B-..."


In [88]:
# Raw sentences, one string per row
task_examples = df['Task'].astype(str)
print(task_examples[:3])

# Per-sentence tag lists. Tags are stripped so that "O, B-Task" style spacing
# does not silently fall through to the default class below.
y_train = df['Label'].apply(lambda x: [label.strip() for label in x.split(",")])

# Vocabulary: id 0 is reserved for padding, id 1 for unknown words, real words
# start at 2. Words are lower-cased both here and at lookup time, so training
# sees the same id for a word that predict() (which lower-cases its input)
# will use. Sorting makes the word -> id assignment reproducible across runs
# instead of depending on set iteration order.
vocab = sorted({word.lower() for sentence in task_examples for word in sentence.split()})
assert len(vocab) + 2 <= VOCAB_SIZE, (
    f"vocabulary needs {len(vocab) + 2} ids but VOCAB_SIZE is {VOCAB_SIZE}"
)

tokenizer = defaultdict(lambda: 1)  # Unknown words map to index 1
tokenizer.update({word: idx for idx, word in enumerate(vocab, start=2)})

# Encode sentences with the same normalisation used to build the vocabulary
X_train = [[tokenizer[word.lower()] for word in example.split()] for example in task_examples]
# Unknown tags fall back to class 0 ('O')
y_train_indices = [[label2idx.get(label, 0) for label in sent] for sent in y_train]

# Pad with 0 and truncate to exactly MAX_SEQUENCE_LENGTH so every row has the
# same length (ragged rows cannot be converted to a tensor).
X_train_padded = [(seq + [0] * MAX_SEQUENCE_LENGTH)[:MAX_SEQUENCE_LENGTH] for seq in X_train]
y_train_padded = [(seq + [0] * MAX_SEQUENCE_LENGTH)[:MAX_SEQUENCE_LENGTH] for seq in y_train_indices]

0    tomorrow i have to test the trousers for the j...
1    I'm having a trip with Gal at the cafe tomorro...
2    Remind me to attend the tech conference report...
Name: Task, dtype: object


In [89]:
# Persist the preprocessing artifacts next to the notebook
import pickle

# The tokenizer is a defaultdict whose default factory is a lambda, which
# pickle cannot serialise; a plain-dict copy is stored instead. The copy has
# no default, so code that reloads it must handle unknown words itself.
with open('NERtokenizer.pkl', mode='wb') as tokenizer_file:
    pickle.dump(dict(tokenizer), tokenizer_file)

# Both directions of the label <-> id mapping go into one file
label_mappings = {'label2idx': label2idx, 'idx2label': idx2label}
with open('label_mappings.pkl', mode='wb') as mappings_file:
    pickle.dump(label_mappings, mappings_file)

In [90]:
# Dataset wrapper around the padded id sequences
class TaskDataset(Dataset):
    """Pairs of (token ids, label ids) held as ``torch.long`` tensors.

    Args:
        X: nested sequence of token ids; converted with ``torch.tensor``.
        y: nested sequence of label ids; converted with ``torch.tensor``.
    """

    def __init__(self, X, y):
        self.X, self.y = (torch.tensor(data, dtype=torch.long) for data in (X, y))

    def __len__(self):
        """Number of examples (length of the first dimension of ``X``)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(tokens, labels)`` pair stored at ``idx``."""
        tokens = self.X[idx]
        labels = self.y[idx]
        return tokens, labels

# Wrap the padded sequences and serve them in shuffled mini-batches
dataset = TaskDataset(X=X_train_padded, y=y_train_padded)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

In [91]:
# Model definition
class BiLSTM_NER(nn.Module):
    """Sequence tagger: embedding -> BiLSTM -> dropout -> linear emissions -> CRF.

    Args:
        vocab_size: number of rows in the embedding table; every token id fed
            to the model must be smaller than this.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_classes: number of tag classes scored by the linear layer and CRF.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super().__init__()
        # Token id 0 is the padding id; its embedding row is not updated in training
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.25)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)  # BiLSTM doubles hidden size
        self.crf = CRF(num_classes)

    def forward(self, x, tags=None, mask=None):
        """Return the CRF loss when ``tags`` is given, otherwise decoded paths.

        Args:
            x: batch-first tensor of token ids.
            tags: optional tensor of gold label ids; its presence selects
                training mode.
            mask: optional boolean tensor, True at real (non-padding)
                positions. When omitted it is derived from ``x`` using the
                embedding's padding id, so ``mask=None`` is never handed to
                the CRF.

        Returns:
            With ``tags``: the negated CRF output (negative log-likelihood).
            Without ``tags``: the result of ``crf.viterbi_decode``.
        """
        if mask is None:
            # Must be computed from the raw ids, before they are embedded
            mask = x != self.embedding.padding_idx

        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        emissions = self.fc(self.dropout(lstm_out))

        if tags is not None:  # Training
            return -self.crf(emissions, tags, mask=mask)
        # Prediction
        return self.crf.viterbi_decode(emissions, mask=mask)

In [92]:
# Build the tagger and its optimizer.
# The embedding table has VOCAB_SIZE rows, so every token id must be below VOCAB_SIZE.
model = BiLSTM_NER(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_classes=NUM_CLASSES,
)
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

In [93]:
def train_model(model, dataloader, epochs, opt=None):
    """Train ``model`` for ``epochs`` passes over ``dataloader``.

    Args:
        model: module called as ``model(X_batch, y_batch, mask)`` that returns
            a loss tensor (a scalar, or one value per sequence).
        dataloader: iterable yielding ``(X_batch, y_batch)`` pairs of id
            tensors in which id 0 marks padding.
        epochs: number of passes over ``dataloader``.
        opt: optimizer that updates ``model``'s parameters. When omitted, the
            notebook-level ``optimizer`` is used so existing three-argument
            calls behave as before. Pass one explicitly when training a model
            other than the one that global optimizer was built for.

    Prints the average batch loss after every epoch (``nan`` if the loader
    produced no batches).
    """
    if opt is None:
        opt = optimizer  # notebook-level default

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0  # counted here so loaders without len() and empty loaders both work
        for X_batch, y_batch in dataloader:
            mask = (X_batch != 0)  # True where the token is not padding
            opt.zero_grad()
            loss = model(X_batch, y_batch, mask)  # CRF loss
            if loss.dim() > 0:
                # One loss per sequence: average over the batch to get a scalar
                loss = loss.mean()
            loss.backward()
            opt.step()
            total_loss += loss.item()
            num_batches += 1
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}")
    print("Training complete!")

# Run the full training schedule on the prepared batches
train_model(model, dataloader, epochs=EPOCHS)

Epoch 1/10, Avg Loss: 2.1604
Epoch 2/10, Avg Loss: 0.0250
Epoch 3/10, Avg Loss: 0.0084
Epoch 4/10, Avg Loss: 0.0041
Epoch 5/10, Avg Loss: 0.0023
Epoch 6/10, Avg Loss: 0.0014
Epoch 7/10, Avg Loss: 0.0009
Epoch 8/10, Avg Loss: 0.0006
Epoch 9/10, Avg Loss: 0.0004
Epoch 10/10, Avg Loss: 0.0129
Training complete!


In [94]:
def predict(sentence):
    """Tag each whitespace-separated word of ``sentence``.

    Words are stripped and lower-cased before the tokenizer lookup. The id
    sequence is right-padded with 0 up to ``MAX_SEQUENCE_LENGTH`` and decoded
    by the notebook-level ``model`` in eval mode.

    Returns:
        A list of label strings, at most one per input word.
    """
    model.eval()

    token_ids = []
    for word in sentence.split():
        token_ids.append(tokenizer[word.strip().lower()])
    n_tokens = len(token_ids)

    padding = [0] * (MAX_SEQUENCE_LENGTH - n_tokens)
    input_tensor = torch.tensor([token_ids + padding], dtype=torch.long)
    mask = input_tensor != 0  # True at real tokens

    with torch.no_grad():
        decoded = model(input_tensor, mask=mask)
    best_path = decoded[0]  # the batch holds one sentence

    # Keep only the positions that belong to actual words
    return [idx2label[label_id] for label_id in best_path[:n_tokens]]

In [95]:
# Smoke-test the tagger on a few hand-written sentences
test_sentences = [
    "practice presentation slides for the client pitch on June 15th at two pm",
    "on Oct 22nd at 17 Review the project timeline document",
    "Book a rideshare for the airport which is on Thursday by tomorrow at 20:00",
]
for sentence in test_sentences:
    pred = predict(sentence)
    # Blank line, then the sentence and its predicted tags
    print(f"\nSentence: {sentence}\nPredicted: {pred}")


Sentence: practice presentation slides for the client pitch on June 15th at two pm
Predicted: ['B-Task', 'B-Task', 'I-Task', 'I-Task', 'O', 'B-Task', 'I-Task', 'O', 'B-Date', 'I-Date', 'O', 'B-Time', 'I-Time']

Sentence: on Oct 22nd at 17 Review the project timeline document
Predicted: ['O', 'B-Date', 'I-Date', 'O', 'B-Time', 'B-Task', 'O', 'B-Task', 'I-Task', 'I-Task']

Sentence: Book a rideshare for the airport which is on Thursday by tomorrow at 20:00
Predicted: ['B-Task', 'I-Task', 'I-Task', 'I-Task', 'O', 'B-Task', 'O', 'O', 'O', 'O', 'O', 'B-Date', 'O', 'B-Time']


In [96]:
# Inspect the CRF layer's `trans_matrix` parameter after training.
# The printed tensor is 7 x 7, i.e. one row and one column per entry of LABELS.
# NOTE(review): presumably entry [i][j] scores the transition from label i to
# label j — confirm against the TorchCRF documentation.
print(model.crf.trans_matrix)

Parameter containing:
tensor([[ 0.1063,  0.3283, -0.2020,  0.1096, -0.3213,  0.2531, -0.2274],
        [-0.1284, -0.3469,  0.1402,  0.0268, -0.1159, -0.1149, -0.0678],
        [ 0.1827, -0.2965,  0.2179, -0.1539, -0.0948, -0.1242, -0.1741],
        [-0.2225, -0.1072, -0.1741, -0.2618,  0.2292, -0.0291, -0.0937],
        [ 0.0403, -0.0100, -0.0387, -0.1076,  0.1940, -0.1788,  0.0413],
        [-0.1476, -0.0253, -0.2179, -0.2165, -0.0752, -0.1672,  0.2402],
        [-0.0240,  0.0292, -0.1216,  0.1739, -0.0231, -0.0071, -0.2256]],
       requires_grad=True)


In [97]:
# Persist only the learned parameters (state dict), not the module object itself
torch.save(obj=model.state_dict(), f='NERModel.pth')

In [98]:
import dateparser as dp

# Resolve a bare weekday name to a calendar date, preferring upcoming dates
print(
    dp.parse(
        "sunday",
        languages=['en'],
        settings={'DATE_ORDER': 'DMY', 'PREFER_DATES_FROM': 'future'},
    ).date()
)

2025-04-13
