In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from collections import defaultdict
#from torchcrf import CRF

# Define parameters
MAX_SEQUENCE_LENGTH = 40  # Adjust as needed
EMBEDDING_DIM = 100
VOCAB_SIZE = 20000
HIDDEN_DIM = 32
BATCH_SIZE = 5
EPOCHS = 3

# Labels and mapping
LABELS = ['O', 'B-Task', 'I-Task', 'B-Date', 'I-Date', 'B-Time', 'I-Time']
NUM_CLASSES = len(LABELS)
label2idx = {label: idx for idx, label in enumerate(LABELS)}
idx2label = {idx: label for label, idx in label2idx.items()}

In [2]:
df = pd.read_csv('NER_Data.csv')
df.head(10)

Unnamed: 0,Task,Label
0,Don't forget to schedule the ceremony tickets ...,"O,O,O,B-Task,O,B-Task,I-Task,O,B-Date,O,B-Time"
1,Don't forget to organize the tournament ticket...,"O,O,O,B-Task,O,B-Task,I-Task,O,B-Date,O,B-Time"
2,study lyrics for the exhibition on Jul 9th at ...,"B-Task,I-Task,I-Task,O,B-Task,O,B-Date,I-Date,..."
3,send the project report by 12am on Sunday,"B-Task,O,B-Task,I-Task,O,B-Time,O,B-Date"
4,make sure to deal with the trip certification ...,"O,O,O,B-Task,I-Task,O,B-Task,I-Task,O,B-Date,O..."
5,organize a new photo album by today at noon,"B-Task,I-Task,I-Task,I-Task,I-Task,O,B-Date,O,..."
6,Remind me to manage the team meeting report be...,"O,O,O,B-Task,O,B-Task,I-Task,I-Task,O,B-Date,O..."
7,mow preliminary idea report by 8:45 pm on Satu...,"B-Task,I-Task,I-Task,I-Task,O,B-Time,I-Time,O,..."
8,check a new photo album by today at noon,"B-Task,I-Task,I-Task,I-Task,I-Task,O,B-Date,O,..."
9,read preliminary idea report by 2:00 am on Monday,"B-Task,I-Task,I-Task,I-Task,O,B-Time,I-Time,O,..."


In [3]:
# Example training data
task_examples = df['Task'].astype(str)
print(task_examples[:3])

y_train = df['Label'].apply(lambda x: x.split(","))

# Tokenizer class
tokenizer = defaultdict(lambda: 1)  # Unknown words map to index 1
tokenizer.update({word: idx+2 for idx, word in enumerate(set(" ".join(task_examples).split()))})  # Start from index 2

# Process data
X_train = [[tokenizer[word] for word in example.split()] for example in task_examples]
X_train_padded = [seq + [0] * (MAX_SEQUENCE_LENGTH - len(seq)) for seq in X_train]  # Pad to max length
y_train_indices = [[label2idx.get(label, 0) for label in sent] for sent in y_train]
y_train_padded = [seq + [0] * (MAX_SEQUENCE_LENGTH - len(seq)) for seq in y_train_indices]

0    Don't forget to schedule the ceremony tickets ...
1    Don't forget to organize the tournament ticket...
2    study lyrics for the exhibition on Jul 9th at ...
Name: Task, dtype: object


In [4]:
# PyTorch Dataset class
class TaskDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = TaskDataset(X_train_padded, y_train_padded)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

In [5]:
# Model definition
class BiLSTM_NER(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(BiLSTM_NER, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)  # BiLSTM doubles hidden size
        # self.crf = CRF(num_classes, batch_first=True)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.dropout(x)
        emissions = self.fc(x)
        return emissions

    def loss(self, emissions, tags, mask):
        return -self.crf(emissions, tags, mask=mask, reduction='mean')

    def predict(self, emissions, mask):
        return self.crf.decode(emissions, mask=mask)

In [6]:
# Instantiate model
model = BiLSTM_NER(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES)
criterion = nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [7]:
def train_model(model, dataloader, optimizer, criterion, epochs):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for X_batch, y_batch in dataloader:
            optimizer.zero_grad()
            emissions = model(X_batch)
            loss = criterion(emissions.view(-1, NUM_CLASSES), y_batch.view(-1))  # Flatten for loss computation
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

train_model(model, dataloader, optimizer, criterion, EPOCHS)


Epoch 1/3, Loss: 0.5279
Epoch 2/3, Loss: 0.0347
Epoch 3/3, Loss: 0.0093


In [8]:
def predict(sentence):
    model.eval()
    tokens = [tokenizer[word] for word in sentence.split()]
    padded = tokens + [0] * (MAX_SEQUENCE_LENGTH - len(tokens))
    input_tensor = torch.tensor([padded], dtype=torch.long)
    mask = (input_tensor != 0).bool()  # Ensure proper mask format
    with torch.no_grad():
        emissions = model(input_tensor)
        predictions = torch.argmax(emissions, dim=-1).squeeze(0)  # Get highest probability index
    return [idx2label[idx.item()] for idx in predictions[:len(tokens)]]

In [11]:
print(predict("Don't forget to schedule the ceremony tickets by tomorrow at 12:00"))

['B-Task', 'B-Task', 'B-Task', 'B-Task', 'B-Task', 'B-Task', 'I-Task', 'I-Task', 'B-Date', 'B-Time', 'B-Time']
