In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%pip install pytorch-lightning -q -U

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m716.4/716.4 KB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 KB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 KB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 KB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Preprocess the Penn Treebank dataset
# Load the dataset
import torch

def read_data(file_path):
    parsed_data = []
    with open(file_path, 'r', encoding='utf-8') as f:
      parsed_tokens = []
      for line in f:
            if line == '\n' or line[0] == '#':
                if parsed_tokens:
                  parsed_data.append(parsed_tokens)
                parsed_tokens = []
            else:
                parts = line.strip().split(' _ _ ')
                word = parts[0]
                tag = parts[1] if len(parts) > 1 else ''
                parsed_tokens.append((word, tag))
    return parsed_data

train_dataset= read_data('/content/drive/MyDrive/DeepLearning/multiconer2023/EN-English/en_train.conll')
dev_dataset= read_data('/content/drive/MyDrive/DeepLearning/multiconer2023/EN-English/en_dev.conll')
test_dataset= read_data('/content/drive/MyDrive/DeepLearning/multiconer2023/EN-English/en_test.conll')

for i in range(100):
    print(train_dataset[i])
 

[('robert', 'B-OtherPER'), ('gottschalk', 'I-OtherPER'), ('1939', 'O'), ('academy', 'B-VisualWork'), ('award', 'I-VisualWork'), ('winner', 'O'), ('and', 'O'), ('founder', 'O'), ('of', 'O'), ('panavision', 'B-ORG')]
[('during', 'O'), ('the', 'O'), ('reign', 'O'), ('of', 'O'), ('the', 'O'), ('tongzhi', 'B-OtherPER'), ('emperor', 'I-OtherPER'), ('(', 'O'), ('r', 'O'), ('.', 'O'), ('1861', 'O'), ('–', 'O'), ('1875', 'O'), (')', 'O'), (':', 'O')]
[('further', 'O'), ('research', 'O'), ('led', 'O'), ('in', 'O'), ('the', 'O'), ('1960s', 'O'), ('to', 'O'), ('the', 'O'), ('bahadur', 'B-OtherPER'), ('representation', 'O'), ('which', 'O'), ('provides', 'O'), ('information', 'O'), ('about', 'O'), ('the', 'O'), ('errorbounds', 'O'), ('.', 'O')]
[('the', 'O'), ('ideas', 'O'), ('were', 'O'), ('introduced', 'O'), ('by', 'O'), ('william', 'B-OtherPER'), ('burnside', 'I-OtherPER'), ('at', 'O'), ('the', 'O'), ('end', 'O'), ('of', 'O'), ('the', 'O'), ('nineteenth', 'O'), ('century', 'O'), ('.', 'O')]
[('th

In [4]:
SEQ_LEN = 30

# Create word_to_idx and tag_to_idx mappings
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y

In [5]:
train_X, train_Y = preprocess(train_dataset)
dev_X, dev_Y = preprocess(dev_dataset)
test_X, test_Y = preprocess(test_dataset)

In [6]:
# Print the sizes of the datasets
print(f"Number of training examples: {len(train_X)}")
print(f"Number of validation examples: {len(dev_X)}")
print(f"Number of testing examples: {len(test_X)}")
# for i in range(100):
#     print(train_X[i])

Number of training examples: 16778
Number of validation examples: 871
Number of testing examples: 249980


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [8]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

EMBEDDING_DIM = 200
HIDDEN_DIM    = 200
NUM_EPOCHS    = 10 
BATCH_SIZE    = 10

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [9]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)
early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 48.4 M
1 | lstm      | LSTM             | 643 K 
2 | fc        | Linear           | 27.3 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
49.1 M    Trainable params
0         Non-trainable params
49.1 M    Total params
196.404   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_0/checkpoints/epoch=5-step=10068.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_0/checkpoints/epoch=5-step=10068.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.4338398873806}]

In [10]:
from sklearn.metrics import classification_report

# define idx_to_tag
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00   3727316
B-AerospaceManufacturer       0.39      0.40      0.39      1015
  B-AnatomicalStructure       0.40      0.25      0.31      5838
              B-ArtWork       0.16      0.22      0.19      1270
               B-Artist       0.58      0.52      0.55     57034
              B-Athlete       0.50      0.48      0.49     27624
      B-CarManufacturer       0.43      0.30      0.36      2984
               B-Cleric       0.31      0.25      0.28      4732
             B-Clothing       0.20      0.14      0.16      2243
              B-Disease       0.35      0.29      0.32      5622
                B-Drink       0.32      0.19      0.24      2246
             B-Facility       0.43      0.35      0.39     16181
                 B-Food       0.20      0.08      0.11      5317
      B-HumanSettlement       0.57      0.65      0.61     41099
     B-MedicalProcedure 

In [11]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence\t:", end = ' ')
        for i in range(len(x_sent)):
          print(x_sent[i], end = ' ')
        print("\nPredicted tags\t:", end = ' ')
        for i in range(len(y_pred)):
          print(y_pred[i], end = ' ')
        break

Sentence	: the species was described by dietrich brandis after the forester t. f. bourdillon . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> by this time she was competing against a new generation of young drivers including stirling moss and peter collins . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> their son was the opera producer knut hendriksen ( 1944 – 2020 ) . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> he is the younger brother of adam mosseri . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> smes : laura j. van 't veer et al . ( nl ) for their gene based breast cancer test <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> pau became a leading political and intellectual centre under the reign of henry d'albret . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

In [12]:
torch.save(model.state_dict(), '/content/drive/MyDrive/DeepLearning/multiconer2023/EN-English/English_fine.pth')

In [13]:
eng_model = model.load_state_dict(torch.load('/content/drive/MyDrive/DeepLearning/multiconer2023/EN-English/English_fine.pth'))
def get_prediction(sentence):
    # Split the sentence into words
    words = sentence.split()
    for word in words:
      word = word.lower()

    # Get the index of the words
    X = []
    for word in words:
        if word not in word_to_idx:
            X.append(word_to_idx["<UNK>"])
        else:
            X.append(word_to_idx[word])
    
    # Create a tensor
    X = torch.tensor(X).unsqueeze(0)
    
    # Get the predictions
    y_hat = model(X)
    
    # Get the predictions
    predictions = [idx_to_tag[pred] for pred in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

    # Give new sentence
    flag = False
    tag = ""
    new_sentence = []
    for word, pred in zip(words, predictions):
        if pred == "O":
            if flag:
                flag = False
                new_sentence[new_sentence.__len__() - 1] = new_sentence[new_sentence.__len__() - 1] + "[" + tag + "]"
                tag = ""
            new_sentence.append(word)
        elif len(pred) == 0:
            new_sentence.append(word)
        elif pred[0] == "B":
            tag = pred[2:]
            flag = True
            new_sentence.append(word)
        else:
            new_sentence.append(word)
    if flag:
        flag = False
        new_sentence[new_sentence.__len__() - 1] = new_sentence[new_sentence.__len__() - 1] + "[" + tag + "]"
        tag = "" 
    
    return " ".join(new_sentence)
print(get_prediction("Jim bought 300 shares of Acme Corp. in 2022."))


Jim bought 300 shares of Acme Corp. in 2022.[HumanSettlement]
