In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
%pip install pytorch-lightning -q -U

In [27]:
# Preprocess the Penn Treebank dataset
# Load the dataset
import torch

def read_data(file_path):
    parsed_data = []
    with open(file_path, 'r', encoding='utf-8') as f:
      parsed_tokens = []
      for line in f:
            if line == '\n' or line[0] == '#':
                if parsed_tokens:
                  parsed_data.append(parsed_tokens)
                parsed_tokens = []
            else:
                parts = line.strip().split(' _ _ ')
                word = parts[0]
                tag = parts[1] if len(parts) > 1 else ''
                parsed_tokens.append((word, tag))
    return parsed_data

train_dataset= read_data('/content/drive/MyDrive/DeepLearning/multiconer2023/HI-Hindi/hi_train.conll')
dev_dataset= read_data('/content/drive/MyDrive/DeepLearning/multiconer2023/HI-Hindi/hi_dev.conll')
test_dataset= read_data('/content/drive/MyDrive/DeepLearning/multiconer2023/HI-Hindi/hi_test.conll')

for i in range(100):
    print(train_dataset[i])
 

[('यह', 'O'), ('झियान', 'B-HumanSettlement'), ('चीन', 'B-HumanSettlement'), ('के', 'O'), ('केंद्र', 'O'), ('भाग', 'O'), ('में', 'O'), ('स्थित', 'O'), ('है।', 'O')]
[('२००३', 'O'), ('में', 'O'), ('विंबलडन,', 'B-HumanSettlement'), ('लंदन', 'I-HumanSettlement'), ('में', 'O'), ('एक', 'O'), ('साइकिल', 'O'), ('चालक', 'O'), ('के', 'O'), ('साथ', 'O'), ('टकराव', 'O'), ('के', 'O'), ('दौरान', 'O'), ('रोड्स', 'O'), ('को', 'O'), ('मार', 'O'), ('दिया', 'O'), ('गया', 'O'), ('था।', 'O')]
[('उन्होंने', 'O'), ('अल्जियर्स', 'B-HumanSettlement'), ('में', 'O'), ('राजनीति', 'O'), ('विज्ञान', 'O'), ('का', 'O'), ('अध्ययन', 'O'), ('किया।', 'O')]
[('चार्ल्स', 'O'), ('कोर्डोबा', 'B-HumanSettlement'), ('अमीरात', 'I-HumanSettlement'), ('के', 'O'), ('साथ', 'O'), ('कूटनीति', 'O'), ('में', 'O'), ('लगे', 'O'), ('हुए', 'O'), ('८६५', 'O'), ('में', 'O'), ('मुहम्मद', 'B-Politician'), ('प्रथम', 'I-Politician'), ('कोर्डोबा', 'I-Politician'), ('से', 'O'), ('ऊंट', 'O'), ('प्राप्त', 'O'), ('करते', 'O'), ('हैं।', 'O')]
[('वह', 

In [28]:
SEQ_LEN = 30

# Create word_to_idx and tag_to_idx mappings
word_to_idx = {"<PAD>": 0, "<UNK>": 1}
tag_to_idx = {"<PAD>": 0}


def preprocess(dataset):
    # Extract sentences and tags
    sent = [[token.lower() for token, tag in sentence] for sentence in dataset]
    tags = [[tag for token, tag in sentence] for sentence in dataset]

    for i in range(len(sent)):
        while len(sent[i]) < SEQ_LEN:
            sent[i].append('<PAD>')
            tags[i].append('<PAD>')

        if len(sent[i]) > SEQ_LEN:
            sent[i] = sent[i][:SEQ_LEN]
            tags[i] = tags[i][:SEQ_LEN]
    
    for sentence_tags in tags:
        for tag in sentence_tags:
            if tag not in tag_to_idx:
                tag_to_idx[tag] = len(tag_to_idx)
    
    for sentence in sent:
        for word in sentence:
            if word not in word_to_idx:
                word_to_idx[word] = len(word_to_idx)

    # Convert words and tags to indices
    X = torch.tensor([[word_to_idx.get(word, 1) for word in sentence] for sentence in sent], dtype=torch.int).type(torch.LongTensor)
    Y = torch.tensor([[tag_to_idx[tag] for tag in sentence] for sentence in tags], dtype=torch.int).type(torch.LongTensor)
    
    return X, Y

In [29]:
train_X, train_Y = preprocess(train_dataset)
dev_X, dev_Y = preprocess(dev_dataset)
test_X, test_Y = preprocess(test_dataset)

In [30]:
# Print the sizes of the datasets
print(f"Number of training examples: {len(train_X)}")
print(f"Number of validation examples: {len(dev_X)}")
print(f"Number of testing examples: {len(test_X)}")
# for i in range(100):
#     print(train_X[i])

Number of training examples: 9632
Number of validation examples: 514
Number of testing examples: 18399


In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl

class NERModel(pl.LightningModule):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim) #B * seq_len, B * seq_len * embedding_dim
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, num_layers=num_layers, bidirectional=True)
        self.fc = nn.Linear(2*hidden_dim, tagset_size)
        self.loss_fn = nn.CrossEntropyLoss()
    
    def forward(self, x):
        embeds = self.embedding(x)
        #print(embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.fc(lstm_out)
        tag_scores = nn.functional.log_softmax(tag_space, dim=2)
        return tag_scores
    
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('train_loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('val_loss', loss)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self.forward(x)
        loss = self.loss_fn(y_hat.view(-1, y_hat.shape[-1]), y.view(-1))
        self.log('test_loss', loss)
        return loss
    
    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters())
        return optimizer

In [32]:
from torch.utils.data import DataLoader, TensorDataset
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

EMBEDDING_DIM = 125
HIDDEN_DIM    = 125
NUM_EPOCHS    = 10 
BATCH_SIZE    = 10

train_dataset = TensorDataset(train_X, train_Y)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(dev_X, dev_Y)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_X, test_Y)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [33]:
model = NERModel(vocab_size=len(word_to_idx), tagset_size=len(tag_to_idx), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)
early_stopping = EarlyStopping(monitor="val_loss", patience=3, mode="min")
trainer = pl.Trainer(max_epochs=NUM_EPOCHS, callbacks=[early_stopping])
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

trainer.test(dataloaders=test_loader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type             | Params
-----------------------------------------------
0 | embedding | Embedding        | 3.8 M 
1 | lstm      | LSTM             | 252 K 
2 | fc        | Linear           | 17.1 K
3 | loss_fn   | CrossEntropyLoss | 0     
-----------------------------------------------
4.1 M     Trainable params
0         Non-trainable params
4.1 M     Total params
16.371    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:Restoring states from the checkpoint path at /content/lightning_logs/version_2/checkpoints/epoch=6-step=6748.ckpt
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.utilities.rank_zero:Loaded model weights from the checkpoint at /content/lightning_logs/version_2/checkpoints/epoch=6-step=6748.ckpt


Testing: 0it [00:00, ?it/s]

[{'test_loss': 0.2674734592437744}]

In [34]:
from sklearn.metrics import classification_report

# define idx_to_tag
idx_to_tag = {idx: tag for tag, idx in tag_to_idx.items()}

# define device
device = torch.device('cpu')

# Create a dataloader for the test set
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Set the model to evaluation mode
model.eval()

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]

print(classification_report(y_true, y_pred))

                         precision    recall  f1-score   support

                  <PAD>       1.00      1.00      1.00    258203
B-AerospaceManufacturer       0.21      0.05      0.08        85
  B-AnatomicalStructure       0.68      0.61      0.64       489
              B-ArtWork       0.17      0.00      0.00       426
               B-Artist       0.48      0.40      0.44      1852
              B-Athlete       0.62      0.56      0.59      1171
      B-CarManufacturer       0.78      0.85      0.81       146
               B-Cleric       0.54      0.80      0.65       188
             B-Clothing       0.48      0.82      0.61        77
              B-Disease       0.65      0.65      0.65       633
                B-Drink       0.59      0.65      0.62       135
             B-Facility       0.43      0.47      0.45       859
                 B-Food       0.47      0.60      0.52       428
      B-HumanSettlement       0.76      0.65      0.70      5825
     B-MedicalProcedure 

In [35]:
# Set the model to evaluation mode
model.eval()

idx_to_word = {idx: word for word, idx in word_to_idx.items()}

y_true = []
y_pred = []

with torch.no_grad():
    for x, y in test_loader:
        # Move the data to the device
        x = x.to(device)
        y = y.to(device)

        # Forward pass
        y_hat = model(x)

        # Get back the sentence
        x_sent = [idx_to_word[i] for i in x.cpu().numpy().flatten().tolist()]

        # Compute the predicted tags
        y_pred += [idx_to_tag[i] for i in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

        # Compute the true tags
        y_true += [idx_to_tag[i] for i in y.cpu().numpy().flatten().tolist()]
        print("Sentence\t:", end = ' ')
        for i in range(len(x_sent)):
          print(x_sent[i], end = ' ')
        print("\nPredicted tags\t:", end = ' ')
        for i in range(len(y_pred)):
          print(y_pred[i], end = ' ')
        break

Sentence	: उनकी विशेषताओं आंदोलनों और खेल शैली के कारण उनकी तुलना वाल्टर ज़ेंगा से की गई है। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> एथन काट्ज़ ( जन्म १९८३ ) शिकागो व्हाइट सोक्स के लिए पिचिंग कोच <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> वह प्रसिद्ध रैंडविक रेसकोर्स ट्रेनर इसहाक अर्नशॉ के लिए प्रेरित था और उस समय के कुछ बेहतरीन घोड़ों के लिए सवार था। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> चेल्सी ने उथल -पुथल में मैच में प्रवेश किया उनके प्रबंधक टॉमी डोचेर्टी एक दिन पहले इस्तीफा दे दिया। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> शूप २००४ में मुख्य कोच जॉन ग्रुडेन के तहत बुक्स के लिए क्वार्टरबैक कोच था। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> उनकी प्रतिभा को शुरू में ओटो रेहगेल द्वारा नजरअंदाज कर दिया गया था लेकिन २००७ में निर्वाचक ने उन्हें कैप किया। <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

In [36]:
torch.save(model.state_dict(), '/content/drive/MyDrive/DeepLearning/multiconer2023/HI-Hindi/Hindi_fine.pth')

In [37]:
hindi_model = model.load_state_dict(torch.load('/content/drive/MyDrive/DeepLearning/multiconer2023/HI-Hindi/Hindi_fine.pth'))

def pred_NER(sentence):
    # Split the sentence into words
    words = sentence.split()
    for word in words:
      word = word.lower()

    # Get the index of the words
    X = []
    for word in words:
        if word not in word_to_idx:
            X.append(word_to_idx["<UNK>"])
        else:
            X.append(word_to_idx[word])
    
    # Create a tensor
    X = torch.tensor(X).unsqueeze(0)
    
    # Get the predictions
    y_hat = model(X)
    
    # Get the predictions
    predictions = [idx_to_tag[pred] for pred in y_hat.argmax(-1).cpu().numpy().flatten().tolist()]

    # Give new sentence
    new_sentence = []
    flag = False
    tag = ""
    for word, pred in zip(words, predictions):
        if pred == "O":
            if flag:
                new_sentence[new_sentence.__len__() - 1] = new_sentence[new_sentence.__len__() - 1] + "[" + tag + "]"
                flag = False
                tag = ""
            new_sentence.append(word)
        elif len(pred) == 0:
            new_sentence.append(word)
        elif pred[0] == "B":
            flag = True
            tag = pred[2:]
            new_sentence.append(word)
        else:
            new_sentence.append(word)
    if flag:
        new_sentence[new_sentence.__len__() - 1] = new_sentence[new_sentence.__len__() - 1] + "[" + tag + "]"
        flag = False
        tag = "" 
    
    return " ".join(new_sentence)
print(pred_NER("जिम ने 2022 में एक्मे कॉर्प के 300 शेयर खरीदे।"))


जिम ने 2022[Facility] में एक्मे कॉर्प के 300 शेयर खरीदे।
