In [None]:
!pip install transformers inflect



In [None]:
# Mount Google Drive (Colab only) so the /content/drive/MyDrive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json

def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a text file holding one JSON value per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    # Explicit UTF-8 so parsing does not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        return [json.loads(line) for line in f if line.strip()]

# Training and dev splits, each parsed into a list of records.
# NOTE: despite the .json extension these files are read as JSON Lines (one object per line).
train_data = load_jsonl('/content/drive/MyDrive/weak/release/ontonotes/augmented_train.json')
dev_data = load_jsonl('/content/drive/MyDrive/weak/release/ontonotes/g_dev.json')


In [None]:
from transformers import BertTokenizer, BertForMaskedLM
import torch

# Keep the masked-LM objects under dedicated names: later cells rebind `model`
# (to a sequence classifier) and `tokenizer`, which would otherwise silently
# break generate_weak_label when it is called after those cells have run.
mlm_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
mlm_model = BertForMaskedLM.from_pretrained('bert-base-uncased')
mlm_model.eval()

# Names that the following cells already rely on.
tokenizer = mlm_tokenizer
model = mlm_model

def generate_weak_label(sentence, mention):
    """Guess a type word for ``mention`` using a masked-LM cloze prompt.

    Builds the prompt "A [MASK] such as {mention}." and returns the
    highest-scoring vocabulary token for the masked position.

    Args:
        sentence: Currently unused; kept so existing call sites keep working.
        mention: Surface string of the entity mention.

    Returns:
        The decoded top-scoring token for the masked position, as a string.
    """
    prompt = f"A [MASK] such as {mention}."
    inputs = mlm_tokenizer(prompt, return_tensors='pt')
    with torch.no_grad():
        outputs = mlm_model(**inputs)
    predictions = outputs.logits
    # torch.where returns (row_idx, col_idx); [1] is the column of the mask token.
    masked_index = torch.where(inputs.input_ids == mlm_tokenizer.mask_token_id)[1]
    predicted_token_id = predictions[0, masked_index, :].argmax(axis=-1)
    predicted_token = mlm_tokenizer.decode(predicted_token_id)
    return predicted_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
from torch import optim
from torch.optim import AdamW
from tqdm import tqdm
import os

# Fine-tune on the first 2000 training examples only (taken in file order, no shuffling before the cut).
train_data_subset = train_data[:2000]

class EntityTypingDataset(Dataset):
    """Dataset that turns entity-typing examples into BERT classification inputs.

    Each example is a dict with ``left_context_token`` and
    ``right_context_token`` (token lists), ``mention_span`` (string) and ``y``
    (a numeric label id, or a list of them).

    Args:
        data: Sequence of example dicts.
        tokenizer: Callable tokenizer returning a mapping of tensors with a
            leading batch dimension of 1 when ``return_tensors='pt'``.
        max_length: Length every example is padded/truncated to. Defaults to
            128, the value that was previously hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.examples = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the tokenized inputs plus a ``labels`` tensor for example ``idx``."""
        example = self.examples[idx]
        left = ' '.join(example['left_context_token'])
        right = ' '.join(example['right_context_token'])
        # The mention is wrapped in textual markers.
        # NOTE(review): nothing visible registers these markers as special
        # tokens, so presumably they are split into word pieces -- confirm.
        text = f"{left} [MENTION] {example['mention_span']} [/MENTION] {right}"
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop the batch dimension; the DataLoader adds its own when collating.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        # Single-label setup: when `y` is a list only its first entry is used.
        label = example['y'][0] if isinstance(example['y'], list) else example['y']
        inputs['labels'] = torch.tensor(label, dtype=torch.long)
        return inputs

In [None]:
# Wrap the training subset and batch it; batches are reshuffled every epoch.
train_dataset = EntityTypingDataset(data=train_data_subset, tokenizer=tokenizer)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)

In [None]:
# Pretrained BERT encoder with a newly initialized classification head of 10000 outputs.
# This rebinds `model`, replacing the BertForMaskedLM instance loaded earlier.
# NOTE(review): 10000 is hard-coded; presumably an upper bound on the label ids in `y` -- confirm against the ontology size.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=10000)
model.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Work from the Drive project folder; relative paths used later (e.g. "bert_entity_typing_model") resolve against it.
%cd /content/drive/MyDrive/weak

/content/drive/.shortcut-targets-by-id/1pCHxTyXCsZkRLYkIitI8qk1g_VwmQRj2/weak


In [None]:
# Train on the GPU when one is available (falls back to CPU otherwise), matching
# the device handling used by the inference cell further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

for epoch in range(3):
    # Re-assert training mode so re-running this cell after an eval() still trains with dropout.
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False)
    for batch in progress_bar:
        # Inputs must live on the same device as the model parameters.
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'])
        # Passing `labels` makes the model compute the classification loss itself.
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    # Mean of the per-batch losses for this epoch.
    print(f"Epoch {epoch+1} completed. Avg Loss: {total_loss / len(train_loader):.4f}")



Epoch 1 completed. Avg Loss: 8.0140




Epoch 2 completed. Avg Loss: 5.6660




Epoch 3 completed. Avg Loss: 4.6563
Model saved to /content/drive/MyDrive/weak/bert_entity_typing.pt


In [None]:
# Persist the fine-tuned model in two forms:
#   1. a raw state_dict checkpoint at an absolute Drive path, and
#   2. a Hugging Face directory (model + tokenizer files) for from_pretrained().
save_path = "/content/drive/MyDrive/weak/bert_entity_typing.pt"
torch.save(model.state_dict(), save_path)
# Relative path: the directory is created under the current working directory.
model.save_pretrained("bert_entity_typing_model")
tokenizer.save_pretrained("bert_entity_typing_model")
print(f"Model saved to {save_path}")

Model saved to /content/drive/MyDrive/weak/bert_entity_typing.pt


In [None]:
import json

# One type name per line; blank lines are ignored. Line order defines the ids.
with open("/content/drive/MyDrive/weak/release/ontology/types.txt", "r", encoding="utf-8") as f:
    labels = [line.strip() for line in f if line.strip()]

# JSON object keys are always strings, so id2label is keyed by str(id) to look
# the same before and after a save/load round trip.
id2label = {str(i): label for i, label in enumerate(labels)}
# Map names to *integer* ids. Inverting id2label (as before) produced string
# ids such as "0", which cannot be used directly as class indices.
label2id = {label: i for i, label in enumerate(labels)}

with open("/content/drive/MyDrive/weak/mappings/label_mapping2.json", "w", encoding="utf-8") as f:
    json.dump(id2label, f, indent=2)

with open("/content/drive/MyDrive/weak/mappings/label2id2.json", "w", encoding="utf-8") as f:
    json.dump(label2id, f, indent=2)

print("Mappings saved.")


Mappings saved.


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import json


# NOTE(review): no visible cell writes "bert_entity_typing_model2" (the save cell
# uses "bert_entity_typing_model"); presumably this directory comes from another run -- confirm.
model_dir = "bert_entity_typing_model2"
tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)

# Run inference on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# id -> type-name mapping; keys are strings because JSON object keys always are.
with open("/content/drive/MyDrive/weak/mappings/label_mapping2.json", "r") as f:
    id2label = json.load(f)

In [None]:
def predict_entity_type(left_context, mention, right_context):
    """Classify a mention in context and return its type name.

    Uses the notebook-level ``tokenizer``, ``model`` and ``id2label``.

    Args:
        left_context: Text preceding the mention.
        mention: The mention string to classify.
        right_context: Text following the mention.

    Returns:
        The label name for the highest-scoring class, or "UNKNOWN" when the
        class id has no entry in ``id2label``.
    """
    marked_text = f"{left_context} [MENTION] {mention} [/MENTION] {right_context}"

    encoded = tokenizer(marked_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    # Move every input tensor to wherever the model currently lives.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    model.eval()
    with torch.no_grad():
        class_id = model(**encoded).logits.argmax(dim=1).item()

    # id2label is keyed by the string form of the class index.
    return id2label.get(str(class_id), "UNKNOWN")

In [None]:
# Quick sanity check of predict_entity_type on a hand-written example.
left = "The book was written by"
mention = "J.K. Rowling"
right = "and it became a bestseller."

predicted_type = predict_entity_type(left, mention, right)
print(f"Predicted Entity Type: {predicted_type}")

Predicted Entity Type: person


In [None]:
# Second hand-written example; reuses (and overwrites) the left/mention/right variables.
left = "last year we all went to"
mention = "Infosys"
right = "and it was an amazing trip."

predicted_type = predict_entity_type(left, mention, right)
print(f"Predicted Entity Type: {predicted_type}")

Predicted Entity Type: organization


In [None]:
# NOTE(review): this repeats the EntityTypingDataset defined in the training
# section of this notebook; consider keeping a single definition so the two
# copies cannot drift apart.
class EntityTypingDataset(Dataset):
    """Dataset that turns entity-typing examples into BERT classification inputs.

    Each example is a dict with ``left_context_token`` and
    ``right_context_token`` (token lists), ``mention_span`` (string) and ``y``
    (a numeric label id, or a list of them).

    Args:
        data: Sequence of example dicts.
        tokenizer: Callable tokenizer returning a mapping of tensors with a
            leading batch dimension of 1 when ``return_tensors='pt'``.
        max_length: Length every example is padded/truncated to. Defaults to
            128, the value that was previously hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.examples = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return the tokenized inputs plus a ``labels`` tensor for example ``idx``."""
        example = self.examples[idx]
        left = ' '.join(example['left_context_token'])
        right = ' '.join(example['right_context_token'])
        # The mention is wrapped in textual markers so the model can locate it.
        text = f"{left} [MENTION] {example['mention_span']} [/MENTION] {right}"
        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # Drop the batch dimension; the DataLoader adds its own when collating.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        # Single-label setup: when `y` is a list only its first entry is used.
        label = example['y'][0] if isinstance(example['y'], list) else example['y']
        inputs['labels'] = torch.tensor(label, dtype=torch.long)
        return inputs

# Dev split in its original order (no shuffling) so evaluation is repeatable.
test_dataset = EntityTypingDataset(data=dev_data, tokenizer=tokenizer)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=16,
    shuffle=False,
)

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import numpy as np
import torch

# Reload the fine-tuned model and tokenizer saved by the training section.
model = BertForSequenceClassification.from_pretrained("bert_entity_typing_model")
tokenizer = BertTokenizer.from_pretrained("bert_entity_typing_model")
model.eval()

# Evaluate on at most this many leading examples of the test loader.
max_samples = 150
sample_count = 0

predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        remaining = max_samples - sample_count
        if remaining <= 0:
            break

        logits = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        ).logits
        batch_predictions = logits.argmax(dim=1)

        # The last batch used may overshoot the cap; keep only what is still needed.
        take_n = min(batch_predictions.size(0), remaining)

        predictions.extend(batch_predictions[:take_n].cpu().numpy())
        true_labels.extend(batch['labels'][:take_n].cpu().numpy())

        sample_count += take_n

predictions = np.array(predictions)
true_labels = np.array(true_labels)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Support-weighted averages over the classes present in the evaluated samples.
# Classes that are never predicted have undefined precision; zero_division=0
# scores them as 0 explicitly (the same value the default uses) without
# emitting an UndefinedMetricWarning for each metric.
# Note: with one predicted label per example, weighted recall equals accuracy.
precision, recall, f1, _ = precision_recall_fscore_support(
    true_labels, predictions, average='weighted', zero_division=0
)

print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")


Accuracy: 58.00%
Precision: 37.06%
Recall: 58.00%
F1 Score: 45.22%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
