In [None]:
import json
import torch
import numpy as np
import spacy
import os
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline


In [None]:
# Circular ids to process: 50 consecutive ids starting at 21916.
sub_cirs = range(21916, 21916 + 50)
# Disabled alternatives kept for reference (presumably earlier hand-picked subsets):
#   [21916, 21917, 21923, 21924, 21930, 21941, 22115, 22281, 22942]
#   sub_cirs = [21917]

# Folder that create_bio_files writes the per-circular JSON files into.
dataset_manual_annotation_folder = "dataset_manual_annotation_v1"

print(list(sub_cirs))


In [None]:
def get_circulars(circular_ids=None, archive_dir='./archive.json'):
    """Load circular JSON files from disk.

    Args:
        circular_ids: Iterable of circular ids to load. When omitted, the
            notebook-level ``sub_cirs`` selection is used.
        archive_dir: Directory holding one ``<id>.json`` file per circular.

    Returns:
        list: The parsed JSON content of each file, in the order of the ids.

    Raises:
        FileNotFoundError: If the file for a requested id does not exist.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    if circular_ids is None:
        circular_ids = sub_cirs

    json_circulars = []
    for circular_id in circular_ids:
        path = '{}/{}.json'.format(archive_dir, circular_id)
        # Decode explicitly as UTF-8 instead of relying on the platform default
        # (assumes the archive files are UTF-8, the standard JSON encoding).
        with open(path, 'r', encoding='utf-8') as f:
            json_circulars.append(json.load(f))
    return json_circulars

In [None]:
# Load every selected circular from disk.
output = get_circulars()
# Show the first loaded circular as a quick sanity check.
print(output[0])

In [None]:
# Hugging Face Hub repository that provides both the tokenizer and the
# token-classification model.
remote_model_path = 'kusha7/astrobert-gcn-tokenizer'
# Access token read from the environment; raises KeyError if "hf_token" is unset.
hf_token = os.environ["hf_token"]
tokenizer = AutoTokenizer.from_pretrained(remote_model_path, token=hf_token)
# Load from the same identifier as the tokenizer so the two cannot drift apart.
model = AutoModelForTokenClassification.from_pretrained(remote_model_path, token=hf_token)
# ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="none")

In [None]:
# spaCy English pipeline; get_att uses it to split circular bodies into sentences.
nlp = spacy.load("en_core_web_sm")

# Mapping from predicted class id to label string, taken from the model config.
label_list = model.config.id2label

# Prefer the MPS backend when PyTorch reports it as available, otherwise CPU.
# NOTE(review): nothing in the visible cells reads `device` — confirm whether
# the model was meant to be moved onto it.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
def merge_subwords(tokens):
    merged = []
    buffer = ""
    buffer_start = None
    buffer_end = None
    buffer_label = None

    for token_info in tokens:
        token = token_info['token']
        start = token_info['start']
        end = token_info['end']
        label = token_info['label']
        if token == '[CLS]' or token == '[SEP]':
            continue

        if token.startswith('##'):
            # It's a subword — merge with buffer
            buffer += token[2:]
            buffer_end = end
        else:
            # Commit previous buffer if exists
            if buffer:
                merged.append({
                    "token": buffer,
                    "start": buffer_start,
                    "end": buffer_end,
                    "label": buffer_label
                })
                buffer = ""
                buffer_start = buffer_end = buffer_label = None

            # Start new buffer
            buffer = token
            buffer_start = start
            buffer_end = end
            buffer_label = label

    # Commit the final buffer
    if buffer:
        merged.append({
            "token": buffer,
            "start": buffer_start,
            "end": buffer_end,
            "label": buffer_label
        })

    return merged


In [None]:
def get_att(cir):
    """Predict a label for every word of every sentence in a circular.

    The body is split into sentences with the spaCy pipeline ``nlp``; each
    stripped sentence is tokenized, passed through ``model``, and the
    per-token predictions are merged back to word level with
    ``merge_subwords``.

    Args:
        cir: Dict with a "body" entry holding the circular text.

    Returns:
        list[dict]: One entry per non-empty sentence with the keys "sent"
        (the stripped sentence text) and "result" (the merged word-level
        list of token/start/end/label dicts). Offsets are relative to the
        stripped sentence, since that is the string given to the tokenizer.
        All tokens are included, even those labeled "O".
    """
    doc = nlp(cir["body"])
    sentence_level_tokenization = []

    for span in doc.sents:
        sent = span.text.strip()
        if not sent:
            # A whitespace-only sentence has nothing to annotate.
            continue

        # truncation=True: sentences longer than the tokenizer's maximum
        # length are cut, so their trailing tokens get no prediction.
        encoding = tokenizer(sent, return_tensors="pt", return_offsets_mapping=True, truncation=True)
        # offset_mapping is tokenizer metadata, not a model input.
        offsets = encoding.pop("offset_mapping")[0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

        # Put the inputs on whatever device the model lives on, so inference
        # keeps working if the model is moved off the CPU.
        inputs = {name: tensor.to(model.device) for name, tensor in encoding.items()}

        with torch.no_grad():
            outputs = model(**inputs, return_dict=True)

        # Highest-scoring class id per token for the single sequence in the batch.
        pred_ids = outputs.logits.argmax(dim=2)[0].cpu().tolist()

        sentence_results = [
            {
                "token": token,
                "start": offset[0],
                "end": offset[1],
                "label": label_list[pred_id],
            }
            for token, offset, pred_id in zip(tokens, offsets, pred_ids)
        ]
        sentence_level_tokenization.append({
            "sent": sent,
            "result": merge_subwords(sentence_results),
        })

    return sentence_level_tokenization
def generate_partial_sentence_level_bio_format(json_circulars):
    """Attach sentence-level predictions to a copy of each circular.

    Args:
        json_circulars: Iterable of circular dicts, each with a "body" entry.

    Returns:
        list[dict]: Shallow copies of the input circulars in which "body" has
        its newlines replaced by spaces and "new_pipeline_output" holds the
        result of ``get_att`` for that flattened body. The input dicts are
        not reassigned.
    """
    annotated = []
    for circular in json_circulars:
        enriched = circular.copy()
        # Flatten line breaks before sentence splitting.
        enriched["body"] = circular["body"].replace("\n", " ")
        enriched["new_pipeline_output"] = get_att(enriched)
        annotated.append(enriched)
    return annotated

In [None]:
# Annotate every loaded circular with sentence-level model predictions.
output_bio_2 = generate_partial_sentence_level_bio_format(output)
# print(output_bio_2[0]["body"])
# Inspect the per-sentence predictions of the first circular.
print(output_bio_2[0]["new_pipeline_output"])


In [None]:
import os
def write_bio(output_path, cir):
    """Write a circular's sentence-level predictions to a JSON file.

    The file contains a JSON array (indent 2) with one object per sentence:
    "sentence" is the sentence text and "list" holds one
    {"token", "label"} object per word.

    Args:
        output_path: Destination file path; an existing file is overwritten.
        cir: Dict with a "new_pipeline_output" list. Each item needs the keys
            "sent" and "result"; each "result" entry needs "token" and "label".

    Raises:
        KeyError: If any of the required keys is missing. The destination
            file is left untouched in that case.
    """
    # Build the whole payload before opening the file, so malformed input
    # raises without truncating a file that already exists at output_path.
    to_be_saved = [
        {
            "sentence": sentence["sent"],
            "list": [{"token": r["token"], "label": r["label"]} for r in sentence["result"]],
        }
        for sentence in cir["new_pipeline_output"]
    ]
    final_json = json.dumps(to_be_saved, indent=2)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_json)

def create_bio_files(outputs):
    """Write one JSON file per annotated circular.

    Creates the folder named by ``dataset_manual_annotation_folder`` if it
    does not exist, then calls ``write_bio`` for each circular, using the
    circular's "circularId" as the file name.

    Args:
        outputs: Iterable of circular dicts carrying "circularId" and the
            entries ``write_bio`` reads.
    """
    os.makedirs(dataset_manual_annotation_folder, exist_ok=True)
    for circular in outputs:
        target_path = f"{dataset_manual_annotation_folder}/{circular['circularId']}.json"
        write_bio(target_path, circular)

# Write one JSON file per annotated circular into the annotation folder.
create_bio_files(output_bio_2)