Execute `huggingface-cli delete-cache` in the terminal to select which models you want to clear from the cache.

In [1]:
import pickle

from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

import helpers
from helpers import Paper

In [2]:
# Read the pickled list of papers back from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load a papers.pkl you trust.

with open('papers.pkl', 'rb') as papers_file:
    papers: list[Paper] = pickle.load(papers_file)

## Causal Language Modeling

In [3]:
# Pick the causal-LM checkpoint, then load its tokenizer and weights.

# Alternative checkpoint: 'gpt2' (551.0MB)
model_name = 'facebook/opt-125m'  # 251.9MB

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [4]:
# Prompt for the causal LM: the paper's title and abstract followed by the
# numbered category list. The placeholders {paper_title} and {paper_abstract}
# are filled in with str.format().
prompt_template = (
    "Here is the title and abstract of a scientific paper:\n"
    "Title: {paper_title}\n"
    "Abstract: {paper_abstract}\n"
    "Please classify the paper into one of the following categories:\n"
    "0. Tables\n"
    "1. Classification\n"
    "2. Key Information Extraction\n"
    "3. Optical Character Recognition\n"
    "4. Datasets\n"
    "5. Document Layout Understanding\n"
    "If the paper does not fit into any of the above categories, please select '6'.\n"
    "The number of the correct category for this paper is:\n"
)

In [5]:
# Ask the causal LM for exactly one new token per paper and read that token
# as the number of the predicted category.

output_texts = []

for paper in tqdm(papers):
    input_text = prompt_template.format(paper_title=paper.title, paper_abstract=paper.abstract)
    # Named `model_inputs` rather than `input`, so the builtin input() is not
    # shadowed for the rest of the kernel session.
    model_inputs = tokenizer(input_text, return_tensors='pt')
    generated_ids = model.generate(**model_inputs, max_new_tokens=1)
    # Only the last id of the single returned sequence is decoded: with
    # max_new_tokens=1 that is the one newly generated token.
    output_text = tokenizer.decode(generated_ids[0][-1], skip_special_tokens=True)
    output_texts.append(output_text)

# int() raises ValueError if a generated token is not a number.
predictions = [helpers.categories[int(index)] for index in output_texts]

100%|██████████| 148/148 [02:51<00:00,  1.16s/it]


In [6]:
# Write the causal-LM predictions next to their papers, once per output format.

for save_predictions, output_path in (
    (helpers.save_to_csv, 'generation-clm.csv'),
    (helpers.save_to_json, 'generation-clm.json'),
):
    save_predictions(papers, predictions, output_path)

## Instruction-tuned Language Modeling

In [7]:
# Switch to the instruction-tuned checkpoint; `model` and `tokenizer` are
# rebound, replacing the causal LM loaded earlier.

model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # 3.1GB (long inference time)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [8]:
# Task description with the numbered category list; used as the "system"
# message of each chat.
instruction = (
    "Please classify the given paper into one of the following categories:\n"
    "0. Tables\n"
    "1. Classification\n"
    "2. Key Information Extraction\n"
    "3. Optical Character Recognition\n"
    "4. Datasets\n"
    "5. Document Layout Understanding\n"
    "If the paper does not fit into any of the above categories, please select '6'.\n"
)

# Per-paper text; used as the "user" message. The placeholders {paper_title}
# and {paper_abstract} are filled in with str.format().
prompt_template = (
    "Here is the title and abstract of a scientific paper:\n"
    "Title: {paper_title}\n"
    "Abstract: {paper_abstract}\n"
    "The number of the correct category for this paper is:\n"
)

In [11]:
# Classify every paper with the instruction-tuned model, several papers per
# generate() call.

BATCH_SIZE = 8  # papers per generate() call

# Prompts in a batch differ in length, so they must be padded. Padding goes on
# the left so that the last position of every row is the newly generated token
# rather than padding.
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

responses = []

with tqdm(total=len(papers)) as progress:
    for start in range(0, len(papers), BATCH_SIZE):
        batch = papers[start:start + BATCH_SIZE]

        # One chat per paper. Each prompt is built from the paper of *this*
        # batch, not from a `paper` variable left over from an earlier cell.
        messages = [
            [
                {"role": "system", "content": instruction},
                {
                    "role": "user",
                    "content": prompt_template.format(
                        paper_title=paper.title, paper_abstract=paper.abstract
                    ),
                },
            ]
            for paper in batch
        ]

        texts = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        model_inputs = tokenizer(texts, return_tensors="pt", padding=True)
        # Greedy decoding keeps the predicted label reproducible across reruns
        # even if the checkpoint's default generation config enables sampling.
        generated_ids = model.generate(**model_inputs, max_new_tokens=1, do_sample=False)
        # The last column holds the single new token of every row.
        responses.extend(tokenizer.batch_decode(generated_ids[:, -1], skip_special_tokens=True))

        progress.update(len(batch))

assert len(responses) == len(papers), "expected exactly one response per paper"

# int() raises ValueError if a generated token is not a number.
predictions = [helpers.categories[int(index)] for index in responses]

  0%|          | 0/148 [00:00<?, ?it/s]

100%|██████████| 148/148 [31:59<00:00, 12.97s/it]


In [12]:
# Write the instruction-tuned model's predictions next to their papers, once
# per output format.

for save_predictions, output_path in (
    (helpers.save_to_csv, 'generation-itlm.csv'),
    (helpers.save_to_json, 'generation-itlm.json'),
):
    save_predictions(papers, predictions, output_path)