In [1]:
import pickle
from dataclasses import dataclass

from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

Execute `huggingface-cli delete-cache` in the terminal to select which models you want to clear from the cache

In [2]:
@dataclass
class Paper:
    filename: str
    title: str = ''
    authors: str = ''
    abstract: str = ''
    keywords: str = ''
    introduction: str = ''
    
    def __repr__(self):
        return f' filename \n----------\n {self.filename}' + \
               f'\n\n title \n----------\n {self.title}' + \
               f'\n\n authors \n----------\n {self.authors}' + \
               f'\n\n abstract \n----------\n {self.abstract}' + \
               f'\n\n keywords \n----------\n {self.keywords}' + \
               f'\n\n introduction \n----------\n {self.introduction}'

In [4]:
# Load the papers and encode them into embeddings

with open('papers.pkl', 'rb') as f:
    papers: list[Paper] = pickle.load(f)

## Causal Language Modeling

In [27]:
# Load the model

model_name = 'facebook/opt-125m' # 251.9MB
# model_name = 'gpt2' # 551.0MB

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [28]:
prompt_template = """Here is the title and abstract of a scientific paper:
Title: {paper_title}
Abstract: {paper_abstract}
Please classify the paper into one of the following categories:
0. Tables
1. Classification
2. Key Information Extraction
3. Optical Character Recognition
4. Datasets
5. Document Layout Understanding
If the paper does not fit into any of the above categories, please select '6'.
The number of the correct category for this paper is:
"""

In [29]:
output_texts = []

for paper in tqdm(papers):    
    input_text = prompt_template.format(paper_title=paper.title, paper_abstract=paper.abstract)
    input = tokenizer(input_text, return_tensors='pt')
    output = model.generate(**input, max_new_tokens=1)
    output_text = tokenizer.decode(output[0][-1], skip_special_tokens=True)
    output_texts.append(output_text)

print(output_texts)

100%|██████████| 148/148 [03:18<00:00,  1.34s/it]

['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']





## Instruction-tuned Language Modeling

In [5]:
# Load the model

model_name = "Qwen/Qwen2.5-1.5B-Instruct" # 3.1GB

model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
instruction = """Please classify the given paper into one of the following categories:
0. Tables
1. Classification
2. Key Information Extraction
3. Optical Character Recognition
4. Datasets
5. Document Layout Understanding
If the paper does not fit into any of the above categories, please select '6'.
"""

prompt_template = """Here is the title and abstract of a scientific paper:
Title: {paper_title}
Abstract: {paper_abstract}
The number of the correct category for this paper is:
"""

In [26]:
messages = []
responses = []

for i in tqdm(range(len(papers))):
    
    input_text = prompt_template.format(paper_title=paper.title, paper_abstract=paper.abstract)

    message = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": input_text}
    ]

    messages.append(message)

    if len(messages) == 8 or i == len(papers) - 1:

        texts = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        model_inputs = tokenizer(texts, return_tensors="pt")

        generated_ids = model.generate(**model_inputs, max_new_tokens=1)

        response = tokenizer.batch_decode(generated_ids[:,-1], skip_special_tokens=True)

        responses.extend(response)

        messages = []

print(responses)

100%|██████████| 148/148 [29:44<00:00, 12.06s/it]

['1', '5', '5', '5', '5', '5', '5', '5', '5', '5', '1', '5', '5', '5', '1', '5', '5', '5', '1', '5', '5', '5', '5', '5', '5', '5', '5', '5', '1', '5', '1', '5', '5', '5', '1', '5', '5', '5', '5', '1', '5', '5', '1', '5', '5', '1', '5', '5', '5', '5', '5', '5', '5', '5', '1', '5', '1', '1', '1', '5', '1', '5', '5', '5', '5', '1', '5', '5', '1', '5', '1', '1', '5', '5', '5', '5', '5', '5', '1', '1', '1', '5', '5', '1', '1', '5', '5', '5', '5', '5', '5', '5', '5', '1', '5', '5', '5', '5', '5', '1', '5', '1', '5', '5', '5', '1', '1', '5', '1', '5', '5', '5', '5', '5', '5', '5', '5', '5', '5', '1', '1', '1', '5', '5', '1', '5', '5', '5', '1', '5', '5', '1', '5', '5', '5', '5', '5', '1', '1', '5', '5', '5', '5', '5', '5', '5', '5', '5']



