In [2]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from huggingface_hub import login


In [4]:
json_file = '/XBRL_part1.json'
df = pd.read_json(json_file)

In [5]:
# Display the first few rows of the dataframe
print(df.head())

     id                                              query  \
0  3525  What is the US GAAP XBRL tag for Accounts Paya...   

                                                text         answer  
0  file:c-826.xml\n<{http://fasb.org/us-gaap/2023...  Answer:f-2543  


In [6]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B", token= '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')
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B",token='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')

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [8]:
# 0-shot, answering questions every time in a size of 5
def generate_responses(questions, batch_size=5):
    responses = []
    tokenizer.pad_token = tokenizer.eos_token
    for i in range(0, len(questions), batch_size):
        batch_questions = questions[i:i + batch_size]
        inputs = tokenizer.batch_encode_plus(
            batch_questions,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to('cuda')

        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length= 750,  # Set to the maximum length you need
            pad_token_id=tokenizer.eos_token_id,  # Set the end token
            no_repeat_ngram_size=2  # Optional: Prevent the generation of repeated 2-length n-grams
        )

        batch_responses = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
        responses.extend(batch_responses)

    return responses

In [9]:
df['evidence_text'] = generate_responses(df['query'].tolist())

In [13]:
# Save the updated dataframe to a new CSV file
output_json_file = 'part1eval.json'
df.to_json(output_json_file, index=False)

In [15]:
print(f"Responses have been generated and saved to {output_json_file}")

Responses have been generated and saved to part1eval.json


In [12]:
# Load and display the first few rows of the output CSV file for verification
output_df = pd.read_json(output_json_file, orient='records', lines=True)
print(output_df.head())

            id                                              query  \
0  {'0': 3525}  {'0': 'What is the US GAAP XBRL tag for Accoun...   

                                                text                  answer  \
0  {'0': 'file:c-826.xml
<{http://fasb.org/us-gaa...  {'0': 'Answer:f-2543'}   

                                       evidence_text  
0  {'0': 'What is the US GAAP XBRL tag for Accoun...  
