In [1]:
import csv
import gc
import json
import os

import pandas as pd
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

In [2]:
# Parse the input JSON file into `data`.
# Point the path below at your own copy of the file if it lives elsewhere.
with open('/Updated_XBRL.json', mode='r', encoding='utf-8') as source_fh:
    data = json.loads(source_fh.read())


In [3]:
# Flatten the loaded JSON records into a CSV file, one row per entry.
# Replace '/resultFile.csv' with your desired output file name.
with open('/resultFile.csv', 'w', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)

    # Keys copied verbatim from every entry, in column order.
    source_fields = ('id', 'query', 'text', 'answer')

    # Header row: the copied fields plus a column reserved for the model output.
    csv_writer.writerow([*source_fields, 'generated answer'])

    # One row per entry. .get() with a '' default keeps entries that lack a key
    # from raising; the last column holds the "NAN" placeholder until the
    # generation step overwrites it.
    # Building each row inside a generator (instead of binding id/query/text/
    # answer in a loop) avoids shadowing the builtin id() and leaking per-row
    # variables into the kernel's global namespace.
    csv_writer.writerows(
        [*(entry.get(field, '') for field in source_fields), "NAN"]
        for entry in data
    )

In [4]:
# Reload the exported CSV into a DataFrame for the generation step.
# keep_default_na=False: fields missing from the JSON are written as empty
# strings; with pandas' default NA handling those (and literal text such as
# "NA" or "None") would be parsed as NaN and later rendered as the string "nan"
# when interpolated into the prompt.
csv_file = '/resultFile.csv'
df = pd.read_csv(csv_file, keep_default_na=False)

In [5]:
# Checkpoint shared by the tokenizer and the model so the two cannot drift apart.
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# SECURITY: never commit a Hugging Face access token in the notebook source.
# The token is read from the HF_TOKEN environment variable; when it is unset,
# None is passed and the library is expected to fall back to credentials cached
# by a previous login. Any token that was previously hard-coded here should be
# treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=hf_token)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:

# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Kept as the cell's final expression so the notebook echoes the model structure.
model.to(device)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [None]:

# Generate an answer for each selected row and store it in 'generated answer',
# then save the whole DataFrame to a new CSV file.

# Rows to run through the model. slice(1, 3) covers only the 2nd and 3rd rows
# (same as the original df.iloc[1:3]); use slice(None) to process every row.
ROWS_TO_PROCESS = slice(1, 3)
# Upper bound on the number of tokens generated per answer.
MAX_NEW_TOKENS = 256

# Release memory left over from earlier cells before starting.
torch.cuda.empty_cache()
gc.collect()

for index, row in df.iloc[ROWS_TO_PROCESS].iterrows():
    query = row['query']
    text = row['text']

    # Create the prompt by combining query and text
    prompt = f"{query}\n{text}"

    # Single-turn chat: the whole prompt is sent as one user message.
    messages = [{"role": "user", "content": prompt}]

    # Tokenize via the chat template; return_dict=True yields both the
    # input ids and the attention mask.
    tokenized_message = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True
    )

    # Move the inputs to whatever device the model lives on. A hard-coded
    # .cuda() fails on CPU-only machines, which the device-selection cell
    # explicitly allows for.
    input_ids = tokenized_message['input_ids'].to(model.device)
    attention_mask = tokenized_message['attention_mask'].to(model.device)

    # Generate the model's response
    response_token_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=MAX_NEW_TOKENS,
        pad_token_id=tokenizer.eos_token_id
    )

    # The output sequence starts with the prompt tokens; slice them off so
    # only the newly generated continuation is decoded.
    prompt_length = input_ids.shape[-1]
    generated_tokens = response_token_ids[:, prompt_length:]
    generated_text = tokenizer.batch_decode(
        generated_tokens,
        skip_special_tokens=True
    )[0].strip()

    # Update the 'generated answer' column
    df.at[index, 'generated answer'] = generated_text

    # Drop per-row tensors before the next iteration. Every name listed here is
    # bound above; the original deleted names that were never defined and so
    # raised NameError after the first row.
    del tokenized_message, input_ids, attention_mask, response_token_ids, generated_tokens
    torch.cuda.empty_cache()
    gc.collect()

    # Optional: Print progress
    print(f"Processed row {index + 1}/{len(df)}")


# Save the updated DataFrame to a new CSV file
df.to_csv('/resultFile_with_answers.csv', index=False)

print("The 'generated answer' column has been updated and saved to '/resultFile_with_answers.csv'.")