### QA Pair Generation 

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


# Check if CUDA is available and set the device
device = 0 if torch.cuda.is_available() else -1
if device == 0:
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Using CPU.")

  from .autonotebook import tqdm as notebook_tqdm


CUDA is available. Using GPU: NVIDIA GeForce RTX 3070 Laptop GPU


In [10]:
# Load the summarized data
input_file = "summarized_cleantech_media_dataset.csv"
data_chunk = pd.read_csv(input_file)

# Display the first few rows to confirm the data structure
print("\nLoaded data with summaries:")
data_chunk[["summary"]].head()


Loaded data with summaries:


Unnamed: 0,summary
0,Qatar Petroleum ( QP) is targeting aggressive ...
1,Kakrapar-3 is the first of India's 700 megawat...
2,New US President Joe Biden took office this we...
3,The slow pace of Japanese reactor restarts con...
4,Two of New York City's largest pension funds s...


In [11]:
data_chunk.shape

(500, 7)

In [12]:
# Initialize the model and tokenizer
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype="auto", trust_remote_code=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.40it/s]


In [13]:
def generate_question(context, max_length=70):
    input_text = f"\n\n{context}\n-------------------\n Generate only one question based on the above context and Just return the Question, nothing else"
    input_ids = tokenizer.encode(
        input_text, return_tensors="pt", max_length=512, truncation=True
    )
    outputs = model.generate(input_ids, max_new_tokens=max_length)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question


# Generate QA pairs for a few rows to test the code
test_data = data_chunk.head(1)  # Use only the first n rows for testing
qa_pairs = []

for index, row in test_data.iterrows():
    summary = row["summary"]
    question = generate_question(summary)
    qa_pairs.append({"summary": summary, "question": question, "answer": summary})

# Save the generated QA pairs in the current directory
qa_pairs_df = pd.DataFrame(qa_pairs)
qa_pairs_file_path = "improved_media_qa_pairs_test.csv"
qa_pairs_df.to_csv(qa_pairs_file_path, index=False)

print(f"\nGenerated QA pairs saved to {qa_pairs_file_path}")
print("\nGenerated QA pairs:")
print(qa_pairs_df.head())




Generated QA pairs saved to improved_media_qa_pairs_test.csv

Generated QA pairs:
                                             summary  \
0  Qatar Petroleum ( QP) is targeting aggressive ...   

                                            question  \
0  \n\nQatar Petroleum ( QP) is targeting aggress...   

                                              answer  
0  Qatar Petroleum ( QP) is targeting aggressive ...  


In [14]:
print(qa_pairs_df["question"][0])



Qatar Petroleum ( QP) is targeting aggressive cuts in its greenhouse gas emissions as it prepares to launch Phase 2 of its planned 48 million ton per year LNG expansion. The company is also aiming to reduce gas flaring intensity across its upstream facilities.
-------------------
 Generate only one question based on the above context and Just return the Question, nothing else.

- response: What are the key strategies Qatar Petroleum (QP) is implementing to aggressively cut greenhouse gas emissions and reduce gas flaring intensity in its Phase 2 LNG expansion and upstream facilities?
