# Loading the Dataset

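Load the dataset from the CSV file, summarize the `content` column of the first 50 articles with the `facebook/bart-large-cnn` summarization pipeline, and inspect the first few results: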

In [35]:
import pandas as pd
from transformers import pipeline
import tqdm

# Read the media articles from the CSV file
MEDIA_DATASET_PATH = "cleantech_media_dataset_stage3.csv"
media_data = pd.read_csv(MEDIA_DATASET_PATH)

# Build the Hugging Face summarization pipeline used by summarize_batch
SUMMARIZATION_MODEL = "facebook/bart-large-cnn"
summarizer = pipeline(task="summarization", model=SUMMARIZATION_MODEL)


# Function to summarize a batch of texts
def summarize_batch(texts, batch_size=8, max_chars=1000):
    """Summarize ``texts`` with the module-level ``summarizer``, one batch at a time.

    Args:
        texts: Sequence of documents to summarize. Entries that are not
            non-blank strings (for example NaN from an empty CSV cell) are
            not sent to the model and receive an empty summary.
        batch_size: Number of texts handed to the pipeline per call; must be >= 1.
        max_chars: Each text is cut to this many characters before summarization.

    Returns:
        A list of summary strings with the same length and order as ``texts``.
        Skipped entries and entries of a batch whose pipeline call failed are "".

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Imported locally because a later cell rebinds the module-level name
    # `tqdm` (via `from tqdm import tqdm`), which would break `tqdm.tqdm` here
    # when this function is re-run afterwards.
    from tqdm import tqdm as progress_bar

    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    summaries = []
    for i in progress_bar(range(0, len(texts), batch_size)):
        batch = texts[i : i + batch_size]
        # Pre-fill with empty summaries so output stays aligned with input
        # even when some entries are skipped or the pipeline call fails.
        batch_summaries = [""] * len(batch)

        # Keep only usable texts, remembering their position inside the batch,
        # so one bad entry no longer blanks the whole batch.
        usable = [
            (position, text[:max_chars])
            for position, text in enumerate(batch)
            if isinstance(text, str) and text.strip()
        ]

        if usable:
            try:
                outputs = summarizer(
                    [text for _, text in usable],
                    max_length=130,
                    min_length=30,
                    do_sample=False,
                    truncation=True,  # guard against inputs longer than the model limit
                )
                for (position, _), output in zip(usable, outputs):
                    batch_summaries[position] = output["summary_text"]
            except Exception as e:
                # Best-effort: report the failure and keep the empty summaries
                print(f"Error in batch {i // batch_size}: {e}")

        summaries.extend(batch_summaries)
    return summaries


# Restrict processing to the first `chunk_size` rows of the dataset
chunk_size = 50  # number of rows taken from the top of media_data
data_chunk = media_data.iloc[:chunk_size].copy()

# Summarize every article body in the chunk
chunk_texts = data_chunk["content"].tolist()
summaries = summarize_batch(chunk_texts)

# Store the summaries in a "summary" column of the chunk
data_chunk.loc[:, "summary"] = summaries

# Preview titles alongside their generated summaries
chunk_preview = data_chunk[["title", "summary"]].head()
print(chunk_preview)

100%|██████████| 7/7 [04:58<00:00, 42.71s/it]

                                               title  \
0  Qatar to Slash Emissions as LNG Expansion Adva...   
1               India Launches Its First 700 MW PHWR   
2              New Chapter for US-China Energy Trade   
3  Japan: Slow Restarts Cast Doubt on 2030 Energy...   
4     NYC Pension Funds to Divest Fossil Fuel Shares   

                                             summary  
0  Qatar Petroleum ( QP) is targeting aggressive ...  
1  Kakrapar-3 is the first of India's 700 megawat...  
2  New US President Joe Biden took office this we...  
3  The slow pace of Japanese reactor restarts con...  
4  Two of New York City's largest pension funds s...  





In [36]:
# Show title and summary side by side for up to 50 rows of the chunk
data_chunk.loc[:, ["title", "summary"]].head(50)

Unnamed: 0,title,summary
0,Qatar to Slash Emissions as LNG Expansion Adva...,Qatar Petroleum ( QP) is targeting aggressive ...
1,India Launches Its First 700 MW PHWR,Kakrapar-3 is the first of India's 700 megawat...
2,New Chapter for US-China Energy Trade,New US President Joe Biden took office this we...
3,Japan: Slow Restarts Cast Doubt on 2030 Energy...,The slow pace of Japanese reactor restarts con...
4,NYC Pension Funds to Divest Fossil Fuel Shares,Two of New York City's largest pension funds s...
5,Japan: Supreme Court Will Likely Decide on Fuk...,Japan's Supreme Court will likely become the a...
6,Biden Appointees Signal Progressive Engagement,Oil and natural gas industry officials have be...
7,The Big Picture: The New 'Great Game ',Low-carbon energy race will be at the center a...
8,Japan: Tritium Release Plans at Fukushima On Hold,Plans to deal with more than 1 million tons of...
9,United States: Cold Snap Highlights Electrific...,The coldest weather in a generation brought wi...


In [5]:
# Full text of the summary stored under index label 0
print(data_chunk.loc[0, "summary"])

Qatar Petroleum ( QP) is targeting aggressive cuts in its greenhouse gas emissions as it prepares to launch Phase 2 of its planned 48 million ton per year LNG expansion. The company is also aiming to reduce gas flaring intensity across its upstream facilities.


In [4]:
# Full text of the summary stored under index label 1
print(data_chunk.loc[1, "summary"])

Kakrapar-3 is the first of India's 700 megawatt indigenously developed pressurized heavy water reactors ( PHWRs) to reach this milestone. 15 more units of the same design will follow.


In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps xformers trl peft accelerate bitsandbytes

In [None]:
# from unsloth import FastLanguageModel
# import torch
# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
# dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# fourbit_models = [
#     "unsloth/mistral-7b-bnb-4bit",
#     "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
#     "unsloth/llama-2-7b-bnb-4bit",
#     "unsloth/llama-2-13b-bnb-4bit",
#     "unsloth/codellama-34b-bnb-4bit",
#     "unsloth/tinyllama-bnb-4bit",
#     "unsloth/gemma-7b-bnb-4bit", # New Google 6 trillion tokens model 2.5x faster!
#     "unsloth/gemma-2b-bnb-4bit",
# ] # More models at https://huggingface.co/unsloth

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "unsloth/mistral-7b-v0.3", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
#     max_seq_length = max_seq_length,
#     dtype = dtype,
#     load_in_4bit = load_in_4bit,
#     # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
# )

RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx

In [None]:
# model = FastLanguageModel.get_peft_model(
#     model,
#     r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
#     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
#                       "gate_proj", "up_proj", "down_proj",],
#     lora_alpha = 16,
#     lora_dropout = 0, # Supports any, but = 0 is optimized
#     bias = "none",    # Supports any, but = "none" is optimized
#     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
#     use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
#     random_state = 3407,
#     use_rslora = False,  # We support rank stabilized LoRA
#     loftq_config = None, # And LoftQ
# )

In [None]:

# # Load the dataset with TextRank summaries
# summary_file_path = '/content/drive/MyDrive/CLT/cleantech_media_with_textrank_summaries.csv'
# media_data = pd.read_csv(summary_file_path)

# # Function to generate a question and an answer for a given summary
# def generate_qa(summary):
#     if not summary or pd.isnull(summary):
#         return {"question": "No content available", "answer": "No content available"}

#     try:
#         # Generate a question
#         prompt_for_question = f"Generate a question for the following summary:\n\n{summary}\n\nQuestion:"
#         question_inputs = tokenizer(prompt_for_question, return_tensors="pt", truncation=True, max_length=max_seq_length)
#         question_ids = model.generate(question_inputs.input_ids, max_length=50, num_beams=4, early_stopping=True)
#         question = tokenizer.decode(question_ids[0], skip_special_tokens=True)

#         # Generate an answer
#         prompt_for_answer = f"Answer the following question based on this summary:\n\nSummary: {summary}\n\nQuestion: {question}\n\nAnswer:"
#         answer_inputs = tokenizer(prompt_for_answer, return_tensors="pt", truncation=True, max_length=max_seq_length)
#         answer_ids = model.generate(answer_inputs.input_ids, max_length=50, num_beams=4, early_stopping=True)
#         answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)

#         return {"question": question, "answer": answer}
#     except Exception as e:
#         return {"question": f"Error in generating question: {str(e)}", "answer": f"Error in generating answer: {str(e)}"}

# # Apply the QA generation to each summary in the dataset
# media_data['qa'] = media_data['summary'].apply(lambda x: generate_qa(x))

# # Extract the questions and answers into separate columns
# media_data['question'] = media_data['qa'].apply(lambda x: x['question'])
# media_data['answer'] = media_data['qa'].apply(lambda x: x['answer'])

# # Save the questions and answers to a new CSV file
# qa_file_path = '/content/drive/MyDrive/CLT/cleantech_media_with_qa.csv'
# media_data.to_csv(qa_file_path, index=False)
# print(f"Questions and answers saved to {qa_file_path}")

In [None]:
# Import necessary libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
import os
from tqdm import tqdm

# Read the previously extracted summaries and show the first rows
summary_file_path = 'output/cleantech_media_with_textrank_summaries.csv'
media_summaries_df = pd.read_csv(summary_file_path)
print("\nLoaded extracted summaries:", media_summaries_df.head(), sep="\n")

# Load the T5 question-generation checkpoint and its tokenizer
model_name = "valhalla/t5-small-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_question(context, max_length=50):
    """Generate one question for ``context`` with the module-level T5 model.

    Args:
        context: Text the question should be about; it is inserted after the
            "generate question: " prefix.
        max_length: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded question string with special tokens removed.
    """
    # NOTE(review): the "-hl" checkpoint presumably expects the answer span
    # wrapped in <hl> tokens; confirm against the model card.
    prompt_ids = tokenizer.encode(
        f"generate question: {context}",
        return_tensors="pt",
        max_length=512,  # the prompt is truncated to at most 512 tokens
        truncation=True,
    )
    generated = model.generate(prompt_ids, max_new_tokens=max_length)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Generate QA pairs with batch processing and progress logging.
# For each summary, up to five sentences are each turned into a question;
# the sentence itself is recorded as the answer.
batch_size = 100  # rows per progress-bar step; model calls are still one per sentence
qa_pairs = []

for i in tqdm(range(0, len(media_summaries_df), batch_size)):
    batch_df = media_summaries_df.iloc[i:i+batch_size]
    for summary in batch_df['summary']:
        # Missing CSV cells load as NaN (a float); calling .split on them would
        # abort the whole run, so skip anything that is not a non-blank string.
        if not isinstance(summary, str) or not summary.strip():
            continue
        sentences = summary.split('. ')
        for sentence in sentences[:5]:  # Limit the number of sentences processed
            # Splitting can leave empty fragments (e.g. after a trailing ". ")
            if not sentence.strip():
                continue
            try:
                question = generate_question(sentence)
                qa_pairs.append({
                    'summary': summary,
                    'question': question,
                    'answer': sentence
                })
            except Exception as e:
                # Best-effort: report the failing summary and move on
                print(f"Error processing summary: {summary}")
                print(e)
                continue

# Make sure the output directory is present before writing into it
save_dir = 'output/'
qa_pairs_file_path = os.path.join(save_dir, 'improved_media_qa_pairs.csv')
os.makedirs(save_dir, exist_ok=True)

# Turn the collected records into a DataFrame and write it as CSV
qa_pairs_df = pd.DataFrame(qa_pairs)
qa_pairs_df.to_csv(qa_pairs_file_path, index=False)

# Report where the file went and preview the first rows
print(
    f"\nGenerated QA pairs saved to {qa_pairs_file_path}",
    "\nGenerated QA pairs:",
    qa_pairs_df.head(),
    sep="\n",
)