# Loading the Dataset

Load the dataset from the specified file path and inspect the first few entries:


In [1]:
import pandas as pd

In [2]:
# Load the dataset
file_path = "patent_preprocessed.csv"
patent_data = pd.read_csv(file_path)


In [3]:
print(patent_data.shape)
patent_data[["title", "abstract"]].head(10)

(8545, 12)


Unnamed: 0,title,abstract
0,Adaptable DC-AC Inverter Drive System and Oper...,Disclosed is an adaptable DC-AC inverter syste...
1,System for providing the energy from a single ...,"In accordance with an example embodiment, a so..."
2,Verfahren zum steuern einer windenergieanlage,Verfahren zum Steuern einer Windenergieanlage ...
3,Control method for optimizing solar-to-power e...,A control method for optimizing a solar-to-pow...
4,Mutually supporting hydropower systems,The mutually supporting hydropower systems inc...
5,System and Method to Drive Away Geese,A system and method for driving geese away fro...
6,Cladding sheet,"Cladding sheets (9), such as roof or wall clad..."
7,Hardened solar energy collector system,A hardened solar thermal energy collector (STE...
8,Systems and methods for hydro-based electric p...,A hydrodynamic power generation assembly and m...
9,Systems and methods for removing dust from sol...,Presented herein are systems and methods for w...


In [5]:
patent_data.head(4)

Unnamed: 0,publication_number,application_number,country_code,publication_date,title,abstract,inventors,code,inventive,first,title_preprocessed,abstract_preprocessed
0,US-2022239235-A1,US-202217717397-A,US,2022-07-28,Adaptable DC-AC Inverter Drive System and Oper...,Disclosed is an adaptable DC-AC inverter syste...,[],H02M7/5395,True,False,adaptable dc ac inverter drive system operation,disclose adaptable dc ac inverter system opera...
1,US-2022239251-A1,US-202217580956-A,US,2022-07-28,System for providing the energy from a single ...,"In accordance with an example embodiment, a so...",[],H02S40/38,True,False,system provide energy single contiguous solar ...,accordance example embodiment solar energy sys...
2,EP-4033090-A1,EP-21152924-A,EP,2022-07-27,Verfahren zum steuern einer windenergieanlage,Verfahren zum Steuern einer Windenergieanlage ...,"['Schaper, Ulf', 'von Aswege, Enno', 'Gerke Fu...",F03D7/0276,True,True,verfahren zum steuern einer windenergieanlage,verfahren zum steuern einer windenergieanlage ...
3,US-11396827-B2,US-202117606042-A,US,2022-07-26,Control method for optimizing solar-to-power e...,A control method for optimizing a solar-to-pow...,[],F24S50/00,True,False,control method optimize solar power efficiency...,control method optimize solar power efficiency...


### Filter out all all abstracts which are not english


In [6]:
import pandas as pd
from langdetect import detect, LangDetectException

# Assuming you have a DataFrame named patent_data
# patent_data = pd.read_csv('your_patent_data.csv')  # Load your DataFrame


def is_english(text):
    try:
        return detect(text) == "en"
    except LangDetectException:
        return False


# Apply the function to filter rows where abstracts are in English
patent_data_english = patent_data[patent_data["abstract"].apply(is_english)]

# Save the filtered DataFrame to a new CSV file
patent_data_english.to_csv("patent_data_english.csv", index=False)

In [7]:
# Display the filtered DataFrame
print("\nFiltered DataFrame (Only English Abstracts):")
print(patent_data_english.shape)
patent_data_english.head()


Filtered DataFrame (Only English Abstracts):
(4301, 12)


Unnamed: 0,publication_number,application_number,country_code,publication_date,title,abstract,inventors,code,inventive,first,title_preprocessed,abstract_preprocessed
0,US-2022239235-A1,US-202217717397-A,US,2022-07-28,Adaptable DC-AC Inverter Drive System and Oper...,Disclosed is an adaptable DC-AC inverter syste...,[],H02M7/5395,True,False,adaptable dc ac inverter drive system operation,disclose adaptable dc ac inverter system opera...
1,US-2022239251-A1,US-202217580956-A,US,2022-07-28,System for providing the energy from a single ...,"In accordance with an example embodiment, a so...",[],H02S40/38,True,False,system provide energy single contiguous solar ...,accordance example embodiment solar energy sys...
3,US-11396827-B2,US-202117606042-A,US,2022-07-26,Control method for optimizing solar-to-power e...,A control method for optimizing a solar-to-pow...,[],F24S50/00,True,False,control method optimize solar power efficiency...,control method optimize solar power efficiency...
4,US-2022228549-A1,US-202117153853-A,US,2022-07-21,Mutually supporting hydropower systems,The mutually supporting hydropower systems inc...,"['CHEN, CHUN-HUEI']",F03B3/00,True,False,mutually support hydropower system,mutually support hydropower system include hyd...
5,US-2022225606-A1,US-202217714484-A,US,2022-07-21,System and Method to Drive Away Geese,A system and method for driving geese away fro...,"['KOVARIK, JOSEPH E.', 'FRANEK, JEFF']",A01M29/10,True,True,system method drive away geese,system method drive geese away area employ pre...


### Summarize the abstracts


In [35]:
from transformers import pipeline
import tqdm

# Load your dataset
patent_data_english = pd.read_csv("patent_data_english.csv")

# Initialize the summarizer
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")


# Function to summarize a batch of texts
def summarize_batch(texts, batch_size=8):
    summaries = []
    for i in tqdm.tqdm(range(0, len(texts), batch_size)):
        batch = texts[i: i + batch_size]
        try:
            # Ensure texts are not too long for the summarizer
            batch = [text[:1000] for text in batch]
            batch_summaries = summarizer(
                batch, max_length=130, min_length=30, do_sample=False
            )
            summaries.extend([summary["summary_text"]
                             for summary in batch_summaries])
        except Exception as e:
            summaries.extend(
                [""] * len(batch)
            )  # Extend with empty summaries in case of an error
            print(f"Error in batch {i // batch_size}: {e}")
    return summaries


# Process a single chunk of the data
chunk_size = 50  # Define the size of the chunk
data_chunk = media_data.head(chunk_size).copy()  # Get the first chunk of data

# Summarize the content column of the data chunk
summaries = summarize_batch(data_chunk["content"].tolist())

# Apply the summaries to the DataFrame
data_chunk.loc[:, "summary"] = summaries

# Display the first few rows of the data chunk with the summaries
print(data_chunk[["title", "summary"]].head())

100%|██████████| 7/7 [04:58<00:00, 42.71s/it]

                                               title  \
0  Qatar to Slash Emissions as LNG Expansion Adva...   
1               India Launches Its First 700 MW PHWR   
2              New Chapter for US-China Energy Trade   
3  Japan: Slow Restarts Cast Doubt on 2030 Energy...   
4     NYC Pension Funds to Divest Fossil Fuel Shares   

                                             summary  
0  Qatar Petroleum ( QP) is targeting aggressive ...  
1  Kakrapar-3 is the first of India's 700 megawat...  
2  New US President Joe Biden took office this we...  
3  The slow pace of Japanese reactor restarts con...  
4  Two of New York City's largest pension funds s...  





In [36]:
# check the summary
data_chunk[["title", "summary"]].head(50)

Unnamed: 0,title,summary
0,Qatar to Slash Emissions as LNG Expansion Adva...,Qatar Petroleum ( QP) is targeting aggressive ...
1,India Launches Its First 700 MW PHWR,Kakrapar-3 is the first of India's 700 megawat...
2,New Chapter for US-China Energy Trade,New US President Joe Biden took office this we...
3,Japan: Slow Restarts Cast Doubt on 2030 Energy...,The slow pace of Japanese reactor restarts con...
4,NYC Pension Funds to Divest Fossil Fuel Shares,Two of New York City's largest pension funds s...
5,Japan: Supreme Court Will Likely Decide on Fuk...,Japan's Supreme Court will likely become the a...
6,Biden Appointees Signal Progressive Engagement,Oil and natural gas industry officials have be...
7,The Big Picture: The New 'Great Game ',Low-carbon energy race will be at the center a...
8,Japan: Tritium Release Plans at Fukushima On Hold,Plans to deal with more than 1 million tons of...
9,United States: Cold Snap Highlights Electrific...,The coldest weather in a generation brought wi...


In [5]:
print(data_chunk["summary"][0])

Qatar Petroleum ( QP) is targeting aggressive cuts in its greenhouse gas emissions as it prepares to launch Phase 2 of its planned 48 million ton per year LNG expansion. The company is also aiming to reduce gas flaring intensity across its upstream facilities.


In [4]:
print(data_chunk["summary"][1])

Kakrapar-3 is the first of India's 700 megawatt indigenously developed pressurized heavy water reactors ( PHWRs) to reach this milestone. 15 more units of the same design will follow.


In [None]:
# Import necessary libraries
from transformers import T5ForConditionalGeneration, T5Tokenizer
import pandas as pd
import os
from tqdm import tqdm

# Load the extracted summaries
summary_file_path = 'output/cleantech_media_with_textrank_summaries.csv'
media_summaries_df = pd.read_csv(summary_file_path)
print("\nLoaded extracted summaries:")
print(media_summaries_df.head())

# Initialize the T5 model and tokenizer
model_name = "valhalla/t5-small-qg-hl"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


def generate_question(context, max_length=50):
    input_text = f"generate question: {context}"
    input_ids = tokenizer.encode(
        input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(input_ids, max_new_tokens=max_length)
    question = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return question


# Generate QA pairs with batch processing and progress logging
batch_size = 100
qa_pairs = []

for i in tqdm(range(0, len(media_summaries_df), batch_size)):
    batch_df = media_summaries_df.iloc[i:i+batch_size]
    for index, row in batch_df.iterrows():
        summary = row['summary']
        sentences = summary.split('. ')
        # Limit the number of sentences processed
        for sentence in sentences[:5]:
            try:
                question = generate_question(sentence)
                qa_pairs.append({
                    'summary': summary,
                    'question': question,
                    'answer': sentence
                })
            except Exception as e:
                print(f"Error processing summary: {summary}")
                print(e)
                continue

# Ensure the save directory exists
save_dir = 'output/'
os.makedirs(save_dir, exist_ok=True)

# Save the generated QA pairs
qa_pairs_df = pd.DataFrame(qa_pairs)
qa_pairs_file_path = os.path.join(save_dir, 'improved_media_qa_pairs.csv')
qa_pairs_df.to_csv(qa_pairs_file_path, index=False)

print(f"\nGenerated QA pairs saved to {qa_pairs_file_path}")
print("\nGenerated QA pairs:")
print(qa_pairs_df.head())