In [53]:
from google.colab import drive

# Mount Google Drive at /content/drive. The requirements file, the input CSV
# and the generated output CSV used in this notebook all live under this mount.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
# Install the project's stage-3 requirements (pinned versions) from Drive.
!pip install -r /content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/stage3-requirements.txt
# NOTE(review): flash-attn and accelerate are installed without a version pin,
# unlike langdetect below -- consider pinning them for reproducibility.
!pip install flash-attn
!pip install accelerate
# Disabled: bitsandbytes install from the default PyPI index.
# !pip install -i https://pypi.org/simple/ bitsandbytes
!pip install langdetect==1.0.9

Collecting absl-py==2.1.0 (from -r /content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/stage3-requirements.txt (line 1))
  Using cached absl_py-2.1.0-py3-none-any.whl (133 kB)
Collecting asttokens==2.4.1 (from -r /content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/stage3-requirements.txt (line 3))
  Using cached asttokens-2.4.1-py2.py3-none-any.whl (27 kB)
Collecting breadability==0.1.20 (from -r /content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/stage3-requirements.txt (line 5))
  Using cached breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting colorama==0.4.6 (from -r /content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/stage3-requirements.txt (line 10))
  Using cached colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting comm==0.2.2 (from -r /content/drive/MyDriv

### QA Pair Generation


In [55]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


# Choose a device index: 0 selects the first CUDA GPU, -1 stands for the CPU.
# Start from the CPU default and upgrade when a GPU is detected.
device = -1
if torch.cuda.is_available():
    device = 0
    print("CUDA is available. Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Using CPU.")

CUDA is available. Using GPU: NVIDIA L4


In [56]:
# Load the summarized data
# NOTE(review): absolute Google Drive path; it only resolves after the Drive
# mount cell has run.
input_file = "/content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_summaries.csv"
data_chunk = pd.read_csv(input_file)

# Display the first few rows to confirm the data structure.
# Only the "summary" column is shown: it is the column the question
# generation step reads from this frame.
print("\nLoaded data with summaries:")
data_chunk[["summary"]].head()


Loaded data with summaries:


Unnamed: 0,summary
0,Disclosed is an adaptable DC-AC inverter syste...
1,A solar energy system comprises: a solar energ...
2,A control method for optimizing a solar-to-pow...
3,"The system includes a first hydropower system,..."
4,A system and method for driving geese away fro...


In [57]:
# (rows, columns) of the loaded summaries frame.
data_chunk.shape

(4301, 13)

In [58]:
# output_file = "/content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_qa_pairs.csv"

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from accelerate import Accelerator
import torch
import os

# Initialize the model, tokenizer, and accelerator
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The Accelerator is created with fp16 mixed precision; in the code shown here
# it is only consulted for `accelerator.device`.
accelerator = Accelerator(mixed_precision="fp16")
# Weights are loaded directly in float16.
# NOTE(review): trust_remote_code=True runs Python code shipped with the
# checkpoint repository -- confirm the source is trusted (or pin a revision).
model = AutoModelForCausalLM.from_pretrained(
    model_name, trust_remote_code=True, torch_dtype=torch.float16
)

# Move model to accelerator
model = model.to(accelerator.device)


def generate_questions(contexts, max_length=70):
    """Generate one question per context using the notebook-level model.

    Relies on the module-level ``tokenizer``, ``model`` and ``accelerator``.

    Args:
        contexts: Iterable of context strings (the patent summaries).
        max_length: Maximum number of *new* tokens generated per prompt. It is
            forwarded as ``max_new_tokens``; the parameter name is kept for
            backward compatibility.

    Returns:
        list[str]: One decoded string per context, in input order. Each string
        is the full decoded sequence, i.e. the prompt followed by the model's
        continuation, so callers still have to extract the question from it.
    """
    contexts = list(contexts)
    if not contexts:
        # Nothing to generate; avoid calling the tokenizer with an empty batch.
        return []

    input_texts = [
        f"\n\n{context}\n-------------------\n Generate only one question based on the above context and Just return the Question, nothing else"
        for context in contexts
    ]

    # Keep the whole encoding rather than only `input_ids`: prompts in a batch
    # are padded to a common length, and generate() needs the attention mask
    # to ignore the padding positions.
    # NOTE(review): with truncation at 512 tokens a very long context
    # presumably loses the trailing instruction (truncation side is not set
    # here) -- confirm against the tokenizer configuration.
    encoded = tokenizer(
        input_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(accelerator.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_length,
        )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)


def process_data_in_parts(data, start_index=0, chunk_size=50, batch_size=5):
    """Generate QA pairs for one chunk of rows, a small batch at a time.

    Args:
        data: DataFrame with a "summary" column.
        start_index: Positional index of the first row to process.
        chunk_size: Maximum number of rows handled by this call.
        batch_size: Number of summaries sent to ``generate_questions`` per
            call. Defaults to 5, the value that used to be hard-coded.

    Returns:
        tuple[list[dict], int]: The QA records for the processed rows (keys
        "summary", "question", "answer"; the summary doubles as the answer)
        and the positional index at which the next chunk should start.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if the generator
            returns a different number of questions than summaries.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    total_rows = len(data)
    end_index = min(start_index + chunk_size, total_rows)
    qa_pairs = []

    for batch_start in range(start_index, end_index, batch_size):
        batch_end = min(batch_start + batch_size, end_index)
        summaries = data.iloc[batch_start:batch_end]["summary"].tolist()
        questions = generate_questions(summaries)
        if len(questions) != len(summaries):
            # zip() would silently drop rows here, and the caller resumes by
            # counting saved rows, so a short batch must not go unnoticed.
            raise ValueError(
                f"Expected {len(summaries)} questions, got {len(questions)}"
            )
        qa_pairs.extend(
            {"summary": summary, "question": question, "answer": summary}
            for summary, question in zip(summaries, questions)
        )

    return qa_pairs, end_index


# Resume support: rows already written to the output CSV are counted and
# generation continues right after them; otherwise start from an empty frame.
output_file = "/content/drive/MyDrive/1-Working/1-HSLU-MscIDS/5-Projects/4-Semester/CLT/Samuel-CLT-Development/patent_qa_pairs.csv"
if not os.path.exists(output_file):
    processed_df = pd.DataFrame(columns=["summary", "question", "answer"])
    start_index = 0
else:
    processed_df = pd.read_csv(output_file)
    start_index = len(processed_df)

# Work through the remaining rows chunk by chunk. After every chunk the
# accumulated results are written back to disk so an interrupted run can
# pick up where it stopped.
chunk_size = 50  # rows handled per checkpoint
while start_index < len(data_chunk):
    qa_pairs, end_index = process_data_in_parts(data_chunk, start_index, chunk_size)
    qa_pairs_df = pd.DataFrame(qa_pairs)
    processed_df = pd.concat([processed_df, qa_pairs_df], ignore_index=True)
    processed_df.to_csv(output_file, index=False)
    torch.cuda.empty_cache()  # release cached GPU memory between chunks
    start_index = end_index

print(
    f"\nGenerated QA pairs saved to {output_file}",
    "\nSample Generated QA pairs:",
    processed_df.head(),
    sep="\n",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Inspect the raw model output for the first row of qa_pairs_df. The
# generation loop rebinds qa_pairs_df on every iteration, so at this point it
# holds only the most recently processed chunk.
print(qa_pairs_df["question"][0])

### EXTRACT QUESTION


In [48]:
import pandas as pd


# Load the generated QA pairs from the CSV file
# NOTE(review): this is a relative path, while the generation cell writes to
# an absolute Google Drive path -- presumably the file was copied; confirm.
qa_pairs_file_path = "datasets/patent_qa_pairs.csv"
qa_pairs_df = pd.read_csv(qa_pairs_file_path)

# Preview the raw "question" column as stored by the generation step.
qa_pairs_df['question'].head()

0    \n\nDisclosed is an adaptable DC-AC inverter s...
1    \n\nA solar energy system comprises: a solar e...
2    \n\nA control method for optimizing a solar-to...
3    \n\nThe system includes a first hydropower sys...
4    \n\nA system and method for driving geese away...
Name: question, dtype: object

In [49]:
# Print one stored "question" value in full. As the output shows, it holds the
# prompt (context + instruction) followed by the model's continuation, which
# is why the actual question has to be extracted in a separate step.
print(qa_pairs_df["question"][0])



Disclosed is an adaptable DC-AC inverter system and its operation. System is suited for solar energy harvesting in grid-connected or off-grid modes of operation.
-------------------
 Generate only one question based on the above context and Just return the Question, nothing else.
 - Here is an example.  Question: What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?

- Response: What are the benefits of implementing an adaptive-controlled AC inverter system in solar energy harvesting for both grid-connected and off-grid


In [70]:
import pandas as pd
import re

# Function to extract the question from the response using multiple strategies


# Leading words that mark the start of a question. Longer variants such as
# "What are", "How does" or "Explain the impact" are already covered by their
# first word, so only the distinct prefixes are listed.
_QUESTION_STARTERS = (
    "How",
    "What",
    "Why",
    "Explain",
    "Describe",
    "Which",
    "In what",
    "To what",
    "In which",
)
_STARTER_ALTERNATION = "|".join(_QUESTION_STARTERS)

# Strategy 1: a starter that directly follows ": " (e.g. "Question: What ...").
# Case-sensitive; the trailing \b stops "How" from matching ": However".
_DELIMITED_QUESTION_RE = re.compile(rf": ({_STARTER_ALTERNATION})\b")

# Strategy 2: any starter, case-insensitive, up to the first "?" on the same
# line. The word boundaries stop "how" from matching inside "show".
_LOOSE_QUESTION_RE = re.compile(
    rf"\b(?:{_STARTER_ALTERNATION})\b.*?\?",
    re.IGNORECASE,
)


def extract_question(response):
    """Extract the first question from a raw model response.

    Two strategies are tried in order:

    1. Look for a question starter right after ``": "`` (as in
       ``"Question: What ..."``). The earliest such occurrence in the text
       wins. The result runs up to and including the first ``?``; if there is
       no ``?`` the rest of the text is returned.
    2. Otherwise search for any question starter (case-insensitive) followed
       by a ``?`` on the same line.

    Args:
        response: Raw response text. Non-string values (``None``, NaN from a
            CSV round-trip, ...) are treated as "no question".

    Returns:
        str | None: The extracted question with surrounding whitespace
        removed, or ``None`` when no question pattern is found.
    """
    if not isinstance(response, str):
        return None

    delimited = _DELIMITED_QUESTION_RE.search(response)
    if delimited:
        tail = response[delimited.start(1):]
        head, mark, _ = tail.partition("?")
        if mark:
            return head.strip() + "?"
        return tail.strip()

    loose = _LOOSE_QUESTION_RE.search(response)
    if loose:
        return loose.group(0).strip()

    # If no specific pattern is found, return None
    return None


# Apply the extraction function to every raw response; the result is stored
# in a new "question_extract" column next to the original "question" column.
qa_pairs_df["question_extract"] = qa_pairs_df["question"].apply(extract_question)

# Replace None (returned when no question pattern was found) with pd.NA so
# the misses are handled as missing values by isnull()/dropna() later on.
qa_pairs_df["question_extract"] = qa_pairs_df["question_extract"].replace(
    [None], [pd.NA]
)

# Save the DataFrame with the new column to a new CSV file. The path is
# relative, so the file is written to the current working directory.
output_file_path = "patent_qa_pairs_with_extract.csv"
qa_pairs_df.to_csv(output_file_path, index=False)

# Display the first few rows of the DataFrame
print(qa_pairs_df[["question", "question_extract"]].head())
print(f"DataFrame saved to {output_file_path}")

                                            question  \
0  \n\nDisclosed is an adaptable DC-AC inverter s...   
1  \n\nA solar energy system comprises: a solar e...   
2  \n\nA control method for optimizing a solar-to...   
3  \n\nThe system includes a first hydropower sys...   
4  \n\nA system and method for driving geese away...   

                                    question_extract  
0  What are the advantages of the proposed adapti...  
1  What components are included in a typical sola...  
2  What control method is proposed for optimizing...  
3  How are the waterwheels and hoist devices in t...  
4  What is the principle behind the system that u...  
DataFrame saved to patent_qa_pairs_with_extract.csv


In [71]:
# Number of rows in the extracted-question column.
print(qa_pairs_df["question_extract"].shape)

# Number of rows for which no question could be extracted.
print(qa_pairs_df[["question_extract"]].isnull().sum())

# First rows of the extracted questions.
print(qa_pairs_df[["question_extract"]].head())

# Last rows, shown as the cell's rich output.
qa_pairs_df[["question_extract"]].tail()


(4301,)
question_extract    384
dtype: int64
                                    question_extract
0  What are the advantages of the proposed adapti...
1  What components are included in a typical sola...
2  What control method is proposed for optimizing...
3  How are the waterwheels and hoist devices in t...
4  What is the principle behind the system that u...


Unnamed: 0,question_extract
4296,What components and functionalities are integr...
4297,How does the adjustable solar panel orientatio...
4298,How does the disclosed utility model utilize a...
4299,How does the solar energy electroplax integrat...
4300,How does the wind power supply control method ...


In [75]:
# Spot-check two extracted questions in full (index labels 0 and 4299 of the
# frame's integer index).
print(qa_pairs_df["question_extract"][0])
print(qa_pairs_df["question_extract"][4299])


What are the advantages of the proposed adaptive-controlled AC inverter system for solar energy harvesting?
How does the solar energy electroplax integrated into the unmanned aerial vehicle hangar facilitate the charging process for the unmanned air vehicle?


In [84]:
# Build the final QA table in one chain: discard rows without an extracted
# question, keep only the two columns that are needed, and rename them so that
# the summary becomes the answer and the extracted text becomes the question.
# NOTE: qa_pairs_df is rebound here, so this cell cannot be re-run without
# first re-running the extraction cell.
qa_pairs_df = (
    qa_pairs_df
    .dropna(subset=["question_extract"])
    [["summary", "question_extract"]]
    .rename(columns={"summary": "answer", "question_extract": "question"})
)

# Persist the cleaned pairs (relative path, no index column).
qa_pairs_df.to_csv("patent_qa_pairs_clean.csv", index=False)

In [85]:
# Sanity check on the cleaned frame: missing values per column, then its
# (rows, columns) shape.
print(qa_pairs_df.isnull().sum())
print(qa_pairs_df.shape)


answer      0
question    0
dtype: int64
(3917, 2)


In [86]:
# Preview the final answer/question pairs.
qa_pairs_df.head()

Unnamed: 0,answer,question
0,Disclosed is an adaptable DC-AC inverter syste...,What are the advantages of the proposed adapti...
1,A solar energy system comprises: a solar energ...,What components are included in a typical sola...
2,A control method for optimizing a solar-to-pow...,What control method is proposed for optimizing...
3,"The system includes a first hydropower system,...",How are the waterwheels and hoist devices in t...
4,A system and method for driving geese away fro...,What is the principle behind the system that u...
