# AK-Tag
Code for the retrieval of the "Stellungnahmen" (=Feedback)

This tutorial follows the documentation provided by the LangChain framework (https://python.langchain.com).

In [9]:
# install dependencies
%pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai langchain-chroma bs4 pypdf progressbar2 unstructured
%pip install -qU langchain-openai

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import getpass
import os
import dotenv

# Read variables from a local .env file into the process environment, then
# look up the OpenAI key (None when the variable is not set).
dotenv.load_dotenv()
openai_api_key = os.environ.get('openai_api_key')

import bs4
from langchain import hub
from langchain_community.document_loaders import PyPDFLoader
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import TextLoader
import os
from openai import OpenAI
import progressbar


In [8]:
def convert_pdfs_to_pages(folder_path):
    """
    Load every PDF file in a folder into its pages.

    Args:
        folder_path (str): The folder path where the PDF files are located.

    Returns:
        list: One entry per PDF file, ordered by file name. Each entry is the
            list of pages returned by ``PyPDFLoader(...).load()`` for that file.
    """
    # os.listdir() returns names in arbitrary order; sorting makes the result
    # reproducible. The extension check ignores case so "*.PDF" is picked up too.
    pdf_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )

    # One loader per file; load() yields the pages of that PDF.
    return [
        PyPDFLoader(os.path.join(folder_path, name)).load() for name in pdf_files
    ]
# Load all PDFs from the "downloaded_pdfs" folder; `pdf_list` is consumed by save_pdf_text below.
pdf_list  = convert_pdfs_to_pages("downloaded_pdfs")

In [13]:
def save_pdf_text(pdf_list):
    """
    Save the text of each PDF (multiple pages per PDF) in a separate file.

    Files are written to the ``text_per_pdf`` folder (created if missing) and
    named after the PDF file, with spaces replaced by underscores and ``.txt``
    appended (e.g. ``my file.pdf`` -> ``my_file.pdf.txt``).

    Args:
        pdf_list (list): One list of pages per PDF. Every page must expose
            ``page_content`` (str) and ``metadata['source']`` (the path the
            PDF was loaded from). PDFs without pages are skipped.

    Returns:
        None
    """
    # Create the folder if it doesn't exist
    folder_name = "text_per_pdf"
    os.makedirs(folder_name, exist_ok=True)

    for pdf in pdf_list:
        # A PDF without pages has neither a source path to name the file
        # after nor any text to save.
        if not pdf:
            continue

        # Take the file-name component of the source path. Splitting on "/"
        # and picking index 1 only works for "<folder>/<file>" and fails for
        # nested folders or Windows path separators.
        source_name = os.path.basename(pdf[0].metadata['source'])
        # Replacing spaces with underscores
        pdf_name_shortened = source_name.replace(' ', '_')

        # Pages are joined with a newline so page boundaries stay visible.
        feedback_text = "\n".join(page.page_content for page in pdf)
        with open(os.path.join(folder_name, f"{pdf_name_shortened}.txt"), "w") as f:
            f.write(feedback_text)
# Write one text file per loaded PDF into the "text_per_pdf" folder.
save_pdf_text(pdf_list)
        

In [None]:
# Chat model shared by the summarisation function and the reduce chain below.
llm = ChatOpenAI(
    model="gpt-4-turbo-2024-04-09",
    api_key=openai_api_key,
)

def summarize_with_gpt(text):
    """
    Summarise one stakeholder feedback with the shared ``llm``.

    The documents are stuffed into the ``context`` slot of the prompt and the
    model is asked to answer the analysis questions in German.

    Args:
        text: The documents to analyse, passed to the chain as ``context``
            (the caller passes the list returned by ``TextLoader.load()``).

    Returns:
        The output of the stuff-documents chain; the caller concatenates it
        with strings and writes it to a text file.
    """
    prompt = ChatPromptTemplate.from_template("""Answer the following questions in German based only on the provided context:
    <context>
    {context}
    </context>

    Questions: {input}""")

    # One instruction per line. Using backslash continuations inside a single
    # string literal would embed the source indentation in the prompt and run
    # the instructions together without any separator.
    questions = "\n".join([
        "Identify the key points raised by the stakeholder in their accompanying messages (and, if feasible, their attached documents)",
        "Summarise the stakeholder feedback in bullets, grouping similar statements and highlighting divergent opinions",
        "Cluster opinions according to positive and negative sentiment (supportive or against the proposed regulation)",
        "Identify evidence from the inputs that can reinforce or contradict the proposed rules",
    ])

    document_chain = create_stuff_documents_chain(llm, prompt)
    return document_chain.invoke({"input": questions, "context": text})


def summarize_each_feedback(folder_path):
    """
    Summarise every ``.txt`` file in a folder and store one summary per file.

    Summaries are written to the ``summarization`` folder (created if missing)
    under the same file name as the input. Files that already have a summary
    are skipped, so an interrupted run can be resumed without paying for the
    same model calls again.

    Args:
        folder_path (str): Folder containing the ``.txt`` files to summarise.

    Returns:
        None
    """
    # Create the folder if it doesn't exist
    folder_name = "summarization"
    os.makedirs(folder_name, exist_ok=True)

    # Sorted so the processing order is reproducible (os.listdir order is arbitrary).
    text_files = sorted(file for file in os.listdir(folder_path) if file.endswith(".txt"))
    print(text_files)
    for text_file in text_files:
        # Check if the summary file already exists
        summary_path = os.path.join(folder_name, text_file)
        if os.path.exists(summary_path):
            print(f"Skipping existing summary for {text_file}")
            # Without this the file would be summarised again and overwritten.
            continue

        text_path = os.path.join(folder_path, text_file)
        text = TextLoader(text_path).load()
        summarized_text = summarize_with_gpt(text)

        # Strip up to two trailing dot-separated parts (e.g. ".pdf.txt") and
        # then everything after the last underscore.
        # NOTE(review): presumably the last "_" part is an id suffix of the
        # downloaded file names - confirm against the download step.
        file_name_without_extension = text_file.rsplit('.', 2)[0].rsplit('_', 1)[0]

        with open(summary_path, "w") as f:
            f.write("Feedback from: " + file_name_without_extension + "\n" + "\n" + summarized_text)
          
# Summarise the text files in "text_per_pdf"; results are written to the "summarization" folder.
summarize_each_feedback("text_per_pdf")

In [11]:
# Run chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chains.llm import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain
from langchain_community.document_loaders import DirectoryLoader

# Reduce
# Prompt for the final step: all per-stakeholder summaries are inserted at
# {docs} and condensed into one consolidated summary.
reduce_template = """The following is set of summaries:
{docs}
Take these and distill it into a final, consolidated summary.
Identify the key points raised by the stakeholders in their accompanying messages (and, if feasible, their attached documents)
Summarise the stakeholder feedback in bullets, grouping similar statements and highlighting divergent opinions
Cluster opinions according to positive and negative sentiment (supportive or against the proposed regulation)
Identify evidence from the inputs that can reinforce or contradict the proposed rules
Each stakeholder feedback starts with the line \"Feedback from: \". Underline the key aspects and provide examples from the provided summaries. Also, include the name of the stakeholder. List the names of very positive and very negative stakeholders.
Helpful Answer:"""

reduce_prompt = PromptTemplate.from_template(reduce_template)

reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Takes a list of documents, combines them into a single string, and passes this to an LLMChain
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain, document_variable_name="docs"
)

# gpt-4-turbo's 128k-token context window is shared by the prompt and the
# completion. Grouping documents up to the full window would leave no room
# for the answer, so keep some headroom below the window size.
REDUCE_TOKEN_MAX = 120_000

# Combines and iteratively reduces the mapped documents
reduce_documents_chain = ReduceDocumentsChain(
    # This is final chain that is called.
    combine_documents_chain=combine_documents_chain,
    # If documents exceed context for `StuffDocumentsChain`
    collapse_documents_chain=combine_documents_chain,
    # The maximum number of tokens to group documents into.
    token_max=REDUCE_TOKEN_MAX,
)
# Load every per-stakeholder summary and reduce them into one meta summary.
loader = DirectoryLoader('./summarization/', glob="*.txt")
docs = loader.load()
meta_summarization = reduce_documents_chain.run(docs)

# Write as UTF-8 explicitly: the model output may contain characters that the
# platform's default encoding cannot represent, which would make write() fail.
file_name = "./meta-summarization.txt"
with open(file_name, 'w', encoding="utf-8") as file:
    file.write(meta_summarization)


## Option: Batch upload

In [52]:
import json

# System prompt for the batch request. The request below enables JSON mode
# (response_format "json_object"); the API rejects such requests unless a
# message explicitly asks for JSON, hence the final instruction.
categorize_system_prompt = '''
Identify the key points raised by the stakeholders in their accompanying messages (and, if feasible, their attached documents)
Summarise the stakeholder feedback in bullets, grouping similar statements and highlighting divergent opinions
Cluster opinions according to positive and negative sentiment (supportive or against the proposed regulation)
Identify evidence from the inputs that can reinforce or contradict the proposed rules

Each stakeholder feedback starts with the line "Feedback from: ".

Return the result as a JSON object.
'''

# `all_text` is not defined in any visible cell of this notebook, so on a fresh
# kernel this cell would fail with a NameError. If it is not already defined,
# build it from the per-stakeholder summaries.
# NOTE(review): presumably `all_text` is the concatenation of the files in
# "summarization" (they start with "Feedback from: ", as the prompt expects) -
# confirm against the cell that originally defined it.
if "all_text" not in globals():
    summary_dir = "summarization"
    summary_texts = []
    for summary_name in sorted(os.listdir(summary_dir)):
        if summary_name.endswith(".txt"):
            with open(os.path.join(summary_dir, summary_name)) as summary_file:
                summary_texts.append(summary_file.read())
    all_text = "\n\n".join(summary_texts)

# One request in the JSON Lines format of the OpenAI batch API.
task = {
    "custom_id": "task-generate-summary",
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "gpt-4-turbo-2024-04-09",
        "temperature": 0.1,
        "response_format": {
            "type": "json_object"
        },
        "messages": [
            {
                "role": "system",
                "content": categorize_system_prompt
            },
            {
                "role": "user",
                "content": all_text
            }
        ],
    }
}

# json.dumps escapes non-ASCII characters by default, so the file content is
# plain ASCII regardless of the platform's default encoding.
file_name = "./summarization-batch-job.jsonl"
with open(file_name, 'w') as file:
    file.write(json.dumps(task) + '\n')