### Objective

In this project, we aim to use an LLM to perform topic classification.

In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI, AzureOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyMuPDFLoader
from langchain.vectorstores import FAISS
from langchain.prompts import (
    ChatPromptTemplate, 
    MessagesPlaceholder, 
    SystemMessagePromptTemplate, 
    HumanMessagePromptTemplate
)
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import ConversationChain
from langchain.chat_models import AzureChatOpenAI
from langchain.memory import ConversationBufferMemory
import openai
import os

#### 1. Create document chunks

In [74]:
# Select the article to analyse: the PDF issue, a label for the article, and the
# [first, last] indices (inclusive) into the list returned by loader.load().
issue_name = 'ABB Review_02_2023_layout complete_EN_72-300dpi'
article_name = 'clean machines'
article_range = [53, 56]

# Load the whole issue, then keep only the entries that belong to the article.
pdf_path = f"./papers/{issue_name}.pdf"
loader = PyMuPDFLoader(pdf_path)
article_pages = slice(article_range[0], article_range[-1] + 1)
raw_documents = loader.load()[article_pages]

##### Remove the reference list from the last page

In [75]:
def strip_references(page_text, heading="References", terminator="\n—\n"):
    """Return `page_text` with its reference list removed.

    The reference list is taken to start at the first occurrence of `heading`
    and to end at the first `terminator` that follows it; the text before the
    heading and the text after the terminator are joined.

    Fallbacks (the original code raised on both):
    - `heading` not found: the text is returned unchanged, which also makes
      re-running the cell on an already-cleaned page safe.
    - `heading` found but no `terminator` after it: the reference list is
      assumed to run to the end of the page, so only the text before the
      heading is kept.
    """
    before_references, heading_found, references_and_after = page_text.partition(heading)
    if not heading_found:
        return page_text

    _, terminator_found, after_references = references_and_after.partition(terminator)
    if not terminator_found:
        return before_references
    return before_references + after_references


# Only the last page of the article is cleaned; skip when no pages were loaded.
if raw_documents:
    text = raw_documents[-1].page_content
    cleaned_text = strip_references(text)

    # Update the raw documents
    raw_documents[-1].page_content = cleaned_text

In [6]:
# Break the article pages into chunks of at most 3000 characters, with
# 100 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=3000,
)
documents = text_splitter.split_documents(raw_documents)

#### 2. Topic classification

In [7]:
# Candidate topics for classification. The final entry ('General overview')
# is left out of the prompts and the vote parsing, which use topics[:-1].
topics = [
    'Tech and product insights',
    'Market dynamics',
    'Operational transformation',
    'Sustainability initiatives',
    'Customer experience',
    'Industry challenges and opportunities',
    'Strategic collaborations',
    'Strategy innovation',
    'General overview',
]

In [8]:
# Completion-style LLM served through Azure OpenAI.
# The API key is read from the OPENAI_API_KEY_AZURE environment variable;
# os.environ[...] raises KeyError when that variable is not set.
azure_llm_settings = {
    "deployment_name": "deployment-5af509f3323342ee919481751c6f8b7d",
    "model_name": "text-davinci-003",
    "openai_api_base": "https://abb-chcrc.openai.azure.com/",
    "openai_api_version": "2023-03-15-preview",
    "openai_api_key": os.environ["OPENAI_API_KEY_AZURE"],
    "openai_api_type": "azure",
}
llm = AzureOpenAI(**azure_llm_settings)

In [26]:
# Classification prompt: asks the model to pick the relevant focal points for one
# piece of text and to justify each pick on its own line as "topic: reason".
# {text} and {topics} are filled in per chunk by prompt.format(...).
template = """Given the following text, identify which focal points from the following list are most relevant and 
provide a reason for each selection in the format of "topic: reason".

        [text]: {text} \n
        [Focal points]: {topics}
        """

# Wrap the raw template so both placeholders are declared explicitly.
prompt = PromptTemplate(
    template=template,
    input_variables=["text", "topics"],
)

In [27]:
# Ask the LLM to classify every chunk; one raw text answer is stored per chunk.
candidate_topics = topics[:-1]  # every topic except the last entry
total_docs = len(documents)

llm_response = []
for position, doc in enumerate(documents, start=1):
    # Report progress on every other chunk (1st, 3rd, 5th, ...).
    if position % 2 == 1:
        print(f"Processing {position}/{total_docs}th docs.")
    filled_prompt = prompt.format(text=doc.page_content, topics=candidate_topics)
    llm_response.append(llm.predict(filled_prompt))

Processing 1/6th docs.
Processing 3/6th docs.
Processing 5/6th docs.


In [41]:
# Inspect how the first response mentions one topic: every line containing the
# topic is split on ':' so the "topic: reason" pieces can be eyeballed.
# (The original loop discarded the result of item.split(':'), so nothing was shown.)
topic = 'Sustainability initiatives'
matching_lines = [
    item.split(':')
    for item in llm_response[0].split('\n')
    if topic in item
]
matching_lines

OK


In [47]:
# Preview the reason text on the fourth line (index 3) of the first response:
# the part after the last ':' with surrounding whitespace removed.
first_response_lines = llm_response[0].split('\n')
fourth_line = first_response_lines[3]
fourth_line.split(':')[-1].strip()

'The article is focused on the sustainability of EV batteries and the potential for reducing emissions.'

In [50]:
def tally_topic_votes(responses, all_topics, votable_topics):
    """Count the response lines that mention each topic and collect their reasons.

    Args:
        responses: raw LLM answers, one string per document chunk.
        all_topics: every topic that gets an entry in the result.
        votable_topics: the topics that are searched for in the answers;
            each must also appear in `all_topics`.

    Returns:
        A dict mapping each topic in `all_topics` to
        {'vote': <number of matching lines>, 'reason': [<reason text>, ...]}.
        Topics that are never matched keep a vote of 0 and an empty list.
    """
    tally = {name: {'vote': 0, 'reason': []} for name in all_topics}
    for name in votable_topics:
        for reply in responses:
            for line in reply.split('\n'):
                if name not in line:
                    continue
                tally[name]['vote'] += 1
                # Keep everything after the FIRST colon so that a reason which
                # itself contains ':' is not cut short; a line without any
                # colon is stored whole.
                _, colon, reason = line.partition(':')
                tally[name]['reason'].append((reason if colon else line).strip())
    return tally


# Parse LLM output (the last topic is never voted on, matching the prompt).
topic_classifier = tally_topic_votes(llm_response, topics, topics[:-1])

In [54]:
def _votes(entry):
    """Sort key: the vote count stored in a (topic, info) pair."""
    _, info = entry
    return info['vote']


# Rank the topics from most to fewest votes (ties keep their original order).
ranked_entries = sorted(topic_classifier.items(), key=_votes, reverse=True)
selected_topics = dict(ranked_entries)
selected_topics

{'Tech and product insights': {'vote': 5,
  'reason': ['The article is focused on carbon emissions from EV batteries and the potential for reducing them.',
   'Reason - Text includes information on ICE fuel and exhaust emissions compared with BEV electricity use and emissions from battery manufacture.',
   'reason - The text contains details about the primary contribution to emissions in both ICE and BEV vehicles during the use phase, as well as the energy consumption and efficiency of BEV charging.',
   'To understand the most efficient ICE and BEV models in the EU, NED, GER, FR, US, NOR, CN, and UK, and the associated CO2 emissions, battery production, car production, electric energy chain, well-to-tank, and direct emission.',
   'Lower BEV emissions come from declines in grid emissions, mostly brought about by decarbonization.']},
 'Sustainability initiatives': {'vote': 5,
  'reason': ['The article is focused on the sustainability of EV batteries and the potential for reducing emiss

In [64]:
# Inspect the vote count and the collected reasons for a single topic.
selected_topics['Sustainability initiatives']

{'vote': 5,
 'reason': ['The article is focused on the sustainability of EV batteries and the potential for reducing emissions.',
  'Reason - Text includes information on BEV batteries, which have a carbon footprint.',
  'reason - The text describes the potential for emissions reduction from BEV vehicles, as well as other considerations such as emissions from idling ICEs, BEV regenerative braking, and lower maintenance.',
  'To compare the CO2 emissions in each country and identify strategies for reducing emissions.',
  'BEVs vs. ICE, overall CO₂ lifetime emission equivalent, projected progress.']}

In [68]:
# Summarisation prompt: condenses the per-chunk reasons collected for one theme
# into a single explanation of why the whole article relates to that theme.
template = """Given the reasons for why individual sections of an article are relevant to the topic of {theme}, 
summarize concisely the reason why the entire article is relevant to the topic of {theme}.

        [theme]: {theme} \n
        [reasons]: {reasons}
        """

prompt = PromptTemplate(input_variables=["theme", "reasons"], template=template)

# Fill the prompt with the reasons gathered for one topic and query the LLM.
sustainability_reasons = selected_topics['Sustainability initiatives']['reason']
filled_prompt = prompt.format(
    theme='Sustainability initiatives',
    reasons=sustainability_reasons,
)
response = llm.predict(filled_prompt)
print(response)


The article is relevant to the topic of Sustainability initiatives because it provides an in-depth analysis of the sustainability of EV batteries, compares CO2 emissions in each country, and outlines strategies for reducing emissions. It also covers other considerations such as emissions from idling ICEs, BEV regenerative braking, and lower maintenance.


In [None]:
# Count, for every candidate topic, how many responses mention it at least once.
# NOTE: this rebinds `topic_classifier` and `selected_topics` to plain
# {topic: count} mappings, replacing the {'vote', 'reason'} structure built earlier.
topic_classifier = {
    name: sum(name in reply for reply in llm_response)
    for name in topics[:-1]
}

# Decide relevant topics: most-mentioned first.
selected_topics = dict(sorted(topic_classifier.items(), key=lambda item: item[1], reverse=True))

# Interviewer ("journalist") prompt. It is kept as a template rather than an
# f-string: `theme`, `audience`, `summarized_reasons` and `summary` are not defined
# at this point, so evaluating an f-string here raised NameError. Fill it later
# with prompt.format(theme=..., audience=..., summarized_reasons=..., summary=...).
journalist_template = """You are a journalist examining ABB's developments related to "{theme}" for {audience}. Specifically, your line of questioning should revolve around: "{summarized_reasons}".

Your mission is to interview the article's author, represented by another chatbot, extracting key insights and addressing the provided focal points. Adjust your questions based on the focal points and the author bot's feedback. Your inquiries should align closely with the focal points and reasons provided.

Guidelines:
- **Stay in Role**: Your role as a journalist is to unearth valuable details.
- **Adherence to Focal Points**: Ensure your questions resonate with the theme and summarized reasons.
- **Question Quality**: Ask clear, concise questions that stem from the article's content.

[Summary]: {summary}
"""

prompt = PromptTemplate(
    template=journalist_template,
    input_variables=["theme", "audience", "summarized_reasons", "summary"],
)



In [69]:
# Scratch data: build one "topic: reason" line per (topic, reason) pair.
test_topics = ['A', 'B']
test_reasons = ['AA', 'BB']

test_prompt = ''.join(
    f"{name}: {why} \n"
    for name, why in zip(test_topics, test_reasons)
)

In [72]:
# Scratch check: embed the pre-built "topic: reason" lines in a journalist-style
# prompt to see how the final text is laid out.
test = (
    "You are a journalist examining ABB's developments related to  for.\n"
    "\n"
    "[Themes and  relevancy]: \n"
    f"{test_prompt}\n"
)

In [73]:
# Show the assembled test prompt to check its layout.
print(test)

You are a journalist examining ABB's developments related to  for.

[Themes and  relevancy]: 
A: AA 
B: BB 


