### Objective

In this project, we aim to use LLM to perform topic classification

In [1]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI, AzureOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import PyMuPDFLoader
from langchain.vectorstores import FAISS
from langchain.prompts import (
    ChatPromptTemplate, 
    MessagesPlaceholder, 
    SystemMessagePromptTemplate, 
    HumanMessagePromptTemplate
)
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import ConversationChain
from langchain.chat_models import AzureChatOpenAI
from langchain.memory import ConversationBufferMemory
import openai
import os

#### 1. Create document chunks

In [29]:
issue_name = 'ABB Review_03_2023_layout complete_EN_300dpi'
article_name = 'perfect_partners'
article_range = [41, 45]
loader = PyMuPDFLoader("./papers/"+issue_name+".pdf")
raw_documents = loader.load()[article_range[0]:article_range[-1]+1]

text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=100)
documents = text_splitter.split_documents(raw_documents)

#### 2. Topic classification

In [30]:
# Available topics
topics = ['Tech and product insights', 'Market dynamics', 'Operational transformation', 
          'Sustainability initiatives', 'Customer experience', 'Industry challenges and opportunities', 
          'Strategic collaborations', 'Strategy innovation', 'General overview']

In [31]:
# Create LLM (completion)
llm = AzureOpenAI(
    deployment_name="deployment-5af509f3323342ee919481751c6f8b7d",
    model_name="text-davinci-003",
    openai_api_base="https://abb-chcrc.openai.azure.com/",
    openai_api_version="2023-03-15-preview",
    openai_api_key=os.environ["OPENAI_API_KEY"],
    openai_api_type="azure",
)

In [32]:
template = """Given the following text, output which focal points from the following list are most relevant?.

        [text]: {text} \n
        [Focal points]: {topics}
        """

prompt = PromptTemplate(
    template=template,
    input_variables=["text", "topics"],
)

In [33]:
llm_response = []
for i, doc in enumerate(documents):
    if i%2==0:
        print(f"Processing {i+1}/{len(documents)}th docs.")
    response = llm.predict(prompt.format(text=doc.page_content, topics=topics))
    llm_response.append(response)

Processing 1/9th docs.
Processing 3/9th docs.
Processing 5/9th docs.
Processing 7/9th docs.
Processing 9/9th docs.


In [34]:
llm_response

['\nTech and product insights, Operational transformation, Sustainability initiatives, Strategic collaborations, and Industry challenges and opportunities.',
 '\nTech and product insights, Sustainability initiatives, Strategic collaborations, Strategy innovation',
 '\nTech and product insights, Sustainability initiatives, Strategic collaborations.',
 '\nTech and product insights\nMarket dynamics\nStrategic collaborations\nSustainability initiatives\nIndustry challenges and opportunities\nOperational transformation',
 '\nSustainability Initiatives, Strategic Collaborations, Strategy Innovation',
 '\nSustainability initiatives, Operational transformation, Tech and product insights, and Strategy innovation.',
 '\nTech and product insights, Sustainability initiatives, Strategic collaborations, and Strategy innovation.',
 '\nTech and product insights\nOperational transformation\nSustainability initiatives\nStrategic collaborations\nStrategy innovation',
 '\nSustainability initiatives, Strat

In [35]:
# Count 
topic_classifier = {}
for topic in topics[:-1]:
    topic_classifier[topic] = 0
    for response in llm_response:
        if topic in response:
            topic_classifier[topic] += 1

# Decide relevant topics
selected_topics = dict(sorted(topic_classifier.items(), key=lambda item: item[1], reverse=True))
selected_topics

{'Sustainability initiatives': 8,
 'Tech and product insights': 7,
 'Strategic collaborations': 7,
 'Operational transformation': 5,
 'Strategy innovation': 4,
 'Industry challenges and opportunities': 2,
 'Market dynamics': 1,
 'Customer experience': 0}