# LangChain

In [1]:
# Check which LangChain version is installed (the run recorded in this notebook shows 0.0.274).
! pip list | grep langchain

langchain                     0.0.274


In [2]:
# Chat-model wrapper plus the message classes used to talk to it.
from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Chat model used by the translation example that follows.
chat = ChatOpenAI(model="gpt-3.5-turbo")

In [8]:
# Send a single human message to the model and display only the text of the reply.
translation_request = HumanMessage(
    content="Translate this sentence from English to French: I love programming."
)
response = chat([translation_request])
response.content

"J'adore programmer."

In [11]:
# Show the reply message in its dict form (type, id path and constructor kwargs).
response.to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'messages', 'AIMessage'],
 'kwargs': {'content': "J'adore programmer.", 'additional_kwargs': {}}}

# Memory

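Memory lets the AI keep track of the context of a conversation. Depending on the memory type, the history is kept verbatim (buffer), limited to the most recent exchanges (window buffer), or condensed into a summary.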

In [15]:
from langchain.memory import ConversationBufferMemory

# Record one human/AI exchange directly on the memory's chat history.
memory = ConversationBufferMemory()
history = memory.chat_memory
history.add_user_message("Hi! Nice to meet you, I'm Shem and I am a programer!")
history.add_ai_message("Oh! Nice to meet you too, I'm Audrey and I also a programer!")

In [16]:
memory.load_memory_variables({}) # With buffer memory, the complete conversation is returned verbatim.

{'history': "Human: Hi! Nice to meet you, I'm Shem and I am a programer!\nAI: Oh! Nice to meet you too, I'm Audrey and I also a programer!"}

In [20]:
from langchain.memory import ConversationBufferWindowMemory

# With k=1, loading the memory returns only the most recent human/AI exchange.
memory = ConversationBufferWindowMemory(k=1)
exchanges = [
    (
        "Hi! Nice to meet you, I'm Shem and I am a programer!",
        "Oh! Nice to meet you too, I'm Audrey and I also a programer!",
    ),
    (
        "Hi! Nice to meet you, I'm Jack and I am a house keeper!",
        "Oh! Nice to meet you too, I'm Black and I also a house keeper!",
    ),
]
# Add the turns in conversation order: human first, then the AI reply.
for human_text, ai_text in exchanges:
    memory.chat_memory.add_user_message(human_text)
    memory.chat_memory.add_ai_message(ai_text)
memory.load_memory_variables({})

{'history': "Human: Hi! Nice to meet you, I'm Jack and I am a house keeper!\nAI: Oh! Nice to meet you too, I'm Black and I also a house keeper!"}

In [32]:
from langchain.llms import OpenAI
from langchain.memory import ConversationSummaryMemory

# LLM handed to the summary memory.
llm = OpenAI(temperature=0)

# The memory exposes its contents under "chat_history" and returns them as message objects.
memory = ConversationSummaryMemory(
    llm=llm,
    memory_key="chat_history",
    return_messages=True,
)

# Store one exchange in the memory.
memory.save_context(
    inputs={"User": "Hi! Nice to meet you, I'm Shem and I am a programer!"},
    outputs={"AI": "Oh! Nice to meet you too, I'm Audrey and I also a programer!"},
)

In [33]:
# Show what the summary memory currently holds under its "chat_history" key.
memory.load_memory_variables({})

{'chat_history': [SystemMessage(content='\nThe human introduces themselves as Shem, a programmer, and the AI introduces themselves as Audrey, also a programmer.', additional_kwargs={})]}

## Implement chat with memory

Using the SummaryMemory we just created, we can ask the AI for the name we added to the memory.

In [34]:
from langchain.chains import LLMChain
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
)

# Chat model that answers the question.
llm = ChatOpenAI()

# Prompt layout: a fixed system instruction, then the memory contents, then the user's question.
system_turn = SystemMessagePromptTemplate.from_template(
    "You are a nice chatbot having a conversation with a human."
)
# The placeholder's `variable_name` must match the memory's `memory_key` ("chat_history").
history_slot = MessagesPlaceholder(variable_name="chat_history")
question_turn = HumanMessagePromptTemplate.from_template("{question}")
prompt = ChatPromptTemplate(messages=[system_turn, history_slot, question_turn])

# `memory` is the summary memory created earlier with `return_messages=True`,
# so its contents fit into the MessagesPlaceholder.
conversation = LLMChain(llm=llm, prompt=prompt, verbose=True, memory=memory)

# Only `question` is passed in; `chat_history` is populated from the memory.
conversation({"question": "Do you remember whats my name?"})



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: You are a nice chatbot having a conversation with a human.
System: 
The human introduces themselves as Shem, a programmer, and the AI introduces themselves as Audrey, also a programmer.
Human: Do you remember whats my name?[0m

[1m> Finished chain.[0m


{'question': 'Do you remember whats my name?',
 'chat_history': [SystemMessage(content='\nThe human introduces themselves as Shem, a programmer, and the AI introduces themselves as Audrey, also a programmer.', additional_kwargs={})],
 'text': "Yes, your name is Shem. It's nice to meet you, Shem! How can I assist you today?"}

# Retrieval-Augmented Generation (RAG)

## Dataset

In [18]:
! curl -L -o ../data/TSMC2023Q1.zip "https://drive.google.com/uc?export=download&id=1Z_ww5ZASdIq0uZrg8jt_d7E_PzFQdYH-"


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:05 --:--:--     0 0    0     0      0      0 --:--:--  0:00:04 --:--:--     0
100 10.1M  100 10.1M    0     0  1474k      0  0:00:07  0:00:07 --:--:-- 10.8M


In [23]:
! cd ../data && unzip TSMC2023Q1.zip

Archive:  TSMC2023Q1.zip
  inflating: TSMC2023Q1/20230330 TSMC Citi.pdf  
  inflating: TSMC2023Q1/20230331 TSMC UBS.pdf  
  inflating: TSMC2023Q1/20230406 TSMC MS.pdf  
  inflating: TSMC2023Q1/20230410 TSMC DW.pdf  
  inflating: TSMC2023Q1/20230410 TSMC HTI.pdf  
  inflating: TSMC2023Q1/20230410 TSMC JPM.pdf  
  inflating: TSMC2023Q1/20230411 TSMC GS.pdf  
  inflating: TSMC2023Q1/20230412 TSMC CL.pdf  
  inflating: TSMC2023Q1/20230414 TSMC NMR.pdf  
  inflating: TSMC2023Q1/20230417 TSMC HSBC.pdf  


In [3]:
# Check which pypdf version is installed (the run recorded in this notebook shows 3.15.4).
! pip list | grep pypdf

pypdf                         3.15.4


In [7]:
from langchain.document_loaders import PyPDFLoader

# Point the loader at one report from the extracted archive.
pdf_path = '../data/TSMC2023Q1/20230330 TSMC Citi.pdf'
loader = PyPDFLoader(pdf_path)

In [8]:
# Read the PDF; `pages` is indexed per page in the cells below.
pages = loader.load()
len(pages) # number of pages loaded from the PDF

15

In [9]:
print(pages[0].page_content) # text content of the first page (index 0)

See Appendix A-1 for Analyst Certification, Important Disclosures and Research Analyst Affiliations.
Citi Research is a division of Citigroup Global Markets Inc. (the "Firm"), which does and seeks to do business with companies covered in its research 
reports. As a result, investors should be aware that the Firm may have a conflict of interest that could affect the objectivity of this report. Investors should 
consider this report as only a single factor in making their investment decision. Certain products (not inconsistent with the author's published research) are 
available only on Citi's portals.30 Mar 2023 07:47:50 ET │ 15 pages    TSMC (2330.TW)Assessing the Potential Upside from AI CITI'S TAKE AI is witnessing a potential “iPhone moment” and should be a L-T catalyst in 
the semi industry, despite still small contribution at current early stage. nVidia 
dominates the data center GPU market for now, while we expect to see more 
AI chips in future designed by hyperscalers, includin

## Splitting pages into chunks

In [10]:
import nltk
# Fetch the 'punkt' tokenizer data.
# NOTE(review): presumably required by the NLTK-based text splitter used in the next cell — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/shemyu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from langchain.text_splitter import NLTKTextSplitter

# chunk_size=1000 with chunk_overlap=200, i.e. the overlap is 20% of the chunk size.
splitter = NLTKTextSplitter(chunk_size=1000, chunk_overlap=200)

# Only the first page (index 0) is split here.
first_page_text = pages[0].page_content
chunks = splitter.split_text(first_page_text)

In [12]:
len(chunks) # number of chunks produced from the first page (5 in the recorded run)

5

## Vector Database

In [13]:
# Check which chromadb version is installed (the run recorded in this notebook shows 0.4.7).
! pip list | grep chromadb

chromadb                      0.4.7


In [14]:
from langchain.vectorstores import Chroma # LangChain's Chroma integration
from langchain.embeddings.openai import OpenAIEmbeddings # Embedding model (encoder)

embedding = OpenAIEmbeddings()

persist_directory = './test_chroma'

# The store is persisted on disk, so it outlives the kernel. Stable, position-based
# ids are passed so that re-running this cell addresses the same records instead of
# adding the same chunks again under newly generated ids.
# NOTE(review): relies on Chroma treating a repeated id as the same record — confirm
# against the installed langchain/chromadb versions. Entries written by earlier runs
# without explicit ids stay in the store; delete ./test_chroma to start clean.
chunk_ids = [f"citi-20230330-p0-chunk-{i}" for i in range(len(chunks))]

vectordb = Chroma.from_texts(
    texts=chunks, # the chunks to index
    embedding=embedding, # embedding engine
    ids=chunk_ids, # deterministic ids, one per chunk
    persist_directory=persist_directory # where the store is kept on disk
)

## Retrieval by AI

In [15]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model for question answering, with temperature=0.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# QA chain whose retriever is backed by the Chroma store built above.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

# Ask the question; the chain is called with a dict holding it under "query".
query = '#zh-tw 請問citi 對台積電的看法? 請用正體中文'
result = qa_chain({"query": query})

In [16]:
result

{'query': '#zh-tw 請問citi 對台積電的看法? 請用正體中文',
 'result': 'Citi對台積電的看法是積極的。他們預期台積電在人工智慧和高性能運算方面的項目將帶來更好的2024年展望。他們重申買入評級，目標價維持在NT$620。他們認為台積電在人工智慧領域有著巨大的潛力，儘管目前貢獻仍然較小。'}

In [93]:
from langchain.prompts import PromptTemplate


# LLM: GPT-4 with temperature=0 for the advisory-style answer.
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

# Instruction template (Traditional Chinese): act as an investment adviser and answer
# in two parts — a short written recommendation (at most 40 characters) and a table of
# key financial indicators. The section count stated in the text matches the two
# numbered items that follow it. `{question}` is the only placeholder.
message_template = """你是一名專業投資理財顧問，
請依據財報內容給予專業的理財建議。
給出的財報內容分為兩大段落：
1. 你的文字建議，文字字數請控制在40字內
2. 第二部分重點財報指標，請整理成視覺化的表格

現在請用正體中文回應我的問題：{question}
"""

# PromptTemplate wrapper around the same text, with `question` as its input variable.
prompt = PromptTemplate(
    template=message_template,
    input_variables=["question"],
)

In [94]:
# Build a "stuff" RetrievalQA chain on top of the Chroma store.
# Despite its name, `retriever` holds the QA chain, not the vector-store retriever.
# No custom prompt is passed: the chain_type_kwargs wiring below is left disabled.
# chain_type_kwargs = {"prompt": prompt}
base_retriever = vectordb.as_retriever()
retriever = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=base_retriever,
    # chain_type_kwargs=chain_type_kwargs,
)

In [95]:
# The question is inserted into the instruction template, and the whole formatted
# text (instructions + question) is what gets passed to the chain as "query".
question = "請問citi 對台積電的看法? 請用正體中文"
query_str = message_template.format(question=question)
result = retriever({"query": query_str})

In [96]:
print(result["result"]) # print only the answer text so its line breaks and table render as written

1. 建議：Citi對台積電的看法樂觀，預期其2024年表現將因AI和HPC的穩定產品線而改善，並重申買入評級，目標價格為NT$620。

2. 財報指標視覺化表格：

| 年份 | 淨利潤 (NT$M) | 每股盈餘 (NT$) | EPS成長 (%) | P/E | P/B | ROE (%) | 殖利率 (%) |
|------|--------------|--------------|-------------|-----|-----|---------|------------|
| 2021A | 596,540 | 23.00 | 15.2 | 23.3 | 6.4 | 29.7 | 2.1 |
| 2022A | 1,016,530 | 39.20 | 70.4 | 13.6 | 4.7 | 39.6 | 2.1 |
| 2023E | 872,096 | 33.63 | -14.2 | 15.9 | 3.9 | 26.9 | 2.2 |
| 2024E | 1,019,383 | 39.31 | 16.9 | 13.6 | 3.3 | 26.3 | 2.3 |
| 2025E | 1,276,397 | 49.22 | 25.2 | 10.9 | 2.7 | 27.2 | 2.4 |


| 年份 | 淨利潤 (NT$M) | 每股盈餘 (NT$) | EPS成長 (%) | P/E | P/B | ROE (%) | 殖利率 (%) |
|------|--------------|--------------|-------------|-----|-----|---------|------------|
| 2021A | 596,540 | 23.00 | 15.2 | 23.3 | 6.4 | 29.7 | 2.1 |
| 2022A | 1,016,530 | 39.20 | 70.4 | 13.6 | 4.7 | 39.6 | 2.1 |
| 2023E | 872,096 | 33.63 | -14.2 | 15.9 | 3.9 | 26.9 | 2.2 |
| 2024E | 1,019,383 | 39.31 | 16.9 | 13.6 | 3.3 | 26.3 | 2.3 |
| 2025E | 1,276,397 | 49.22 | 25.2 | 10.9 | 2.7 | 27.2 | 2.4 |