In [4]:
from dotenv import load_dotenv
load_dotenv()

import os
import warnings

# load_dotenv() has already copied the .env entries into os.environ, so the values
# only need to be checked here. Re-assigning os.environ[name] = os.getenv(name)
# changes nothing when the variable is set and raises TypeError when it is not,
# because os.environ only accepts strings and os.getenv returns None.
EXPECTED_ENV_VARS = (
    "http_proxy",
    "https_proxy",
    "LANGCHAIN_TRACING_V2",
    "LANGCHAIN_API_KEY",
    "DASHSCOPE_API_KEY",
)


def missing_env_vars(names, environ=None):
    """Return the entries of ``names`` that are unset or empty in ``environ``.

    Args:
        names: Iterable of environment variable names to check.
        environ: Mapping to look the names up in; defaults to ``os.environ``.

    Returns:
        A list with the missing names, in the order they were given.
    """
    env = os.environ if environ is None else environ
    return [name for name in names if not env.get(name)]


_missing = missing_env_vars(EXPECTED_ENV_VARS)
if _missing:
    # Warn instead of failing so the notebook still loads when optional settings
    # (for example the proxies) are intentionally left out of the .env file.
    warnings.warn(
        "Environment variables not set (check your .env file): " + ", ".join(_missing)
    )

# 1. StructuredOutputParser

## 1.1 案例一

In [5]:
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate

# Declare the fields the model has to return, together with the type of each field.
response_schemas = [
    ResponseSchema(
        name="bad_string",
        type="string",
        description="This a poorly formatted user input string",
    ),
    ResponseSchema(
        name="good_string",
        type="string",
        description="This is your response, a reformatted response",
    ),
]

# Build the parser from the schemas, then ask it for the formatting instructions
# that will be injected into the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
print(format_instructions)
print("=" * 200)

# Prompt text: {format_instructions} is bound once below, {user_input} on every call.
template = """
You will be given a poorly formatted string from a user.
Reformat it and make sure all the words are spelled correctly

{format_instructions}

% USER INPUT:
{user_input}

YOUR RESPONSE:
"""

# Embed the format description in the prompt so the model knows which output
# format is expected.
prompt = PromptTemplate(
    template=template,
    input_variables=["user_input"],
    partial_variables={"format_instructions": format_instructions},
)

# Render the prompt for a sample input to see the text that would be sent.
promptValue = prompt.format(user_input="welcom to califonya!")
print(promptValue)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"bad_string": string  // This a poorly formatted user input string
	"good_string": string  // This is your response, a reformatted response
}
```

You will be given a poorly formatted string from a user.
Reformat it and make sure all the words are spelled correctly

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"bad_string": string  // This a poorly formatted user input string
	"good_string": string  // This is your response, a reformatted response
}
```

% USER INPUT:
welcom to califonya!

YOUR RESPONSE:



In [15]:
from langchain_community.llms import Tongyi

# Run the prompt through the model. No output parser is attached to this chain,
# so the value displayed below is the model's raw text.
llm = Tongyi(model="qwen-turbo")
chain = prompt | llm

# Pass the input as a mapping keyed by the template variable, as the other cells do.
# The bare-string shortcut only works while the prompt has exactly one input variable.
chain.invoke({"user_input": "well come to the califonya"})

'```json\n{\n\t"bad_string": "well come to the califonya",\n\t"good_string": "Welcome to California"\n}\n```'

## 1.2 案例二

In [16]:
# -*- coding: utf-8 -*-
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
# from langchain.llms import OpenAI
from langchain_community.llms import Tongyi


llm = Tongyi(model="qwen-turbo")

# Declare the fields to extract from the text, together with the type of each field.
response_schemas = [
    ResponseSchema(type="number", name="number", description="文本中的数字"),
    ResponseSchema(type="string", name="people", description="文本中的人物"),
    ResponseSchema(type="string", name="place", description="文本中的地点"),
]

# Build the parser from the schemas.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Formatting instructions generated by the parser; they are injected into the prompt.
format_instructions = output_parser.get_format_instructions()
print(format_instructions)
print("=" * 200)

template = """
给定下面的文本，找出特定的结构化信息。

{format_instructions}

% USER INPUT:
{user_input}

YOUR RESPONSE:
"""

# {format_instructions} is bound here; {user_input} is supplied on every call.
prompt = PromptTemplate(
    input_variables=["user_input"],
    partial_variables={"format_instructions": format_instructions},
    template=template,
)

question = "张晓明今天在香港坐了2趟地铁。"
# Render the prompt for the question. Reusing `question` keeps the preview and the
# later chain calls on the same input instead of a second copy of the literal.
promptValue = prompt.format(user_input=question)
print(promptValue)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"number": number  // 文本中的数字
	"people": string  // 文本中的人物
	"place": string  // 文本中的地点
}
```

给定下面的文本，找出特定的结构化信息。

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"number": number  // 文本中的数字
	"people": string  // 文本中的人物
	"place": string  // 文本中的地点
}
```

% USER INPUT:
张晓明今天在香港坐了2趟地铁。

YOUR RESPONSE:



### 1.2.1 第一种链条输出方式

In [17]:
chain = prompt | llm

# Call the model once and reuse the completion. Invoking the chain a second time
# just for parsing costs another request, and the parsed value could then come
# from a different completion than the raw text printed above it.
raw_output = chain.invoke({"user_input": question})
print(raw_output)
print("=" * 200)
# Parse that same completion with the output parser.
print(output_parser.parse(raw_output))

```json
{
	"number": 2,
	"people": "张晓明",
	"place": "香港"
}
```
{'number': 2, 'people': '张晓明', 'place': '香港'}


### 1.2.2 第二种链条输出方式

In [18]:
# Appending the parser as the last step makes the chain return the parsed result.
chain2 = prompt | llm | output_parser

parsed_answer = chain2.invoke({"user_input": question})
print(parsed_answer)

{'number': 2, 'people': '张晓明', 'place': '香港'}


## 1.3 案例三

In [19]:
# -*- coding: utf-8 -*-
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from langchain_community.llms import Tongyi

llm = Tongyi(model="qwen-turbo")

# Declare the fields to extract; every field is requested as an array.
response_schemas = [
    ResponseSchema(name="time", type="array", description="文本中的日期时间列表"),
    ResponseSchema(name="people", type="array", description="文本中的人物列表"),
    ResponseSchema(name="place", type="array", description="文本中的地点列表"),
    ResponseSchema(name="org", type="array", description="文本中的组织机构列表"),
]

# Build the parser and fetch the formatting instructions it generates.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
print(format_instructions)
print("=" * 200)

template = """
给定下面的文本，找出特定的实体信息，并以结构化数据格式返回。

{format_instructions}

% USER INPUT:
{user_input}

YOUR RESPONSE:
"""

question = "6月26日，广汽集团在科技日上首次公开展示飞行汽车项目，飞行汽车GOVE完成全球首飞。广汽研究院院长吴坚表示，GOVE可以垂直起降，并搭载双备份多旋翼飞行系统，保障飞行安全。"

# {format_instructions} is bound here; {user_input} is supplied on every call.
prompt = PromptTemplate(
    template=template,
    input_variables=["user_input"],
    partial_variables={"format_instructions": format_instructions},
)

# Show the fully rendered prompt for the question.
promptValue = prompt.format(user_input=question)
print(promptValue)
print("prompt:", "=" * 200)

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"time": array  // 文本中的日期时间列表
	"people": array  // 文本中的人物列表
	"place": array  // 文本中的地点列表
	"org": array  // 文本中的组织机构列表
}
```

给定下面的文本，找出特定的实体信息，并以结构化数据格式返回。

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"time": array  // 文本中的日期时间列表
	"people": array  // 文本中的人物列表
	"place": array  // 文本中的地点列表
	"org": array  // 文本中的组织机构列表
}
```

% USER INPUT:
6月26日，广汽集团在科技日上首次公开展示飞行汽车项目，飞行汽车GOVE完成全球首飞。广汽研究院院长吴坚表示，GOVE可以垂直起降，并搭载双备份多旋翼飞行系统，保障飞行安全。

YOUR RESPONSE:



In [20]:
# Prompt -> model -> parser: the parser is the last step of the chain, so the
# value printed here is already parsed.
chain = prompt | llm | output_parser

entities = chain.invoke({"user_input": question})
print(entities)

{'time': ['6月26日'], 'people': ['吴坚'], 'place': [], 'org': ['广汽集团', '广汽研究院']}


## 1.4 案例四

In [10]:
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_community.chat_models import ChatTongyi

# Expected output schema: two fields, "recipe" and "ingredients".
response_schemas = [
    ResponseSchema(
        name="recipe",
        description="the recipe for the dish requested by the user",
    ),
    ResponseSchema(
        name="ingredients",
        description="list of ingredients required for the recipe, should be a detailed list.",
    ),
]

# Parser built from the schemas, plus the format instructions it generates for the prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

# Prompt asking for the recipe; {format_instructions} is bound here, {dish} per call.
prompt = PromptTemplate(
    input_variables=["dish"],
    partial_variables={"format_instructions": format_instructions},
    template="Provide the recipe for the dish requested.\n{format_instructions}\n{dish}",
)

# model = ChatOpenAI(model="gpt-4o-mini", temperature=0)
model = ChatTongyi(temperature=0)

# Prompt -> chat model -> parser.
chain = prompt | model | output_parser

# The last expression is displayed as the cell output: the parsed "recipe" and
# "ingredients" fields.
chain.invoke({"dish": "Spaghetti Bolognese"})

{'recipe': '1. Heat olive oil in a large pot over medium heat. Add chopped onions, carrots, and celery. Cook until softened.\n2. Add ground beef and cook until browned, stirring to break up the meat.\n3. Stir in garlic, tomato paste, canned tomatoes, red wine (optional), and herbs. Simmer for at least 1 hour, stirring occasionally.\n4. Cook spaghetti according to package instructions until al dente. Drain.\n5. Serve the sauce over the spaghetti and top with grated Parmesan cheese if desired.',
 'ingredients': '- 1 pound ground beef\n- 1 large onion, finely chopped\n- 2 carrots, finely chopped\n- 2 stalks celery, finely chopped\n- 3 cloves garlic, minced\n- 2 tablespoons olive oil\n- 2 tablespoons tomato paste\n- 1 can (28 ounces) crushed tomatoes\n- 1 cup red wine (optional)\n- 1 teaspoon dried oregano\n- 1 teaspoon dried basil\n- Salt and pepper to taste\n- 1 pound spaghetti\n- Grated Parmesan cheese (for serving)'}

# 2. With_structured_output

This class method takes an input schema to guide the LLM to generate specific responses.

You can only use this with LLMs that provide APIs for structuring outputs, such as tool calling or JSON mode (this means it only works for providers like OpenAI, Anthropic, Cohere, etc.).

If the model doesn’t natively support such features, you’ll need to use an output parser to extract the structured response from the model output.

You typically use with_structured_output to specify a particular format you want the LLM to use in its response, passing this format as a schema.

**Benefits of with_structured_output**
- Type safety: The Pydantic model ensures that the output matches the expected types.
- Integration with LangChain’s runnables: This method wraps the LLM call in a runnable, allowing for easy chaining of operations.
- Flexibility: You can use various schema types, including TypedDict, JSON schema, or Pydantic classes.

**Limitations and Considerations**

While powerful, with_structured_output has some limitations to keep in mind:

- LLM compatibility: It only works with LLMs that support structured output APIs.
- Complexity in advanced scenarios: When dealing with more complex workflows, the use of runnables can introduce additional complexity.
- Learning curve: Understanding LangChain’s expression language (LCEL) and runnables requires some additional learning.

In [22]:
# from langchain_core.pydantic_v1 import BaseModel, Field
from pydantic import BaseModel, Field
from langchain_community.chat_models import ChatTongyi
from langchain_core.prompts import ChatPromptTemplate


model = ChatTongyi(temperature=0)


# Schema for the structured answer: one trivia question and its answer.
# Documented with '#' comments rather than a docstring so the schema generated
# from the model stays exactly as it was.
class Trivia(BaseModel):
    question: str = Field(description="The trivia question")
    answer: str = Field(description="The correct answer to the trivia question")


# Prompt with a single {topic} placeholder.
prompt = ChatPromptTemplate.from_template(
    "Give me a trivia question about {topic}, respond in JSON with `question` and `answer` keys"
)
print("Prompt:", prompt)
print("=" * 200)

# Bind the Trivia schema to the model, then chain the prompt into it.
structured_llm = model.with_structured_output(Trivia)
trivia_chain = prompt | structured_llm

# Run the chain for one topic and show what came back.
result = trivia_chain.invoke({"topic": "music"})
print("Result:", result)
print("=" * 200)

Prompt: input_variables=['topic'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['topic'], input_types={}, partial_variables={}, template='Give me a trivia question about {topic}, respond in JSON with `question` and `answer` keys'), additional_kwargs={})]
Result: question='Who is the lead singer of the band Coldplay?' answer='Chris Martin'


In [23]:
# from langchain_core.pydantic_v1 import BaseModel, Field 
from pydantic import BaseModel, Field
from langchain_community.chat_models import ChatTongyi
from langchain_core. prompts import ChatPromptTemplate
# Schema for the structured forecast. Documented with '#' comments rather than a
# docstring so the schema generated from the model is not altered.
class WeatherForecast(BaseModel):
    temperature: float = Field(description="The temperature in Celsius")
    condition: str = Field(description="The weather condition (e.g., sunny, rainy, cloudy)")
    humidity: int = Field(description="The humidity percentage")
    wind_speed: float = Field(description="The wind speed in km/h")

# Set up the LLM
model = ChatTongyi(temperature=0)

# Create the prompt template. Every key name is wrapped in backticks; the original
# text was missing the opening backtick before `temperature`.
prompt = ChatPromptTemplate.from_template(
    "Given the context below, provide a weather forecast for {city} tomorrow, respond in JSON with `temperature`, `condition`, `humidity`, and `wind_speed` keys\n\n{context}"
)

# Apply structured output to the LLM
structured_llm = model.with_structured_output(WeatherForecast)

# Chain the prompt and structured LLM
weather_chain = prompt | structured_llm

# Generate a weather forecast
result = weather_chain.invoke(
    {
        "city": "New York",
        "context": "The weather in New York will be sunny with a chance of rain.",
    }
)

# The printed field values are produced by the model, so they can differ between runs.
print(result)

temperature=23.5 condition='sunny with a chance of rain' humidity=45 wind_speed=15.0


# 3. PydanticOutputParser

Not all models support tool calling and JSON mode, so another approach for getting structured outputs is to use an output parser like PydanticOutputParser to extract the needed information.

This parser works best in situations where type safety is an obvious concern, as it ensures that LLM responses adhere strictly to a Pydantic model schema.

This parser also implements the LangChain runnable interface.

Below, we use PydanticOutputParser to specify a Book schema to ensure the title in the LLM’s output is a string and the number of pages is an integer:

The PydanticOutputParser class is another tool in LangChain's arsenal for extracting structured information from LLM outputs.

It’s particularly useful when working with LLMs that don’t support native structured output features.

It allows you to enforce type safety by ensuring that the output conforms to a predefined Pydantic schema.

This parser works best when you need reliable type validation or when your LLM lacks structured output features.

**Benefits of PydanticOutputParser**
- Strict type enforcement: Ensures that the LLM output matches the expected data types.
- Flexible schema definition: Allows for complex nested structures and custom validation logic.
- Integration with LangChain’s runnable interface: Enables easy incorporation into LangChain workflows.

**Limitations and Considerations**

While powerful, PydanticOutputParser has some limitations to keep in mind:
- Overhead: Requires additional processing to parse the LLM output.
- Prompt engineering: Needs careful prompt design to guide the LLM in producing the correct output format.
- Learning curve: Requires familiarity with Pydantic for defining schemas.

**The StructuredOutputParser Class**

The StructuredOutputParser class is a versatile tool in LangChain that allows you to extract structured information from LLM outputs using custom-defined schemas.

It works with diverse models and provides flexibility to define custom output structures through ResponseSchema objects.

This parser is particularly useful when you need to extract information that doesn’t fit neatly into pre-built schemas.

In [24]:
from typing import List 
from langchain_core.output_parsers import PydanticOutputParser 
from langchain_core.prompts import ChatPromptTemplate 
# from langchain_core.pydantic_v1 import BaseModel, Field  # the official LangChain example imports from here
from pydantic import BaseModel, Field
from langchain_community.chat_models import ChatTongyi
# Define our Movie and FilmFestival models. They are documented with '#' comments
# rather than docstrings so the JSON schema the parser puts into the prompt is
# not altered.
class Movie(BaseModel):
    title: str = Field(..., description="The title of the movie")
    director: str = Field(..., description="The director of the movie")
    runtime: int = Field(..., description="The runtime of the movie in minutes")


class FilmFestival(BaseModel):
    movies: List[Movie]


# Set up the parser
parser = PydanticOutputParser(pydantic_object=FilmFestival)

# Create the prompt template; the parser's format instructions are bound once here.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer the user query about movies in the film festival. Wrap the output in `json` format following the schema below\n {format_instructions}"),
    ("human", "{query}"),
]).partial(format_instructions=parser.get_format_instructions())

print(prompt)
print("=" * 200)

# Set up the LLM and chain
llm = ChatTongyi(temperature=0)
chain = prompt | llm | parser

# Generate movie information. The query previously contained the garbled text
# "Nolanlwith"; it now reads "Nolan with" so the director's name is passed on intact.
query = "Please provide details about the movies `Inception` directed by Christopher Nolan with a runtime of 148 minutes and `Parasite` directed by Bong Joon-ho with a runtime of 132 minutes."
result = chain.invoke({"query": query})
print(result)
# Expected output: FilmFestival(movies=[Movie(title='Inception', director='Christopher Nolan', runtime=148), Movie(title='Parasite', director='Bong Joon-ho', runtime=132)])

input_variables=['query'] input_types={} partial_variables={'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"$defs": {"Movie": {"properties": {"title": {"description": "The title of the movie", "title": "Title", "type": "string"}, "director": {"description": "The director of the movie", "title": "Director", "type": "string"}, "runtime": {"description": "The runtime of the movie in minutes", "title": "Runtime", "type": "integer"}}, "required": ["title", "director", "runtime"], "title": "Movie", "type": "object"}}, "properties": {"movies": {"items": {"$ref": "#

In [25]:
from typing import List, Any
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.pydantic_v1 import BaseModel, Field
# from langchain_community.output_parsers.pydantic import PydanticOutputParser
from langchain_core.output_parsers import BaseOutputParser
from pydantic import BaseModel, Field


class Book(BaseModel):
    """Information about a book."""

    title: str = Field(..., description="The title of the book")
    pages: int = Field(..., description="The number of pages in the book.")


class Library(BaseModel):
    """Details about all books in a collection."""

    books: List[Book]


# The class docstrings above appear as "description" entries in the schema printed
# below, so they are part of the prompt text and must stay as they are.
parser = PydanticOutputParser(pydantic_object=Library)

# System message carrying the format instructions, followed by the user's query.
messages = [
    ("system", "Answer the user query. Wrap the output in `json` tags\n{format_instructions}"),
    ("human", "{query}"),
]
prompt = ChatPromptTemplate.from_messages(messages).partial(
    format_instructions=parser.get_format_instructions()
)

# Query
query = "Please provide details about the books 'The Great Gatsby' with 208 pages and 'To Kill a Mockingbird' with 384 pages."

# Render the prompt, including the output schema, as plain text.
print(prompt.invoke(query).to_string())

System: Answer the user query. Wrap the output in `json` tags
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"$defs": {"Book": {"description": "Information about a book.", "properties": {"title": {"description": "The title of the book", "title": "Title", "type": "string"}, "pages": {"description": "The number of pages in the book.", "title": "Pages", "type": "integer"}}, "required": ["title", "pages"], "title": "Book", "type": "object"}}, "description": "Details about all books in a collection.", "properties": {"books": {"items": {"$ref": "#/$defs/Book"}, "title": "Books", "type": "array"

In [27]:
# Chain for the Library example: prompt -> chat model -> Pydantic parser.
llm = ChatTongyi(temperature=0)
chain = prompt | llm | parser

# Ask for the book details defined in `query` and print the parsed result.
result = chain.invoke({"query": query})
print(result)

books=[Book(title='The Great Gatsby', pages=208), Book(title='To Kill a Mockingbird', pages=384)]


In [28]:
from typing import List
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field  # 确保是从 pydantic 导入而不是 langchain_core.pydantic_v1
from langchain_community.chat_models import ChatTongyi

# Movie and FilmFestival models. The field descriptions stay in Chinese because
# they are part of the schema text sent to the model.
class Movie(BaseModel):
    title: str = Field(..., description="电影的标题")
    director: str = Field(..., description="电影的导演")
    runtime: int = Field(..., description="电影的时长（分钟）")


class FilmFestival(BaseModel):
    movies: List[Movie]


# Parser that validates the model output against FilmFestival.
parser = PydanticOutputParser(pydantic_object=FilmFestival)

# Prompt: a system message carrying the format instructions, then the user's query.
messages = [
    ("system", "回答用户关于电影节中电影的问题。按照下面的模式将输出包裹在 `json` 格式中\n {format_instructions}"),
    ("human", "{query}"),
]
prompt = ChatPromptTemplate.from_messages(messages).partial(
    format_instructions=parser.get_format_instructions()
)

# Chat model and the full chain.
llm = ChatTongyi(temperature=0)
chain = prompt | llm | parser

# Ask for the movie details and print the parsed result.
query = "请提供由克里斯托弗·诺兰执导、片长148分钟的《盗梦空间》和由奉俊昊执导、片长132分钟的《寄生虫》的详细信息。"
result = chain.invoke({"query": query})
print(result)

movies=[Movie(title='盗梦空间', director='克里斯托弗·诺兰', runtime=148), Movie(title='寄生虫', director='奉俊昊', runtime=132)]
