# LangChain Workshop 2: Data Loaders and Output Parsers

In this notebook, we'll explore:
1. Loading data from interesting sources (YouTube, news, web pages)
2. Using output parsers to get structured responses
3. Building practical applications that combine both

In [None]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Pull settings from a local .env file into the environment before the model
# client is created (presumably this is where the OpenAI API key comes from -
# confirm for your setup).
load_dotenv()

# One chat model instance, reused by every example and exercise below.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.3,
)

## Part 1: Data Loaders - Getting Data from the Wild

In [None]:
# Web page loader: fetch a single page and expose its text via `page_content`
from langchain_community.document_loaders import WebBaseLoader

# Load a Wikipedia page about something interesting
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Rubber_duck_debugging")
docs = loader.load()

# Report how much came back, then preview the start of the first document.
# A single print with a newline separator emits the same four lines of output.
print(
    f"Loaded {len(docs)} documents",
    f"Content length: {len(docs[0].page_content)} characters",
    "\nFirst 300 characters:",
    docs[0].page_content[:300] + "...",
    sep="\n",
)

In [None]:
# YouTube transcript loader
# Requires `youtube-transcript-api`; `add_video_info=True` additionally needs `pytube`.
from langchain_community.document_loaders import YoutubeLoader

try:
    # The loader is built inside the try block so that a URL it cannot parse is
    # reported like any other loading failure instead of aborting the cell.
    # Load a programming tutorial or tech talk
    youtube_loader = YoutubeLoader.from_youtube_url(
        "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # Replace with actual tech video
        add_video_info=True
    )
    youtube_docs = youtube_loader.load()

    # An empty result is possible (for example when a video has no usable
    # transcript); report that clearly rather than as "list index out of range".
    if not youtube_docs:
        raise ValueError("no transcript documents were returned")

    print(f"YouTube transcript loaded: {len(youtube_docs[0].page_content)} characters")
    print(youtube_docs[0].metadata)
except Exception as e:
    # Best-effort demo: keep the notebook running, and reset the variable so a
    # re-run never leaves results from an earlier execution behind.
    youtube_docs = []
    print(f"YouTube loading failed (this is common): {e}")
    print("We'll use web content instead for the exercises")

In [None]:
# RSS/News loader
# Requires `feedparser` and `newspaper3k` in addition to langchain_community.
from langchain_community.document_loaders import RSSFeedLoader

# Load recent tech news from the official Hacker News front-page feed
rss_loader = RSSFeedLoader(
    urls=["https://news.ycombinator.com/rss"]
)

try:
    news_docs = rss_loader.load()

    # By default the loader skips feeds and articles it cannot fetch, so an
    # empty list is a real outcome; report it instead of a bare IndexError.
    if not news_docs:
        raise ValueError("the feed returned no readable articles")

    print(f"Loaded {len(news_docs)} news articles")
    print("\nFirst article title:", news_docs[0].metadata.get('title', 'No title'))
    print("Summary:", news_docs[0].page_content[:200] + "...")
except Exception as e:
    # Best-effort demo: keep the notebook running, and reset the variable so a
    # re-run never leaves results from an earlier execution behind.
    news_docs = []
    print(f"RSS loading failed: {e}")
    print("Using fallback content...")

## Part 2: Output Parsers - Getting Structured Responses

In [None]:
from langchain.output_parsers import PydanticOutputParser, CommaSeparatedListOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field
from typing import List

In [None]:
# Simple list parser
list_parser = CommaSeparatedListOutputParser()
list_format_instructions = list_parser.get_format_instructions()

# The prompt takes no runtime inputs: the parser's format instructions are
# bound up front as a partial variable.
prompt = PromptTemplate(
    template="List 5 ridiculous programming language names that don't exist yet.\n{format_instructions}",
    input_variables=[],
    partial_variables={"format_instructions": list_format_instructions},
)

# Render the prompt, query the model, then parse the raw reply text
list_prompt_text = prompt.format()
response = llm.invoke(list_prompt_text)
parsed_list = list_parser.parse(response.content)

# Show the unparsed reply next to the parsed value and its Python type
print("Raw response:", response.content)
print("\nParsed list:", parsed_list)
print("Type:", type(parsed_list))

In [None]:
# Structured data with Pydantic
# Schema for the review the model is asked to produce. The parser created from
# this class right after it supplies the prompt's format instructions and
# parses the model's reply into a MovieReview instance.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - a docstring may end up in the generated schema and alter the prompt.
class MovieReview(BaseModel):
    title: str = Field(description="Movie title")
    # The 1-10 range is stated only in the description; no bounds are enforced here.
    rating: int = Field(description="Rating from 1-10")
    pros: List[str] = Field(description="List of positive aspects")
    cons: List[str] = Field(description="List of negative aspects")
    recommended: bool = Field(description="Whether you'd recommend it")

# Parser bound to the MovieReview schema; it also provides the format instructions
parser = PydanticOutputParser(pydantic_object=MovieReview)
movie_format_instructions = parser.get_format_instructions()

# Only `movie` is filled in at call time; the format instructions are pre-bound
prompt = PromptTemplate(
    template="Write a review for the movie '{movie}'.\n{format_instructions}",
    input_variables=["movie"],
    partial_variables={"format_instructions": movie_format_instructions},
)

# Render the prompt, query the model, and parse the reply into a MovieReview
movie_prompt_text = prompt.format(movie="The Matrix but everyone is a rubber duck")
response = llm.invoke(movie_prompt_text)
parsed_review = parser.parse(response.content)

# One print with a newline separator emits the same six lines of output
print(
    "Structured review:",
    f"Title: {parsed_review.title}",
    f"Rating: {parsed_review.rating}/10",
    f"Pros: {parsed_review.pros}",
    f"Cons: {parsed_review.cons}",
    f"Recommended: {parsed_review.recommended}",
    sep="\n",
)

## Exercise 1: Bizarre News Summarizer

Create a system that loads Wikipedia pages on weird topics and generates a structured summary for each one: a factual recap plus a humorous, conspiracy-theory-style interpretation.

In [None]:
from pydantic import BaseModel, Field
from typing import List

# TODO: Define your conspiracy theory summary structure
# Exercise scaffold: declare each field as `name: type = Field(description=...)`,
# the same pattern the MovieReview model uses earlier in this notebook.
class ConspiracySummary(BaseModel):
    # YOUR CODE HERE: Add fields for:
    # - title: str
    # - real_summary: str (actual factual summary)
    # - conspiracy_theory: str (humorous conspiracy interpretation)
    # - evidence_points: List[str] ("evidence" for the conspiracy)
    # - danger_level: int (1-10 scale)
    # `pass` only keeps the empty class body valid; remove it once fields exist.
    pass

# TODO: Create parser and prompt template

def analyze_weird_topic(wikipedia_url: str):
    """Load a Wikipedia page and generate a conspiracy analysis.

    Exercise stub: until the steps below are implemented it does nothing and
    returns None.

    Args:
        wikipedia_url: Address of the page to load and analyze.

    Returns:
        Once implemented, the parsed structured result (step 3 below).
    """
    # YOUR CODE HERE:
    # 1. Load the webpage
    # 2. Use your structured parser to analyze it
    # 3. Return the parsed result
    pass

# Test with these weird Wikipedia topics:
# Each entry is a full page URL, the form analyze_weird_topic takes as its argument.
weird_topics = [
    "https://en.wikipedia.org/wiki/Cargo_cult",
    "https://en.wikipedia.org/wiki/Kentucky_meat_shower",
    "https://en.wikipedia.org/wiki/Dancing_plague_of_1518"
]

# Left commented out while analyze_weird_topic is still a stub; uncomment after implementing it.
# analyze_weird_topic(weird_topics[0])

### Solution:

In [None]:
# Solution schema for the structured conspiracy analysis. It redefines the
# placeholder ConspiracySummary from the exercise cell, and conspiracy_parser
# is built from it.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - a docstring may end up in the generated schema and alter the prompt.
class ConspiracySummary(BaseModel):
    title: str = Field(description="Topic title")
    real_summary: str = Field(description="Factual 2-sentence summary")
    conspiracy_theory: str = Field(description="Humorous conspiracy interpretation")
    evidence_points: List[str] = Field(description="List of 'evidence' supporting the conspiracy")
    # The 1-10 range is stated only in the description; no bounds are enforced here.
    danger_level: int = Field(description="Danger level from 1-10")

# Parser bound to the ConspiracySummary schema; also the source of the format instructions
conspiracy_parser = PydanticOutputParser(pydantic_object=ConspiracySummary)
conspiracy_format_instructions = conspiracy_parser.get_format_instructions()

# Prompt text kept in its own constant so the PromptTemplate call stays short.
# The indentation and blank lines inside the string are part of the prompt.
CONSPIRACY_TEMPLATE = """
    Analyze this content and create both a factual summary and a humorous conspiracy theory interpretation:
    
    {content}
    
    {format_instructions}
    
    Make the conspiracy theory silly but creative. Include fake "evidence" points.
    """

# Only `content` is supplied at call time; the format instructions are pre-bound
conspiracy_prompt = PromptTemplate(
    template=CONSPIRACY_TEMPLATE,
    input_variables=["content"],
    partial_variables={"format_instructions": conspiracy_format_instructions},
)

def analyze_weird_topic(wikipedia_url, max_chars=2000):
    """Load a web page and return a structured conspiracy-style analysis of it.

    Args:
        wikipedia_url: URL of the page to load with ``WebBaseLoader``.
        max_chars: Number of leading characters of the page text that are sent
            to the model. Defaults to 2000 to avoid token limits; pass ``None``
            to send the whole page.

    Returns:
        The result of ``conspiracy_parser.parse`` on the model's reply, i.e. a
        ``ConspiracySummary`` instance.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    # A negative bound would silently drop text from the end of the page
    # instead of limiting the length, so reject it up front.
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be None or a non-negative integer")

    docs = WebBaseLoader(wikipedia_url).load()

    # Slicing with None keeps the full text, so one expression covers both the
    # truncated and the untruncated case.
    content = docs[0].page_content[:max_chars]

    response = llm.invoke(conspiracy_prompt.format(content=content))
    return conspiracy_parser.parse(response.content)

# Test it
result = analyze_weird_topic("https://en.wikipedia.org/wiki/Rubber_duck_debugging")

# One print with a newline separator emits the same five lines of output
print(
    f"Title: {result.title}",
    f"Real Summary: {result.real_summary}",
    f"Conspiracy: {result.conspiracy_theory}",
    f"Evidence: {result.evidence_points}",
    f"Danger Level: {result.danger_level}/10",
    sep="\n",
)

## Exercise 2: Code Review Bot

Build a system that takes code snippets (the samples below, or code you load from GitHub) and produces structured reviews, including a humorous guess at the programmer's personality.

In [None]:
# TODO: Define code review structure
# Exercise scaffold: declare each field as `name: type = Field(description=...)`,
# the same pattern the MovieReview model uses earlier in this notebook.
class CodeReview(BaseModel):
    # YOUR CODE HERE: Add fields for:
    # - overall_rating: int (1-10)
    # - readability: int (1-10)
    # - bugs_found: List[str]
    # - improvements: List[str]
    # - programmer_personality: str (guess the programmer's personality)
    # - coffee_consumption_estimate: str (how much coffee they drink)
    # `pass` only keeps the empty class body valid; remove it once fields exist.
    pass

# Sample code snippets to analyze (you can use these or find GitHub links)
# Maps a short label to the source text under review. The snippets are plain
# strings handed to the model; nothing in this cell executes them.
code_samples = {
    # Terse calculator that signals errors by returning strings
    "messy_python": """
def calc(x,y,op):
    if op=='+':
        return x+y
    elif op=='-':
        return x-y
    elif op=='*':
        return x*y
    elif op=='/':
        if y!=0:
            return x/y
        else:
            return 'error'
    else:
        return 'invalid'
""",
    # Several layers of classes wrapped around a single addition
    "over_engineered": """
class AbstractCalculatorFactoryInterface:
    def create_calculator(self):
        raise NotImplementedError

class CalculatorImplementation:
    def __init__(self, operation_strategy):
        self.strategy = operation_strategy
    
    def execute(self, x, y):
        return self.strategy.perform_operation(x, y)

class AdditionStrategy:
    def perform_operation(self, x, y):
        return x + y
"""
}

def review_code(code_snippet: str, code_name: str):
    """Analyze code and return structured review.

    Exercise stub: until the steps below are implemented it does nothing and
    returns None.

    Args:
        code_snippet: Source text of the code to review.
        code_name: Label identifying the snippet.

    Returns:
        Once implemented, the structured review (step 3 below).
    """
    # YOUR CODE HERE:
    # 1. Create parser and prompt
    # 2. Analyze the code
    # 3. Return structured review with personality insights
    pass

# Test both code samples
# for name, code in code_samples.items():
#     print(f"\n=== Reviewing {name} ===")
#     review = review_code(code, name)
#     print(review)

### Solution:

In [None]:
# Solution schema for the structured code review. It redefines the placeholder
# CodeReview from the exercise cell, and code_parser is built from it.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - a docstring may end up in the generated schema and alter the prompt.
class CodeReview(BaseModel):
    # Both 1-10 ranges are stated only in the descriptions; no bounds are enforced here.
    overall_rating: int = Field(description="Overall code quality rating 1-10")
    readability: int = Field(description="Code readability rating 1-10")
    bugs_found: List[str] = Field(description="List of potential bugs or issues")
    improvements: List[str] = Field(description="Suggested improvements")
    # Free-text fields: the humorous part of the review
    programmer_personality: str = Field(description="Guess about programmer's personality")
    coffee_consumption_estimate: str = Field(description="Estimated daily coffee consumption")

# Parser bound to the CodeReview schema; also the source of the format instructions
code_parser = PydanticOutputParser(pydantic_object=CodeReview)
code_format_instructions = code_parser.get_format_instructions()

# Prompt text kept in its own constant so the PromptTemplate call stays short.
# The indentation and blank lines inside the string are part of the prompt.
CODE_REVIEW_TEMPLATE = """
    Review this code and provide analysis. Be thorough but humorous with personality assessment:
    
    Code name: {code_name}
    
    ```python
    {code}
    ```
    
    {format_instructions}
    
    Be creative with the personality and coffee estimates based on coding style.
    """

# `code` and `code_name` are supplied at call time; the format instructions are pre-bound
code_prompt = PromptTemplate(
    template=CODE_REVIEW_TEMPLATE,
    input_variables=["code", "code_name"],
    partial_variables={"format_instructions": code_format_instructions},
)

def review_code(code_snippet, code_name):
    """Ask the model to review one snippet and parse its reply.

    Args:
        code_snippet: Source text inserted into the prompt as ``code``.
        code_name: Label for the snippet, inserted into the prompt as ``code_name``.

    Returns:
        The result of ``code_parser.parse`` on the model's reply text.
    """
    filled_prompt = code_prompt.format(code=code_snippet, code_name=code_name)
    model_reply = llm.invoke(filled_prompt)
    reply_text = model_reply.content
    return code_parser.parse(reply_text)

# Test both samples
for name, code in code_samples.items():
    # The header is printed before the model call so progress stays visible
    print(f"\n=== Reviewing {name} ===")
    review = review_code(code, name)

    # One print with a newline separator emits the same six lines of output
    print(
        f"Rating: {review.overall_rating}/10",
        f"Readability: {review.readability}/10",
        f"Bugs: {review.bugs_found}",
        f"Improvements: {review.improvements}",
        f"Personality: {review.programmer_personality}",
        f"Coffee: {review.coffee_consumption_estimate}",
        sep="\n",
    )

## Summary

**Data Loaders**: Get content from web pages, YouTube, RSS feeds, and more.

**Output Parsers**: Transform unstructured AI responses into structured data you can actually use in applications.

**Key Patterns**:
- Use Pydantic models to define your desired output structure
- Include format instructions in your prompts
- Handle loading errors gracefully
- Truncate content to avoid token limits

Next up: Chat models, agents, and vectorstores!