# LangChain Workshop 2: Data Loaders and Output Parsers

In this notebook, we'll explore:
1. Loading data from interesting sources (YouTube, news, web pages)
2. Using output parsers to get structured responses
3. Building practical applications that combine both

## Setup: Loading Our API Keys Securely 🔐

First, let's load our environment variables. 

In real life, **NEVER** put API keys directly in your code! For this Kaggle workshop only, you can set your API key in the `KAGGLE_BACKUP` variable below. When running locally, use a `.env` file with the following content instead:
```.env
OPENAI_API_KEY="your-key"
```

In [None]:
# Only if running this on Kaggle
# Used as the fallback when the OPENAI_API_KEY environment variable is not set.
# WARNING: once filled in, this is a real secret - clear it again before you
# share, publish or commit the notebook.
KAGGLE_BACKUP = "sk-..."  # Replace with your OpenAI key for Kaggle only

In [None]:
try:
    import requests
    import langchain
    import langchain_openai
    import langchain_community
    from dotenv import load_dotenv
    import bs4
    import langchain_yt_dlp
    import feedparser
    import tqdm
    from newspaper import Article
except ImportError as e:
    !pip install requests python-dotenv langchain-openai langchain langchain-community beautifulsoup4 feedparser langchain-yt-dlp newspaper3k listparser lxml-html-clean tqdm
    import requests
    import langchain
    import langchain_openai
    import langchain_community
    from dotenv import load_dotenv
    import bs4
    import langchain_yt_dlp
    import feedparser
    from newspaper import Article
    import tqdm

In [None]:
import os
from langchain_openai import ChatOpenAI

# Load variables from a .env file into the process environment.
load_dotenv()

# The unedited placeholder from the Kaggle cell is not a usable key.
PLACEHOLDER_KEY = "sk-..."

# Load API key with Kaggle backup.
# globals().get(...) keeps this cell working when the Kaggle-only cell was skipped,
# and the untouched placeholder counts as "no backup key".
kaggle_key = globals().get("KAGGLE_BACKUP") or ""
if kaggle_key == PLACEHOLDER_KEY:
    kaggle_key = ""

# `or` (instead of a getenv default) also falls back to the Kaggle key when
# OPENAI_API_KEY is set but empty.
api_key = os.getenv("OPENAI_API_KEY") or kaggle_key
if api_key:
    # Show only a short prefix and the length: cell output is saved with the
    # notebook, so it should not contain a recognisable part of the secret.
    print(f"✅ API key loaded successfully: {api_key[:3]}... ({len(api_key)} characters)")
else:
    print("❌ No API key found. Make sure you have a .env file with OPENAI_API_KEY=")

# Shared chat model used by every cell below.
llm = ChatOpenAI(model="gpt-5-nano", temperature=0.3, api_key=api_key)

## Part 1: Data Loaders - Getting Data from the Wild

### Loading data from websites

In [None]:
from langchain_community.document_loaders import WebBaseLoader

# Point the loader at Wikipedia's Special:Random URL to get a page about something random.
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Special:Random")
docs = loader.load()

# Normalise each page: split on any whitespace, then re-join with single spaces.
for doc in docs:
    words = doc.page_content.split()
    doc.page_content = " ".join(words)

first_page_text = docs[0].page_content

print(f"Loaded {len(docs)} documents")
print(f"Content length: {len(first_page_text)} characters")
print("\nStart of article:")
# Preview characters 500-3000 of the cleaned text
# (presumably skipping the page chrome at the very top - TODO confirm).
print(first_page_text[500:3000] + "...")

In [None]:
# Ask a question about the loaded content
# Import from langchain_core: the `langchain.prompts` path is not available in
# every langchain release.
from langchain_core.prompts import ChatPromptTemplate

# Prompt reused by the Q&A cells: a question plus the content to answer it from.
summary_prompt = ChatPromptTemplate.from_template(
    "Based on this content, answer in 1-4 sentences: {question}\n\nContent: {content}"
)

question = "What is this article about?"
# Use only start of article to keep token limits low.
# format_messages() hands the model a proper chat message; format() would flatten
# the prompt into a single string prefixed with "Human: ".
response = llm.invoke(summary_prompt.format_messages(
    question=question,
    content=docs[0].page_content[500:3000]
))

print(f"Q: {question}")
print(f"A: {response.content}")

### Loading data from YouTube videos

In [None]:
# YouTube loader from langchain-yt-dlp. This cell only uses the video's metadata
# (its description), not a transcript.
from langchain_yt_dlp.youtube_loader import YoutubeLoaderDL

# Load any YouTube video
youtube_loader = YoutubeLoaderDL.from_youtube_url(
    "https://www.youtube.com/watch?v=dQw4w9WgXcQ",  # Replace with actual video
    add_video_info=True
)

try:
    youtube_docs = youtube_loader.load()

    # Collapse runs of whitespace in each description into single spaces.
    for doc in youtube_docs:
        doc.metadata['description'] = " ".join(doc.metadata['description'].split())

    print(f"YouTube description loaded: {len(youtube_docs[0].metadata['description'])} characters")
    print("\nFirst 300 characters:")
    print(youtube_docs[0].metadata['description'][:300] + "...")
    print(youtube_docs[0].metadata)
except Exception as e:
    # Reset the result so the Q&A cell skips cleanly instead of running on
    # documents left over from an earlier run or on a half-processed load.
    youtube_docs = []
    print(f"YouTube loading failed (this is common): {e}")
    print("We'll use web content instead for the exercises")

In [None]:
# Ask a question if YouTube loading succeeded
# (the name check also covers the case where the loading cell was never run).
if 'youtube_docs' in locals() and youtube_docs:
    question = "Summarize the main topic of this video in one sentence."
    # format_messages() sends a proper chat message; format() would flatten the
    # prompt into a single string prefixed with "Human: ".
    response = llm.invoke(summary_prompt.format_messages(
        question=question,
        content=youtube_docs[0].metadata['description']
    ))
    print(f"Q: {question}")
    print(f"A: {response.content}")
else:
    print("Skipping YouTube Q&A since loading failed")

### Loading data from news articles

In [None]:
# RSS/News loader
from langchain_community.document_loaders import RSSFeedLoader

# Load recent news
rss_loader = RSSFeedLoader(
    urls=["https://feeds.bbci.co.uk/news/business/rss.xml?edition=int"],
    show_progress_bar=True
)

try:
    news_docs = rss_loader.load()
    # Collapse runs of whitespace in each article into single spaces.
    for doc in news_docs:
        doc.page_content = " ".join(doc.page_content.split())

    print(f"Loaded {len(news_docs)} news articles")
    # An empty feed is not a loading error: only preview an article if there is one.
    if news_docs:
        print("\nFirst article title:", news_docs[0].metadata.get('title', 'No title'))
        print("Summary:", news_docs[0].page_content[:200] + "...")
except Exception as e:
    # Reset the result so the Q&A cell skips cleanly instead of running on
    # articles left over from an earlier run.
    news_docs = []
    print(f"RSS loading failed: {e}")

In [None]:
# Ask about the news articles if loading succeeded
# (the name check also covers the case where the loading cell was never run).
if 'news_docs' in locals() and news_docs:
    # Combine first 3 article titles and summaries, truncated
    news_sample = "\n".join([
        f"{doc.metadata.get('title', 'No title')}: {doc.page_content[:500]}"
        for doc in news_docs[:3]
    ])

    question = "What are the main themes in these articles? Use examples from the articles to justify your response."
    # format_messages() sends a proper chat message; format() would flatten the
    # prompt into a single string prefixed with "Human: ".
    response = llm.invoke(summary_prompt.format_messages(
        question=question,
        content=news_sample
    ))
    print(f"Q: {question}")
    print(f"A: {response.content}")
else:
    print("Skipping RSS Q&A since loading failed")

## Part 2: Output Parsers - Getting Structured Responses

In [None]:
# Import the parser and prompt classes from langchain_core: the
# `langchain.output_parsers` / `langchain.prompts` paths are not available in
# every langchain release.
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field
from typing import List

In [None]:
# Structured data with Pydantic
# Schema for the structured movie review that the parser validates the model's
# reply against.
class MovieReview(BaseModel):
    title: str = Field(description="Movie title")
    # ge/le enforce the 1-10 range the description promises; an out-of-range
    # rating now fails validation instead of being accepted silently.
    rating: int = Field(description="Rating from 1-10", ge=1, le=10)
    pros: List[str] = Field(description="List of positive aspects")
    cons: List[str] = Field(description="List of negative aspects")
    recommended: bool = Field(description="Whether you'd recommend it")

# Parser that turns the model's text reply into a MovieReview instance.
parser = PydanticOutputParser(pydantic_object=MovieReview)

# The parser's format instructions are bound once as a partial variable, so only
# {movie} has to be supplied when the prompt is formatted.
review_format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    input_variables=["movie"],
    partial_variables={"format_instructions": review_format_instructions},
    template="Write a review for the movie '{movie}'.\n{format_instructions}",
)

# Render the prompt, query the model, then parse the reply into the schema.
review_request = prompt.format(movie="The Matrix but everyone is a rubber duck")
response = llm.invoke(review_request)
parsed_review = parser.parse(response.content)

# Assemble the report first and emit it with a single print call.
review_report = [
    "Structured review:",
    f"Title: {parsed_review.title}",
    f"Rating: {parsed_review.rating}/10",
    f"Pros: {parsed_review.pros}",
    f"Cons: {parsed_review.cons}",
    f"Recommended: {parsed_review.recommended}",
]
print("\n".join(review_report))

## Exercise 1: Bizarre News Summarizer

Create a system that loads weird Wikipedia pages and generates structured summaries with conspiracy-theory-style interpretations.

In [None]:
from pydantic import BaseModel, Field
from typing import List

# TODO: Define your conspiracy theory summary structure
# Exercise skeleton: replace `pass` with the five fields listed below.
# The Solution cell further down defines a completed ConspiracySummary.
class ConspiracySummary(BaseModel):
    # YOUR CODE HERE: Add fields for:
    # - title: str
    # - real_summary: str (actual factual summary)
    # - conspiracy_theory: str (humorous conspiracy interpretation)
    # - evidence_points: List[str] ("evidence" for the conspiracy)
    # - danger_level: int (1-10 scale)
    pass

# TODO: Create parser and prompt template

def analyze_weird_topic(wikipedia_url: str):
    """Load a Wikipedia page and generate a conspiracy analysis

    Exercise stub: the body is left for you to implement, so calling it as-is
    returns None.

    Args:
        wikipedia_url: URL of the Wikipedia page to analyze.
    """
    # YOUR CODE HERE:
    # 1. Load the webpage
    # 2. Use your structured parser to analyze it
    # 3. Return the parsed result
    pass

# Test with these weird Wikipedia topics:
# (each entry is a full URL that can be passed to analyze_weird_topic)
weird_topics = [
    "https://en.wikipedia.org/wiki/Cargo_cult",
    "https://en.wikipedia.org/wiki/Kentucky_meat_shower",
    "https://en.wikipedia.org/wiki/Dancing_plague_of_1518"
]

# Uncomment once analyze_weird_topic is implemented:
# analyze_weird_topic(weird_topics[0])

### Solution:

In [None]:
# Schema for the structured conspiracy summary that the parser validates the
# model's reply against.
class ConspiracySummary(BaseModel):
    title: str = Field(description="Topic title")
    real_summary: str = Field(description="Factual 2-sentence summary")
    conspiracy_theory: str = Field(description="Humorous conspiracy interpretation")
    evidence_points: List[str] = Field(description="List of 'evidence' supporting the conspiracy")
    # ge/le enforce the 1-10 scale the description promises; an out-of-range
    # value now fails validation instead of being accepted silently.
    danger_level: int = Field(description="Danger level from 1-10", ge=1, le=10)

# Parser that turns the model's text reply into a ConspiracySummary instance.
conspiracy_parser = PydanticOutputParser(pydantic_object=ConspiracySummary)

# Prompt text kept in its own constant; {content} is supplied per call, while
# {format_instructions} is pre-filled from the parser below.
CONSPIRACY_TEMPLATE = """
    Analyze this content and create both a factual summary and a humorous conspiracy theory interpretation:
    
    {content}
    
    {format_instructions}
    
    Make the conspiracy theory silly but creative. Include fake "evidence" points.
    """

conspiracy_format_instructions = conspiracy_parser.get_format_instructions()
conspiracy_prompt = PromptTemplate(
    input_variables=["content"],
    partial_variables={"format_instructions": conspiracy_format_instructions},
    template=CONSPIRACY_TEMPLATE,
)

def analyze_weird_topic(wikipedia_url, max_chars=2000):
    """Load a web page and turn it into a structured conspiracy analysis.

    Args:
        wikipedia_url: URL of the page to load.
        max_chars: Maximum number of characters of cleaned page text sent to
            the model (keeps the prompt within token limits).

    Returns:
        The value produced by conspiracy_parser.parse() for the model's reply.
    """
    loader = WebBaseLoader(wikipedia_url)
    docs = loader.load()

    # Collapse whitespace before truncating (same cleanup as the loader cells),
    # so the character budget is spent on text rather than on runs of
    # newlines and spaces.
    content = " ".join(docs[0].page_content.split())[:max_chars]

    response = llm.invoke(conspiracy_prompt.format(content=content))
    return conspiracy_parser.parse(response.content)

# Test it
result = analyze_weird_topic("https://en.wikipedia.org/wiki/Rubber_duck_debugging")

# Assemble the report first and emit it with a single print call.
conspiracy_report = [
    f"Title: {result.title}",
    f"Real Summary: {result.real_summary}",
    f"Conspiracy: {result.conspiracy_theory}",
    f"Evidence: {result.evidence_points}",
    f"Danger Level: {result.danger_level}/10",
]
print("\n".join(conspiracy_report))

## Summary

**Data Loaders**: Get content from web pages, YouTube, RSS feeds, and more.

**Output Parsers**: Transform unstructured AI responses into structured data you can actually use in applications.

**Key Patterns**:
- Use Pydantic models to define your desired output structure
- Include format instructions in your prompts
- Handle loading errors gracefully
- Truncate content to avoid token limits

Next up: Chat models, agents, and vectorstores!