In [None]:
import pdfplumber
import logging
logging.getLogger("pdfminer").setLevel(logging.ERROR)
text = ""
with pdfplumber.open("Conservatives in Academia.pdf") as pdf:
    for page in pdf.pages:
        text += page.extract_text()



clean_text = text
word_count = len(clean_text.split())
print("word count:",word_count, "\n\n",clean_text)

word count: 1597 

 Volume, Number, Month: Volume I, Number 2, April 2025 [editors will take care of this]
Headline: Affirmative Action, But Make It Conservative
Subhead: An old liberal idea might just be the solution to a new conservative problem.
Tags:
Author(s): Oren Hartstein
Author Position (guest contributor, or staff position): [position]
Author Byline: Mr. Hartstein is a sophomore at Columbia College studying physics and
math. He is a senior editor for Sundial.
Twitter Post (~280 characters before link)
Instagram Post Caption (medium paragraph)
üîó
Full article at the link in bio.
Article Body
REIMP
There is a war between Trump and Columbia‚Äîat least, that ºs what most Columbia
students would like you to believe. They tend to frame the current fight with the Trump
administration as just that‚Äînothing more than a fight with Trump. It ºs an easy story to
tell, because it portrays such tensions as a one-off political squabble. For many, that
narrative is far preferable to reali

In [163]:
from langchain_openai import ChatOpenAI
gpt4o_chat = ChatOpenAI(model="gpt-4o", temperature=0)
gpt35_chat = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)

In [164]:
from langchain_core.messages import SystemMessage
sys_msg = """You are a helpful assistant that generates article pull-out quotes. 
Each pull out quote should be 30-70 words and should capture the main themes of the article. 
The pull out quotes MUST be direct quotations from the article. 
DO NOT summarize or otherwise modify the direct quotations in any way. A quote can be 1-3 sentences long.

article: \n {article}
"""

In [165]:
from pydantic import BaseModel
from typing import TypedDict

class Quotes(BaseModel):
    quotes: list[str]

class State(TypedDict):
    article: str
    quotes: Quotes


In [166]:
llm = gpt4o_chat.with_structured_output(Quotes)

In [167]:
from langgraph.types import Send
#Node
def quote_generator(state: State):
    return {"quotes": llm.invoke([SystemMessage(sys_msg.format(article=state["article"]))])}

#Conditional Edge
def send_to_validator(state: State):
    return [Send("validator", {"article": state["article"], "quote": q}) for q in state["quotes"]]


In [168]:
from langgraph.graph import START, END, StateGraph
builder = StateGraph(State)

builder.add_node("quote_generator", quote_generator)
builder.add_edge(START, "quote_generator")
builder.add_edge("quote_generator", END)

graph = builder.compile()

config = {"configurable": {"thread_id": "3"}}
result = graph.invoke({"article": clean_text}, config=config)

In [None]:
import re
from typing import List, Dict, Any

def strip_outer_quotes(text: str) -> str:
    if not isinstance(text, str):
        return ""
    return text.strip().strip("\"'‚Äú‚Äù‚Äò‚Äô")

def normalize_text(text: str) -> str:
    """
    Normalizes text by stripping wrapping quotes, unifying punctuation, fixing common
    PDF spacing issues, and standardizing whitespace. Returns lowercase for robust matching.
    """
    if not isinstance(text, str):
        return ""
    # Remove wrapping quotes first
    text = strip_outer_quotes(text)
    # Normalize unicode punctuation and spaces
    replacements = {
        "\u201c": '"', "\u201d": '"',
        "\u2018": "'", "\u2019": "'", "\u02BC": "'",  # apostrophes
        "\u2013": "-", "\u2014": "-", "\u2212": "-", "\u2012": "-", "\u2011": "-",  # dashes
        "\u2026": "...",  # ellipsis
        "\u00A0": " ",    # nbsp
    }
    for src, tgt in replacements.items():
        text = text.replace(src, tgt)
    # Fix missing space after punctuation (common in PDFs)
    text = re.sub(r'([.,!?;:])([A-Za-z])', r'\1 \2', text)
    # Normalize dash spacing
    text = re.sub(r'\s*-\s*', '-', text)
    # Collapse whitespace and lowercase
    text = re.sub(r'\s+', ' ', text).strip().lower()
    return text

In [192]:
norm_article = normalize_text(clean_text)
quotes_list = result["quotes"].quotes
for quote in quotes_list:
    norm_quote = normalize_text(quote)
    if norm_quote in norm_article:
        continue
    # Very forgiving secondary check (ignore punctuation and spacing)
    stripped_article = re.sub(r'[^a-z0-9]+', ' ', norm_article)
    stripped_quote = re.sub(r'[^a-z0-9]+', ' ', norm_quote)
    if stripped_quote and stripped_quote in stripped_article:
        continue
    # If both checks fail, report an error
    print("ERROR: Quote not in article")
    print(norm_quote)
    print("-"*300)


'"Trump isn ºt the root of this hostility so much as he is the loudest expression of it. The evidence is conclusive: Columbia is indisputably dominated by liberals."'