In [8]:
import requests
from bs4 import BeautifulSoup
from langchain_postgres import PGVector
from langchain_openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate

load_dotenv()

True

In [53]:
def get_articles_from_url(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")

    check404 = soup.find("body", class_="error404")
    if check404:
        return None
    
    articles_bulk = soup.find_all("ul", class_="wp-block-post-template")
    articles = articles_bulk[0].find_all("a", class_="loop-card__title-link")

    return [(article.text.strip(), article["href"]) for article in articles]

def get_article_content(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")

    check404 = soup.find("body", class_="error404")
    isLive = soup.find("span", class_="wp-block-techcrunch-storyline-hero__live")
    if check404 or isLive:
        return None
    if check404:
        raise Exception("Page not found for url", url)
        
    title = soup.find("h1", class_="article-hero__title")
    if title is None:
        title = soup.find("h1", class_="wp-block-post-title").text.strip()
    else:
        title = title.text.strip()
            
    author = soup.find("a", class_="wp-block-tc23-author-card-name__link")
    if author is None:
        author = soup.find("a", class_="post-authors-list__author").text.strip()
    else:
        author = author.text.strip()
        
    time_div = soup.find("div", class_="wp-block-post-date")
    time = time_div.find("time").text.strip()

        
    image = soup.find("img", class_="attachment-post-thumbnail")["src"]
    
    category = soup.find("a", class_="is-taxonomy-category")

    if category is None:
        category = soup.find("span", class_="wp-block-tenup-post-primary-term").text.strip()
    else:
        category = category.text.strip()
    
    text_body = soup.find("div", class_="entry-content")
    text_paragraphs_and_titles = text_body.find_all(["p", "h2"])
    text = "\n".join([p.text.strip() for p in text_paragraphs_and_titles])
    text = f"{title}\n{text}"
    
        
    return {"title": title, "author": author, "time": time, "image": image, "category": category, "url": url}, text


def summarize_content(text, llm):
    prompt = PromptTemplate.from_template("""Summarize the following news article in up to 4 sentences, building intrigue and interest. Do not include or rephrase the title in the summary, assuming that the audience has already read it. Do not ask questions in the summary. Make the summary comprehensive, but presented in simple terms, in such a way that invites the reader to ask you questions to find out the details. After that, provide a list of 3 questions that the reader can ask you to find out more details and clarifications.

Article: "{article}"

Answer exactly as in the following format:

Summary: 

Question 1:

Question 2:

Question 3:""")
    prompt = prompt.format(article=text)
    answer = llm.invoke(prompt)

    components = answer.content.split('\n')
    summary = components[0][9:]
    q1 = components[2][12:].strip()
    q2 = components[4][12:].strip()
    q3 = components[6][12:].strip()
    questions = "&&&".join([q1, q2, q3])

    return summary, questions
    
    

In [3]:
print(get_article_content("https://techcrunch.com/storyline/live-updates-ces-2025-nvidia-samsung-sony-toyota-reveals-plus-more/"))


None


In [5]:
total_articles = []
URL = "https://techcrunch.com/2025/"

total_articles.extend(get_articles_from_url(URL))

for i in range(2, 1000):
    URL = f"https://techcrunch.com/2025/page/{i}"
    articles = get_articles_from_url(URL)
    if not articles:
        break
    total_articles.extend(articles)

print(len(total_articles))

240


In [6]:
data, text = get_article_content("https://techcrunch.com/2025/01/01/internal-spacex-documents-show-the-sweet-stock-deals-offered-to-investors-like-a16z-gigafund/")
print(data)
print(text)

{'title': 'Internal SpaceX documents show the sweet stock deals offered to investors like a16z, Gigafund', 'author': 'Julie Bort', 'time': '8:04 AM PST · January 1, 2025', 'image': 'https://techcrunch.com/wp-content/uploads/2020/05/GettyImages-1216300269.jpg?w=1024', 'category': 'Fundraising', 'url': 'https://techcrunch.com/2025/01/01/internal-spacex-documents-show-the-sweet-stock-deals-offered-to-investors-like-a16z-gigafund/'}
Internal SpaceX documents show the sweet stock deals offered to investors like a16z, Gigafund
Like many highly valued startups, SpaceX sometimes allows its employees to cash out some of their shares by selling to company-authorized outside investors.
TechCrunch has gotten a peek at an internal SpaceX document about such a tender offer from May 2022. Musk posted on X last month that SpaceX holds such sales for employees about every 6 months.
With SpaceX’s latest tender offer sale, which occurred in December, 2024, according to CNBC, now valuing the company at $3

In [55]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
#all_splits = text_splitter.split_documents(docs)

#print(f"Split blog post into {len(all_splits)} sub-documents.")

In [39]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = PGVector(
    embeddings=embeddings,
    collection_name="my_docs",
    connection="postgresql+psycopg://stefan:gigelfrone112@localhost:5432/techvector",
)

In [50]:
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

# an Engine, which the Session will use for connection
# resources
engine = create_engine("postgresql+psycopg://stefan:gigelfrone112@localhost:5432/techvector")

# create session and add objects
with Session(engine) as session:
    coll = vector_store.get_collection(session)
    print(len(coll.embeddings))

0


In [100]:
all_docs = []

for article in total_articles:
    result = get_article_content(article[1])
    if result is None:
        continue
    article_data, text = result
    article_doc = Document(page_content=text, metadata=article_data)
    all_docs.append(article_doc)

splits = text_splitter.split_documents(all_docs)
# all_splits.extend(splits)

In [103]:
print(len(splits))
print(splits[2])

436
page_content='Will.i.am is flogging electronics again
Will.i.am’s history with consumer tech has been — let’s just say “spotty.” The musician/entrepreneur loves gadgets, this much we can say for certain. For his latest trick, the Black Eyed Pea is returning as LG’s “Experiential Architect” in conjunction with CES 2025.
What does such a role entail? Suggesting ideas for Bluetooth speakers and headphones, apparently. LG on Monday unveiled its first collaboration with the entertainer, Xboom by Will.i.am. Products in the line will be “professionally tuned by Will.i.am,” per LG.
At launch, the Xboom by Will.i.am line includes three speakers — the Xboom Bounce, Grab, and Stage 301 — along with a pair of Xboom Buds.' metadata={'title': 'Will.i.am is flogging electronics again', 'author': 'Brian Heater', 'time': '10:15 AM PST · January 6, 2025', 'image': 'https://techcrunch.com/wp-content/uploads/2025/01/lg-william.jpg?w=1024', 'category': 'Hardware', 'url': 'https://techcrunch.com/2025/01

In [None]:
#done: scraping, document creation, splitting
#todo: postgres table creation (with url as pk), data insertion (first in sql, then in vectorstore), 


In [15]:
llm = ChatOpenAI(model="gpt-4o-mini")


In [54]:
llm_invoke_test = summarize_content(text, llm)

In [56]:
print(llm_invoke_test)

("Recent internal documents from SpaceX reveal intriguing details about a tender offer that allowed employees to sell their shares to select investors, including prominent firms like Andreessen Horowitz and Gigafund. While employees received shares at a discounted price of $70 each, the original prices in primary rounds were significantly higher, raising questions about the implications for employee equity and potential payouts. The documents also highlight the intricate dynamics between common and preferred stock, including the preferential treatment of preferred shareholders in the event of a sale. With SpaceX's valuation soaring to $350 billion, the stakes for these secondary sales are higher than ever, especially for those holding common stock.", "What are the key differences between common and preferred stock in the context of SpaceX's offerings?&&&How do the recent tender offers and share prices compare to those from previous years?&&&Who are the notable investors involved in thi

In [42]:
print(summary, q1, q2, q3, sep="\n")

Internal SpaceX documents reveal the lucrative stock deals available to select investors, including prominent firms like Andreessen Horowitz and Gigafund, highlighting the stark contrast between employee share prices and those offered to investors. As of late 2024, SpaceX's valuation soared to an astonishing $350 billion, with authorized buyers able to purchase shares at just $70 each, a fraction of the $270 price during the last primary investment round. While these secondary sales provide employees a rare opportunity to liquidate their shares, they also underscore the preferential treatment afforded to preferred shareholders in the event of a company sale. With the next tender offer potentially reaching up to $110 per share, the dynamics of SpaceX's funding strategies continue to intrigue investors and employees alike.
How do the share prices for employees compare to those for primary investors in SpaceX? 
What are the implications of the liquidation preferences for common stockholde

In [48]:
questions = "&&&".join([q1, q2, q3])

In [49]:
questions

'How do the share prices for employees compare to those for primary investors in SpaceX? &&&What are the implications of the liquidation preferences for common stockholders if SpaceX were to be sold? &&&Which notable investors were authorized to participate in the recent stock sale, and what connections do they have to Elon Musk?'