In [1]:
import os
from urllib.parse import quote_plus

import psycopg2
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv()

True

In [4]:
def get_articles_from_url(url, timeout=30):
    """Scrape one listing page and return the articles it links to.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the HTTP response. Without a timeout
            ``requests.get`` can block forever on a stalled connection.

    Returns:
        ``None`` when the page is the site's 404 page (its ``<body>`` carries
        the ``error404`` class); otherwise a list of ``(title, href)`` tuples,
        which is empty when the page contains no article list.
    """
    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    if soup.find("body", class_="error404"):
        return None

    article_lists = soup.find_all("ul", class_="wp-block-post-template")
    if not article_lists:
        # Indexing [0] unconditionally used to crash with IndexError on pages
        # without an article list; an empty result lets callers stop paging.
        return []

    title_links = article_lists[0].find_all("a", class_="loop-card__title-link")
    return [(link.text.strip(), link["href"]) for link in title_links]

def _first_tag(soup, candidates):
    """Return the first element matching any ``(tag_name, css_class)`` pair, tried in order, or None."""
    for tag_name, css_class in candidates:
        tag = soup.find(tag_name, class_=css_class)
        if tag is not None:
            return tag
    return None


def get_article_content(url, timeout=30):
    """Download one article page and extract its metadata and text.

    Args:
        url: Absolute article URL. The fourth ``/``-separated component is
            read as the publication year (``https://host/<year>/...``).
        timeout: Seconds to wait for the HTTP response.

    Returns:
        ``None`` when the URL is not a 2025 article, when the page is the
        site's 404 page, or when it is a live storyline. Otherwise a tuple
        ``(metadata, text)`` where ``metadata`` is a dict with the keys
        ``title``, ``author``, ``time``, ``image``, ``category`` and ``url``,
        and ``text`` is the title followed by the article's paragraphs and
        ``h2`` headings, one per line.

    Raises:
        Exception: when an expected element (title, date, image, category or
            body) is missing from the page; the original error is chained.
    """
    # The year only depends on the URL, so reject unsupported links before
    # spending a network round-trip (and without IndexError on short URLs).
    url_parts = url.split("/")
    if len(url_parts) <= 3 or url_parts[3] != "2025":
        return None

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    is_404 = soup.find("body", class_="error404")
    is_live = soup.find("span", class_="wp-block-techcrunch-storyline-hero__live")
    if is_404 or is_live:
        return None

    try:
        # Each field has a primary selector and a fallback selector.
        title = _first_tag(soup, [
            ("h1", "article-hero__title"),
            ("h1", "wp-block-post-title"),
        ]).text.strip()

        author_tag = _first_tag(soup, [
            ("a", "wp-block-tc23-author-card-name__link"),
            ("a", "post-authors-list__author"),
        ])
        author = "anonymous" if author_tag is None else author_tag.text.strip()
        # Straight apostrophes are swapped for typographic ones because callers
        # in this notebook embed the value in single-quoted SQL literals.
        author = author.replace("'", "’")

        time_div = soup.find("div", class_="wp-block-post-date")
        time = time_div.find("time").text.strip()

        image = soup.find("img", class_="attachment-post-thumbnail")["src"]

        category = _first_tag(soup, [
            ("a", "is-taxonomy-category"),
            ("span", "wp-block-tenup-post-primary-term"),
        ]).text.strip()

        text_body = soup.find("div", class_="entry-content")
        body_lines = [node.text.strip() for node in text_body.find_all(["p", "h2"])]
        text = "\n".join(body_lines)
        text = f"{title}\n{text}"

    except Exception as exc:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # propagate untouched during long scraping runs.
        raise Exception("Error parsing article content for url", url) from exc

    return {"title": title, "author": author, "time": time, "image": image, "category": category, "url": url}, text


_SUMMARY_LABELS = ("Summary", "Question 1", "Question 2", "Question 3")


def _parse_summary_answer(answer):
    """Split a labelled model reply into ``{label: text}``; return None if any label is missing or empty."""
    sections = {}
    current = None
    for raw_line in answer.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        for label in _SUMMARY_LABELS:
            prefix = f"{label}:"
            if line.lower().startswith(prefix.lower()):
                current = label
                sections[label] = line[len(prefix):].strip()
                break
        else:
            # A line without a label continues the section opened last
            # (e.g. the summary text placed on the line after "Summary:").
            if current is not None:
                sections[current] = f"{sections[current]} {line}".strip()

    if all(sections.get(label) for label in _SUMMARY_LABELS):
        return sections
    return None


def summarize_content(text, llm, idx, url):
    """Ask the language model for a short summary and three follow-up questions.

    Args:
        text: Article text inserted into the prompt.
        llm: Object exposing ``invoke(prompt)`` whose result has a ``content``
            string attribute.
        idx: Position of the article in the caller's list; only used in the
            error message.
        url: Article URL; only used in the error message.

    Returns:
        ``(summary, questions)`` where ``questions`` is the three questions
        joined with ``"&&&"``.

    Raises:
        ValueError: when the model has not produced all four labelled
            sections after three attempts.
    """
    prompt = PromptTemplate.from_template("""Summarize the following news article in up to 4 sentences, building intrigue and interest. Do not include or rephrase the title in the summary, assuming that the audience has already read it. Do not ask questions in the summary. Make the summary comprehensive, but presented in simple terms, in such a way that invites the reader to ask you questions to find out the details. After that, provide a list of 3 questions that the reader can ask you to find out more details and clarifications.

Article: "{article}"

Answer exactly as in the following format:

Summary: 

Question 1:

Question 2:

Question 3:""")
    prompt = prompt.format(article=text)

    retries = 2
    for _ in range(retries + 1):
        answer = llm.invoke(prompt)
        # Straight apostrophes are swapped for typographic ones because callers
        # in this notebook embed the result in single-quoted SQL literals.
        answer = answer.content.replace("'", "’")

        # Parse by label instead of by line position / character offset, so a
        # leading blank line or missing blank lines no longer wastes a retry.
        sections = _parse_summary_answer(answer)
        if sections is not None:
            questions = "&&&".join(sections[label] for label in _SUMMARY_LABELS[1:])
            return sections["Summary"], questions

    raise ValueError(f"Unexpected response format from language model: idx in total_articles {idx}, url {url}")
    
    

In [5]:
def insert_article_to_sql(cursor, metadata, llm, idx, text):
    """Insert one article row (with an LLM summary) unless its link is already stored.

    Args:
        cursor: DB-API cursor on the database holding the ``article`` table.
        metadata: Dict with the keys ``url``, ``title``, ``author``, ``time``,
            ``image`` and ``category``.
        llm: Language model handed to ``summarize_content``.
        idx: Position of the article in the caller's list; used in log lines.
        text: Full article text to summarize.

    Returns:
        None. Prints one line saying whether the row existed or was inserted.

    Raises:
        Whatever the summarizer or the database driver raises; a failed
        insert is rolled back first so the connection stays usable.
    """
    # Values are passed as query parameters, never interpolated into the SQL
    # text: titles and summaries are untrusted scraped / generated strings.
    cursor.execute("select 1 from article where link = %s", (metadata["url"],))
    if cursor.fetchone():
        print(f"{idx} - Article already exists in database: {metadata['url']}")
        return

    summary, questions = summarize_content(text, llm, idx, metadata["url"])

    try:
        cursor.execute(
            "insert into article VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
            (
                metadata["url"],
                metadata["title"],
                metadata["author"],
                metadata["time"],
                metadata["image"],
                metadata["category"],
                summary,
                questions,
            ),
        )
        cursor.connection.commit()
    except Exception:
        # Without this the transaction stays aborted and every later
        # statement on the connection fails until a manual rollback.
        cursor.connection.rollback()
        raise

    print(f"{idx} - Inserted article {metadata['title']} into database")
    return

In [6]:
#TODO: change metadata time to date and remove metadata image

def insert_all_articles(total_articles, cursor, llm, vectorstore, text_splitter):
    """Store every listed article in SQL, then embed all of them in one batch.

    Args:
        total_articles: Sequence of entries whose element ``[1]`` is the
            article URL.
        cursor: Database cursor forwarded to ``insert_article_to_sql``.
        llm: Language model forwarded to ``insert_article_to_sql``.
        vectorstore: Object exposing ``add_documents(documents=...)``.
        text_splitter: Object exposing ``split_documents(docs)``.

    Returns:
        None. Articles that cannot be fetched are reported and skipped; the
        vector store is written once, after the whole list has been processed.
    """
    documents = []

    for position, entry in enumerate(total_articles):
        fetched = get_article_content(entry[1])
        if fetched is None:
            print(f"{position} - Article not found or not supported: {entry[1]}")
            continue

        metadata, body = fetched
        insert_article_to_sql(cursor, metadata, llm, position, body)
        documents.append(Document(page_content=body, metadata=metadata))

    # Chunk everything at the end and push the chunks in a single call.
    chunks = text_splitter.split_documents(documents)
    vectorstore.add_documents(documents=chunks)
    print("All articles inserted into database")

In [11]:
# Collect (title, url) pairs from the paginated 2025 archive.
total_articles = []
URL = "https://techcrunch.com/2025/"
PAGE_LIMIT = 1000  # exclusive upper bound on page numbers; guards against endless paging

# get_articles_from_url returns None for a 404 page; `or []` keeps extend()
# from raising TypeError in that case.
total_articles.extend(get_articles_from_url(URL) or [])

for page_number in range(2, PAGE_LIMIT):
    page_url = f"{URL}page/{page_number}"
    articles = get_articles_from_url(page_url)
    if not articles:
        # A missing or empty page marks the end of the archive.
        break
    total_articles.extend(articles)

print(len(total_articles))

298


In [7]:
# Chunking policy for the articles: sizes are measured in characters.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,  # track index in original document
    chunk_overlap=200,     # characters shared between consecutive chunks
    chunk_size=1000,       # maximum characters per chunk
)

In [8]:
# Database settings are read from the environment (populated from .env by
# load_dotenv()) so the password is never stored in the notebook.
# POSTGRES_PASSWORD is required; the other settings default to the local setup.
_pg_user = os.getenv("POSTGRES_USER", "stefan")
_pg_password = os.environ["POSTGRES_PASSWORD"]
_pg_host = os.getenv("POSTGRES_HOST", "localhost")
_pg_port = os.getenv("POSTGRES_PORT", "5432")
_pg_database = os.getenv("POSTGRES_DB", "techvector")

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = PGVector(
    embeddings=embeddings,
    collection_name="my_docs",
    # User and password are URL-quoted so characters such as "@" or "/" in
    # the credentials cannot corrupt the connection URL.
    connection=(
        f"postgresql+psycopg://{quote_plus(_pg_user)}:{quote_plus(_pg_password)}"
        f"@{_pg_host}:{_pg_port}/{_pg_database}"
    ),
)

In [2]:
llm = ChatOpenAI(model="gpt-4o-mini")

# Database settings are read from the environment (populated from .env by
# load_dotenv()) so the password is never stored in the notebook.
# POSTGRES_PASSWORD is required; the other settings default to the local setup.
conn = psycopg2.connect(
    dbname=os.getenv("POSTGRES_DB", "techvector"),
    user=os.getenv("POSTGRES_USER", "stefan"),
    password=os.environ["POSTGRES_PASSWORD"],
    host=os.getenv("POSTGRES_HOST", "localhost"),
    port=os.getenv("POSTGRES_PORT", "5432"),
)
cursor = conn.cursor()

In [46]:
# Run the full pipeline over every collected article: one SQL row per article,
# then a single batched insert of the text chunks into the vector store.
insert_all_articles(total_articles, cursor, llm, vector_store, text_splitter)

0 - Article already exists in database: https://techcrunch.com/2025/01/11/apple-board-opposes-proposal-to-abolish-dei-programs/
1 - Article already exists in database: https://techcrunch.com/2025/01/11/researchers-open-source-sky-t1-a-reasoning-ai-model-that-can-be-trained-for-less-than-450/
2 - Article already exists in database: https://techcrunch.com/2025/01/11/whats-behind-ballooning-video-game-budgets/
3 - Article already exists in database: https://techcrunch.com/2025/01/11/everything-you-missed-at-ces-2025/
4 - Article already exists in database: https://techcrunch.com/2025/01/11/i-got-soaked-driving-the-arc-sport-electric-boat/
5 - Article already exists in database: https://techcrunch.com/2025/01/11/matt-mullenweg-deactivates-wordpress-accounts-of-contributors-planning-a-fork/
6 - Article already exists in database: https://techcrunch.com/2025/01/11/nvidias-ai-empire-a-look-at-its-top-startup-investments/
7 - Article already exists in database: https://techcrunch.com/2025/01/1

In [100]:
# Fetch a single article by hand. get_article_content returns None for
# 404 / live / non-2025 pages, in which case this unpacking raises TypeError.
data, text = get_article_content("https://techcrunch.com/2025/01/09/innovaccer-aims-to-become-healthcares-ai-powerhouse-with-275m-series-f/")

In [101]:
# Manual single-article insert. insert_article_to_sql takes
# (cursor, metadata, llm, idx, text); the earlier four-argument call raised
# TypeError. idx only appears in the log lines, so a label is enough here.
insert_article_to_sql(cursor, data, llm, "manual", text)

In [29]:
# Manually roll back the transaction currently open on the cursor's connection.
cursor.execute("rollback")

In [113]:
# Fetch a single article by hand. get_article_content returns None for
# 404 / live / non-2025 pages, in which case this unpacking raises TypeError.
data, text = get_article_content('https://techcrunch.com/2025/01/07/feds-investigate-teslas-actual-smart-summon-after-several-crashes/')

In [114]:
# summarize_content takes (text, llm, idx, url); idx and url are only used in
# its error message, but omitting them raised TypeError.
results = summarize_content(text, llm, "manual", data["url"])

In [115]:
# Show the (summary, questions) tuple returned by summarize_content.
print(results)

('An investigation has been launched by the National Highway Traffic Safety Administration into 2.56 million Tesla cars following multiple crashes linked to the newly released "Actual Smart Summon" feature. This advanced remote parking capability, which allows vehicles to navigate autonomously to their owners using only cameras, has raised safety concerns after reports of incidents where the cars failed to detect obstacles. Complaints indicate that the system may not provide drivers adequate time to react before a collision occurs, prompting further scrutiny into its reliability. As Tesla continues to innovate in autonomous technology, the implications of this investigation could have significant repercussions for the company and its users.', 'What specific incidents prompted the NHTSA to initiate the investigation into Tesla’s Smart Summon feature?&&&How does the "Actual Smart Summon" differ from the previous version of the Smart Summon feature?&&&What potential consequences could Tes

In [None]:
# Stopped after article [72] (inclusive); resume from 73. The summaries have already been inserted into the db.

In [19]:
# Dump every (index, (title, url)) pair — presumably to map an idx from the
# pipeline logs back to its article.
print(list(enumerate(total_articles)))

[(0, ('Apple board opposes proposal to abolish DEI programs', 'https://techcrunch.com/2025/01/11/apple-board-opposes-proposal-to-abolish-dei-programs/')), (1, ('Researchers open source Sky-T1, a ‘reasoning’ AI model that can be trained for less than $450', 'https://techcrunch.com/2025/01/11/researchers-open-source-sky-t1-a-reasoning-ai-model-that-can-be-trained-for-less-than-450/')), (2, ('What’s behind ballooning video game budgets?', 'https://techcrunch.com/2025/01/11/whats-behind-ballooning-video-game-budgets/')), (3, ('Everything you missed at CES 2025', 'https://techcrunch.com/2025/01/11/everything-you-missed-at-ces-2025/')), (4, ('I got soaked driving the Arc Sport electric boat', 'https://techcrunch.com/2025/01/11/i-got-soaked-driving-the-arc-sport-electric-boat/')), (5, ('Matt Mullenweg deactivates WordPress contributor accounts over alleged fork plans', 'https://techcrunch.com/2025/01/11/matt-mullenweg-deactivates-wordpress-accounts-of-contributors-planning-a-fork/')), (6, ('N

In [17]:
def get_author(url, timeout=30):
    """Download one article page and return its author's name.

    Args:
        url: Absolute article URL. The fourth ``/``-separated component is
            read as the publication year (``https://host/<year>/...``).
        timeout: Seconds to wait for the HTTP response.

    Returns:
        ``None`` when the URL is not a 2025 article, when the page is the
        site's 404 page, or when it is a live storyline. Otherwise the author
        name, or ``"anonymous"`` when neither author selector matches.

    Raises:
        Exception: when extracting the author fails; the original error is
            chained.
    """
    # The year only depends on the URL, so reject unsupported links before
    # spending a network round-trip (and without IndexError on short URLs).
    url_parts = url.split("/")
    if len(url_parts) <= 3 or url_parts[3] != "2025":
        return None

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.content, "html.parser")

    is_404 = soup.find("body", class_="error404")
    is_live = soup.find("span", class_="wp-block-techcrunch-storyline-hero__live")
    if is_404 or is_live:
        return None

    try:
        # Primary selector first, then the fallback selector.
        author_tag = soup.find("a", class_="wp-block-tc23-author-card-name__link")
        if author_tag is None:
            author_tag = soup.find("a", class_="post-authors-list__author")

        author = "anonymous" if author_tag is None else author_tag.text.strip()
        # Straight apostrophes are swapped for typographic ones because callers
        # in this notebook embed the value in single-quoted SQL literals.
        author = author.replace("'", "’")

    except Exception as exc:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still
        # propagate untouched.
        raise Exception("Error parsing article content for url", url) from exc

    return author

In [7]:
def populate_author_column():
    """Re-scrape the author of every stored article and write it to ``article.author``.

    Uses the notebook-level ``cursor``. Each update is committed on its own,
    so an interrupted run keeps the rows already processed.

    Returns:
        None. Progress is printed per article.
    """
    print("started")
    cursor.execute("select link from article")
    print("executed")
    links = cursor.fetchall()
    print("fetched")
    for idx, link in enumerate(links):
        print(f"{idx} - Updating author for article {link}")
        url = link[0]
        author = get_author(url)
        if author is None:
            # get_author returns None for 404 / live / non-2025 pages; the
            # f-string SQL used to store the literal text 'None' as the author.
            print(f"{idx} - No author available, row left unchanged")
            continue
        # Parameterized query: scraped values are never spliced into SQL text.
        cursor.execute("update article set author = %s where link = %s", (author, url))
        cursor.connection.commit()

In [19]:
# Backfill the author column for every link currently in the article table.
populate_author_column()

started
executed
fetched
0 - Updating author for article ('https://techcrunch.com/2025/01/09/watch-duty-surpasses-chatgpt-as-top-free-app-on-app-store-as-california-fires-spread/',)
1 - Updating author for article ('https://techcrunch.com/2025/01/09/googles-daily-listen-ai-feature-generates-a-podcast-based-on-your-discover-feed/',)
2 - Updating author for article ('https://techcrunch.com/2025/01/09/doublepoint-launches-free-apple-watch-app-to-control-devices-with-hand-gestures/',)
3 - Updating author for article ('https://techcrunch.com/2025/01/09/google-searches-for-deleting-facebook-instagram-explode-after-meta-ends-fact-checking/',)
4 - Updating author for article ('https://techcrunch.com/2025/01/09/innovaccer-aims-to-become-healthcares-ai-powerhouse-with-275m-series-f/',)
5 - Updating author for article ('https://techcrunch.com/2025/01/09/tesla-directors-to-pay-up-to-919-million-to-settle-claims-they-overpaid-themselves/',)
6 - Updating author for article ('https://techcrunch.com/2

In [3]:
# Debugging: query pg_stat_activity for backends whose state is 'active'.
cursor.execute("SELECT * FROM pg_stat_activity WHERE state = 'active';")

In [4]:
# Print every row returned by the statement last executed on the cursor.
for row in cursor:
    print(row)

(16384, 'techvector', 6847, None, 16385, 'stefan', '', '127.0.0.1', None, 54990, datetime.datetime(2025, 1, 13, 8, 0, 13, 749892, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), datetime.datetime(2025, 1, 13, 8, 0, 20, 957313, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), datetime.datetime(2025, 1, 13, 8, 0, 20, 959947, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), datetime.datetime(2025, 1, 13, 8, 0, 20, 959947, tzinfo=datetime.timezone(datetime.timedelta(seconds=7200))), None, None, 'active', None, '1072', None, "SELECT * FROM pg_stat_activity WHERE state = 'active';", 'client backend')


In [10]:
# Manually roll back the transaction currently open on the cursor's connection.
cursor.execute("rollback;")

In [18]:
# Ask the server to terminate the backend with PID 13100.
# NOTE(review): the PID is hard-coded — presumably copied from an earlier
# pg_stat_activity listing; confirm it before re-running this cell.
cursor.execute("SELECT pg_terminate_backend(13100);")

In [18]:
# Spot-check get_author on a single article.
print(get_author("https://techcrunch.com/2025/01/11/i-got-soaked-driving-the-arc-sport-electric-boat/"))

Sean O’Kane
