In [15]:
#Importing Libraries

import csv
import requests
from time import sleep
from datetime import datetime
from bs4 import BeautifulSoup
import time

In [2]:
#1. Configuration
# HTTP request headers: a desktop-browser User-Agent string and an
# English Accept-Language preference.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
    "Accept-Language": "en-US,en;q=0.9"
}

# URL template for the paginated "latest" listing; the "{}" placeholder
# is meant to be filled with a page number via str.format.
BASE_URL = "https://techcrunch.com/latest/page/{}/"

In [3]:
#2. Filename Generator

def generate_filename():
    """Build the output CSV name, stamped with the current local date and time.

    Returns:
        str: A name of the form ``techcrunch_latest_<YYYYMMDD>_<HHMMSS>.csv``.
    """
    # A datetime format spec inside an f-string is applied like strftime.
    return f"techcrunch_latest_{datetime.now():%Y%m%d_%H%M%S}.csv"

In [4]:
#3. CSV Writer
def save_data_to_csv(records, filename, write_header=False):
    """Append rows to a CSV file, optionally preceded by the column header.

    Args:
        records: Iterable of rows; each row is written as one CSV line.
        filename: Path of the file to append to. It is opened in append
            mode, so existing content is kept.
        write_header: When true, the column-name row is written before
            the records.
    """
    with open(filename, mode="a", encoding="utf-8", newline="") as csv_file:
        out = csv.writer(csv_file)
        if write_header:
            out.writerow(["author", "title", "category", "datetime", "url"])
        out.writerows(records)

In [17]:
#4. HTML Fetch Function
def get_page_html(page):
    """Download one listing page and return its HTML.

    Args:
        page: Page number substituted into ``BASE_URL``.

    Returns:
        The response body as text when the status code is 200,
        otherwise ``None``.
    """
    resp = requests.get(BASE_URL.format(page), timeout=30, headers=HEADERS)
    return resp.text if resp.status_code == 200 else None


In [19]:
#5. Article List Extractor
def parse_articles(html):
    """Return the article ``<li>`` elements found in a listing page.

    Args:
        html: Page markup to parse (with the ``lxml`` parser).

    Returns:
        The ``li.wp-block-post`` elements inside the post-template ``<ul>``,
        or an empty list when that ``<ul>`` is not present.
    """
    container_classes = "wp-block-post-template is-layout-flow wp-block-post-template-is-layout-flow"
    post_list = BeautifulSoup(html, 'lxml').find("ul", class_=container_classes)

    if post_list:
        return post_list.find_all("li", class_="wp-block-post")
    return []


In [11]:
#6. Single Article - Details
def extract_article_data(article):
    """Pull the CSV fields out of one article card.

    Args:
        article: Parsed element for a single post (anything exposing a
            BeautifulSoup-style ``find``), or ``None``.

    Returns:
        ``[author, title, category, datetime, url]`` as strings, using ``""``
        for any optional piece that is absent; or ``None`` when the card has
        no title link (and therefore no URL to record).
    """
    if article is None:
        return None

    # The title link supplies both the URL and the title, so look it up once.
    title_tag = article.find("a", class_="loop-card__title-link")
    if title_tag is None:
        # Explicit check rather than a bare ``except``: unrelated errors
        # (including KeyboardInterrupt) are no longer silently swallowed.
        return None

    link = title_tag.get("href", "")
    title = title_tag.text.strip()

    author_tag = article.find("a", class_="loop-card__author")
    author = author_tag.text.strip() if author_tag else ""

    time_tag = article.find("time")
    datetime_val = time_tag.get("datetime", "") if time_tag else ""

    category_tag = article.find("a", class_="loop-card__cat")
    category = category_tag.text.strip() if category_tag else ""

    return [author, title, category, datetime_val, link]


In [12]:
#7. Main Scraper
def _oldest_year(records):
    """Return the year of the last record with a parseable datetime, or None."""
    # The listing is assumed to be newest-first, so the scan starts at the end.
    for record in reversed(records):
        stamp = record[3]  # index 3 = datetime column
        try:
            return int(stamp[:4])
        except (TypeError, ValueError):
            # Missing or malformed datetime; try the record before it.
            continue
    return None


def scrape_techcrunch(max_pages=5, min_year=2025, delay_seconds=2):
    """Scrape listing pages into a fresh timestamped CSV file.

    Pages 1..max_pages are fetched in order. Every page's records are
    appended to the CSV as soon as they are parsed. The loop stops early when
    a page cannot be fetched, when a page has no articles, or when the oldest
    dated record on a page is from before ``min_year`` (that page is still
    saved in full).

    Args:
        max_pages: Highest page number to request.
        min_year: Cutoff year; a page whose oldest dated record is earlier
            than this ends the scrape.
        delay_seconds: Pause between consecutive page requests.

    Returns:
        None. Progress and a final summary are printed.
    """
    filename = generate_filename()
    # Create the file up front so it holds the header even if nothing is scraped.
    save_data_to_csv([], filename, write_header=True)

    total_records = 0

    for page in range(1, max_pages + 1):
        print(f"Scraping page: {page}")

        html = get_page_html(page)
        if html is None:
            print("Reached end of site. Stopping.")
            break

        articles = parse_articles(html)
        if not articles:
            print("No articles found on this page. Stopping.")
            break

        # Cards without a title link yield None and are dropped.
        page_records = [rec for rec in map(extract_article_data, articles) if rec]

        # A page may yield no usable records; skip the write instead of
        # indexing into an empty list.
        if page_records:
            save_data_to_csv(page_records, filename)
            total_records += len(page_records)

        # None means no record on this page had a usable datetime, so the
        # cutoff cannot be evaluated and the scrape continues.
        year = _oldest_year(page_records)
        if year is not None and year < min_year:
            print(f"Old data detected (year {year}). Stopping scrape.")
            break

        sleep(delay_seconds)

    print(f"Scraping complete. Total articles scraped: {total_records}")
    print(f"Saved to: {filename}")


In [18]:
# Entry point: start the scrape only when this code runs as the main module.
if __name__ == "__main__":
    # 350 is the upper bound on pages requested; scrape_techcrunch has its
    # own early-stop conditions and may finish sooner.
    scrape_techcrunch(max_pages=350)


Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraping page: 5
Scraping page: 6
Scraping page: 7
Scraping page: 8
Scraping page: 9
Scraping page: 10
Scraping page: 11
Scraping page: 12
Scraping page: 13
Scraping page: 14
Scraping page: 15
Scraping page: 16
Scraping page: 17
Scraping page: 18
Scraping page: 19
Scraping page: 20
Scraping page: 21
Scraping page: 22
Scraping page: 23
Scraping page: 24
Scraping page: 25
Scraping page: 26
Scraping page: 27
Scraping page: 28
Scraping page: 29
Scraping page: 30
Scraping page: 31
Scraping page: 32
Scraping page: 33
Scraping page: 34
Scraping page: 35
Scraping page: 36
Scraping page: 37
Scraping page: 38
Scraping page: 39
Scraping page: 40
Scraping page: 41
Scraping page: 42
Scraping page: 43
Scraping page: 44
Scraping page: 45
Scraping page: 46
Scraping page: 47
Scraping page: 48
Scraping page: 49
Scraping page: 50
Scraping page: 51
Scraping page: 52
Scraping page: 53
Scraping page: 54
Scraping page: 55
Scraping page: 56
S