In [1]:
import pandas as pd
import numpy as np
import datetime
from datetime import date

import re
import requests
from bs4 import BeautifulSoup as bs
import json

import tiktoken

In [None]:
# old version

# Function to extract text content after a specific title
def get_content_after_title(soup, title_text):
    """
    Finds the content after a given title (h1 or h2) until the next h1 or h2.
    """
    # Find the title tag (h1 or h2) with the exact text
    title = soup.find(lambda tag: tag.name in ['h1', 'h2'] and tag.get_text(strip=True) == title_text)
    if not title:
        print(f"Title '{title_text}' not found.")
        return None
    
    # Initialize an empty list to hold the content
    content = []
    
    # Iterate over the next siblings until the next h1 or h2
    for sibling in title.find_next_siblings():
        if sibling.name in ['h1', 'h2']:
            break
        # Append the text content of the sibling
        text = sibling.get_text(separator="\n", strip=True)
        if text:  # Avoid adding empty strings
            content.append(text)
    
    # Join the collected text into a single string
    return "\n".join(content) if content else None

def main():
    # URL of the web page to scrape
    url = 'https://buttondown.com/ainews/archive/ainews-shazeer-et-al-2024/'
    
    try:
        # Fetch the web page content
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return
    
    # Parse the HTML content using BeautifulSoup with the 'lxml' parser
    soup = BeautifulSoup(response.text, 'lxml')
    
    # Locate the main container holding the email content
    email_body = soup.find('div', class_='email-body-content')
    if not email_body:
        print("Couldn't find the 'email-body-content' div.")
        return
    
    # 1. Extract the Date
    date_tag = email_body.find('date')
    date_text = date_tag.get_text(strip=True) if date_tag else "Date not found."
    
    # 2. Extract the Post Title
    title_tag = email_body.find('h1', class_='subject')
    title_text = title_tag.get_text(strip=True) if title_tag else "Title not found."
    
    # 3. Extract the News Period from the first blockquote containing a date range
    blockquotes = email_body.find_all('blockquote')
    news_period = None
    for bq in blockquotes:
        # Search for a date range pattern like '3/28/2025-3/31/2025'
        match = re.search(r'\b(\d{1,2}/\d{1,2}/\d{4}-\d{1,2}/\d{1,2}/\d{4})\b', bq.get_text())
        if match:
            news_period = match.group(1)
            break
    if not news_period:
        news_period = "News period not found."
    
    # 4. Extract Content After Specific Titles
    ai_twitter_recap = get_content_after_title(soup, 'AI Twitter Recap')
    ai_reddit_recap = get_content_after_title(soup, 'AI Reddit Recap')
    ai_discord_recap = get_content_after_title(soup, 'AI Discord Recap')
    
    # Organize the extracted data into a dictionary
    extracted_data = {
        'Date': date_text,
        'Post Title': title_text,
        'News Period': news_period,
        'AI Twitter Recap': ai_twitter_recap,
        'AI Reddit Recap': ai_reddit_recap,
        'AI Discord Recap': ai_discord_recap
    }
    
    # Display the extracted information
    for key, value in extracted_data.items():
        print(f"{key}:\n{value}\n{'-'*50}")

if __name__ == "__main__":
    main()

In [15]:
file_path = 'C:\\Users\\Denis_Davydov2\\OneDrive\\Scipts\\Py_Scripts\\EPAM\\Prophet\\AI_skills\\'

print(file_path)
file_name = 'smolai_urls.txt'


links = pd.read_csv(file_path+file_name, header=None)
links

C:\Users\Denis_Davydov2\OneDrive\Scipts\Py_Scripts\EPAM\Prophet\AI_skills\


Unnamed: 0,0
0,https://news.smol.ai/issues/
1,https://news.smol.ai/issues/23-12-06-ainews-is...
2,https://news.smol.ai/issues/23-12-07-ainews-12...
3,https://news.smol.ai/issues/23-12-08-ainews-12...
4,https://news.smol.ai/issues/23-12-09-ainews-12...
...,...
492,https://news.smol.ai/issues/25-10-23-not-much
493,https://news.smol.ai/issues/25-10-24-not-much
494,https://news.smol.ai/issues/25-10-27-minimax-m2
495,https://news.smol.ai/issues/25-10-28-openai-re...


In [16]:
links['urls'] = links[0].astype(str)

#links = links.iloc[:10]

print("File read: ", file_name)
urls = links['urls'].tolist()
links

File read:  smolai_urls.txt


Unnamed: 0,0,urls
0,https://news.smol.ai/issues/,https://news.smol.ai/issues/
1,https://news.smol.ai/issues/23-12-06-ainews-is...,https://news.smol.ai/issues/23-12-06-ainews-is...
2,https://news.smol.ai/issues/23-12-07-ainews-12...,https://news.smol.ai/issues/23-12-07-ainews-12...
3,https://news.smol.ai/issues/23-12-08-ainews-12...,https://news.smol.ai/issues/23-12-08-ainews-12...
4,https://news.smol.ai/issues/23-12-09-ainews-12...,https://news.smol.ai/issues/23-12-09-ainews-12...
...,...,...
492,https://news.smol.ai/issues/25-10-23-not-much,https://news.smol.ai/issues/25-10-23-not-much
493,https://news.smol.ai/issues/25-10-24-not-much,https://news.smol.ai/issues/25-10-24-not-much
494,https://news.smol.ai/issues/25-10-27-minimax-m2,https://news.smol.ai/issues/25-10-27-minimax-m2
495,https://news.smol.ai/issues/25-10-28-openai-re...,https://news.smol.ai/issues/25-10-28-openai-re...


In [17]:
urls

['https://news.smol.ai/issues/',
 'https://news.smol.ai/issues/23-12-06-ainews-is-googles-gemini-legit',
 'https://news.smol.ai/issues/23-12-07-ainews-1272023-anthropic-says-skill-issue',
 'https://news.smol.ai/issues/23-12-08-ainews-1282023-mamba-v-mistral-v-hyena',
 'https://news.smol.ai/issues/23-12-09-ainews-1292023-the-mixtral-rush',
 'https://news.smol.ai/issues/23-12-10-ainews-12102023-not-much-happened-today',
 'https://news.smol.ai/issues/23-12-11-ainews-12112023-mixtral-beats-gpt35-and-llama2-70b',
 'https://news.smol.ai/issues/23-12-12-ainews-12122023-towards-langchain-01',
 'https://news.smol.ai/issues/23-12-13-ainews-12132023-solar107b-upstages-mistral7b',
 'https://news.smol.ai/issues/23-12-14-ainews-12142023-dollar1e7-for-superalignment',
 'https://news.smol.ai/issues/23-12-15-ainews-12152023-mixtral-instruct-beats-gemini-pro-and-matches-gpt35',
 'https://news.smol.ai/issues/23-12-16-ainews-12162023-bytedance-suspended-by-openai',
 'https://news.smol.ai/issues/23-12-18-a

In [None]:
# old 

def fetch_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebScraper/1.0; +http://effcon.com/)"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response.text

def parse_section(html, start_id, end_id):
    """
    Parses the HTML and extracts all visible text between two <h1> sections identified by their IDs.
    
    Args:
        html (str): The HTML content to parse.
        start_id (str): The ID of the starting <h1> tag.
        end_id (str): The ID of the ending <h1> tag.
        
    Returns:
        dict: A dictionary containing the extracted content.
    """
    soup = BeautifulSoup(html, 'lxml')
    
    # Locate the starting <h1> tag
    start_tag = soup.find('h1', id=start_id)
    if not start_tag:
        print(f"Start tag with id '{start_id}' not found.")
        return {}
    
    # Locate the ending <h1> tag
    end_tag = soup.find('h1', id=end_id)
    if not end_tag:
        print(f"End tag with id '{end_id}' not found.")
        return {}
    
    # Initialize variables to store content
    extracted_text = []
    
    # Iterate over the next siblings after the start_tag until the end_tag
    for sibling in start_tag.find_next_siblings():
        if sibling == end_tag:
            break
        # If the sibling is a tag and visible, extract its text
        if sibling.name:
            # Optionally, skip certain tags (e.g., script, style)
            if sibling.name in ['script', 'style']:
                continue
            text = sibling.get_text(separator="\n", strip=True)
            if text:
                extracted_text.append(text)
        # If the sibling is a NavigableString (text outside tags)
        elif hasattr(sibling, 'strip'):
            text = sibling.strip()
            if text:
                extracted_text.append(text)
    
    # Join all extracted texts into a single string
    section_content = "\n".join(extracted_text)
    
    return {f"Content from <h1 id='{start_id}'> to <h1 id='{end_id}'>": section_content}

def main():
    # URL of the web page to scrape
    url = 'https://buttondown.com/ainews/archive/ainews-41b-raised-today-openai-300b-cursor-95b/'
    
    # IDs of the starting and ending <h1> tags
    start_id = 'ai-twitter-recap'
    end_id = 'part-1-high-level-discord-summaries'
    
    try:
        # Fetch the HTML content from the URL
        html_content = fetch_html(url)
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return
    except Exception as err:
        print(f"An error occurred: {err}")
        return
    
    # Parse the desired section
    extracted_data = parse_section(html_content, start_id, end_id)
    
    if extracted_data:
        # Save the extracted information to a JSON file
        with open('extracted_content.json', 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=4)
        
        # Also, print the extracted content
        for key, value in extracted_data.items():
            print(f"{key}:\n{'='*len(key)}\n{value}\n")
        
        print("Content extracted and saved to 'extracted_content.json'.")
    else:
        print("No content extracted.")

if __name__ == "__main__":
    main()

In [2]:
# old - Vers 2

def fetch_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebScraper/1.0; +http://effcon.com/)"
    }
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response.text

def parse_section(html, start_id, end_id):
    """
    Parses the HTML and extracts all visible text between two <h1> sections identified by their IDs.
    
    Args:
        html (str): The HTML content to parse.
        start_id (str): The ID of the starting <h1> tag.
        end_id (str): The ID of the ending <h1> tag.
        
    Returns:
        dict: A dictionary containing the extracted content.
    """
    soup = BeautifulSoup(html, 'lxml')
    
    # Locate the starting <h1> tag
    start_tag = soup.find('h1', id=start_id)
    if not start_tag:
        print(f"Start tag with id '{start_id}' not found.")
        return {}
    
    # Locate the ending <h1> tag
    end_tag = soup.find('h1', id=end_id)
    if not end_tag:
        print(f"End tag with id '{end_id}' not found.")
        return {}
    
    # Initialize variables to store content
    extracted_text = []
    
    # Iterate over the next siblings after the start_tag until the end_tag
    for sibling in start_tag.find_next_siblings():
        if sibling == end_tag:
            break
        # If the sibling is a tag and visible, extract its text
        if sibling.name:
            # Optionally, skip certain tags (e.g., script, style)
            if sibling.name in ['script', 'style']:
                continue
            text = sibling.get_text(separator="\n", strip=True)
            if text:
                extracted_text.append(text)
        # If the sibling is a NavigableString (text outside tags)
        elif hasattr(sibling, 'strip'):
            text = sibling.strip()
            if text:
                extracted_text.append(text)
    
    # Join all extracted texts into a single string
    section_content = "\n".join(extracted_text)
    
    return section_content
    

# Function to extract text content after a specific title
def get_content_after_title(soup, title_text):
    """
    Finds the content after a given title (h1 or h2) until the next h1 or h2.
    """
    # Find the title tag (h1 or h2) with the exact text
    title = soup.find(lambda tag: tag.name in ['h1', 'h2'] and tag.get_text(strip=True) == title_text)
    if not title:
        print(f"Title '{title_text}' not found.")
        return None
    
    # Initialize an empty list to hold the content
    content = []
    
    # Iterate over the next siblings until the next h1 or h2
    for sibling in title.find_next_siblings():
        if sibling.name in ['h1', 'h2']:
            break
        # Append the text content of the sibling
        text = sibling.get_text(separator="\n", strip=True)
        if text:  # Avoid adding empty strings
            content.append(text)
    
    # Join the collected text into a single string
    return "\n".join(content) if content else None

def main():
    # URL of the web page to scrape
    url = 'https://buttondown.com/ainews/archive/ainews-shazeer-et-al-2024/'
    
    try:
        # Fetch the web page content
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad status codes
    except requests.RequestException as e:
        print(f"Error fetching the URL: {e}")
        return
    
    # Parse the HTML content using BeautifulSoup with the 'lxml' parser
    soup = BeautifulSoup(response.text, 'lxml')
    
    # Locate the main container holding the email content
    email_body = soup.find('div', class_='email-body-content')
    if not email_body:
        print("Couldn't find the 'email-body-content' div.")
        return
    
    # 1. Extract the Date
    date_tag = email_body.find('date')
    date_text = date_tag.get_text(strip=True) if date_tag else "Date not found."
    
    # 2. Extract the Post Title
    title_tag = email_body.find('h1', class_='subject')
    title_text = title_tag.get_text(strip=True) if title_tag else "Title not found."
    
    # 3. Extract the News Period from the first blockquote containing a date range
    blockquotes = email_body.find_all('blockquote')
    news_period = None
    for bq in blockquotes:
        # Search for a date range pattern like '3/28/2025-3/31/2025'
        match = re.search(r'\b(\d{1,2}/\d{1,2}/\d{4}-\d{1,2}/\d{1,2}/\d{4})\b', bq.get_text())
        if match:
            news_period = match.group(1)
            break
    if not news_period:
        news_period = "News period not found."
    
    # 4. Extract Content starting and ending <h1> tags
    start_id = 'ai-twitter-recap'
    end_id = 'part-1-high-level-discord-summaries'
    
    try:
        # Fetch the HTML content from the URL
        html_content = fetch_html(url)
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return
    except Exception as err:
        print(f"An error occurred: {err}")
        return
    
    # Parse the desired section
    extracted_content = parse_section(html_content, start_id, end_id)
        
    # Organize the extracted data into a dictionary
    extracted_data = {
        'Date': date_text,
        'Post Title': title_text,
        'News Period': news_period,
        'Content': extracted_content
    }
    
    # Display the extracted information
    for key, value in extracted_data.items():
        print(f"{key}:\n{value}\n{'-'*50}")

if __name__ == "__main__":
    main()

Date:
June 22, 2024
--------------------------------------------------
Post Title:
[AINews] Shazeer et al (2024): you are overpaying for inference >13x
--------------------------------------------------
News Period:
6/20/2024-6/21/2024
--------------------------------------------------
Content:
all recaps done by Claude 3 Opus, best of 4 runs. We are working on clustering and flow engineering with Haiku.
Claude 3.5 Sonnet Release by Anthropic
Improved Performance
:
@AnthropicAI
released Claude 3.5 Sonnet, outperforming competitor models on key evaluations at twice the speed of Claude 3 Opus and one-fifth the cost. It shows marked improvement in grasping nuance, humor, and complex instructions.
@alexalbert__
noted it passed
64% of internal pull request test cases
, compared to 38% for Claude 3 Opus.
New Features
:
@AnthropicAI
introduced Artifacts, allowing generation of docs, code, diagrams, graphics, and games that appear next to the chat for real-time iteration.
@omarsar0
used it to 

In [None]:
# return dataframe

def fetch_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebScraper/1.0; +http://effcon.com/)"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)  # Added timeout
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL '{url}': {e}")
        return None

def parse_section(html, start_id, end_id, url):
    """Parses HTML and extracts text between two <h1> tags by ID."""
    if not html:
        return None  # Handle cases where fetching HTML failed
    soup = BeautifulSoup(html, 'lxml')
    start_tag = soup.find('h1', id=start_id)
    end_tag = soup.find('h1', id=end_id)

    if not start_tag or not end_tag:
        print(f"Start or end tag not found for '{start_id}' and '{end_id}'.")
        print(f"Start or end tag not found for '{start_id}' and '{end_id}': {url}")
        return None

    extracted_text = []
    for sibling in start_tag.find_next_siblings():
        if sibling == end_tag:
            break
        if sibling.name and sibling.name not in ['script', 'style']:
            text = sibling.get_text(separator="\n", strip=True)
            if text:
                extracted_text.append(text)
        elif hasattr(sibling, 'strip'):
            text = sibling.strip()
            if text:
                extracted_text.append(text)
    return "\n".join(extracted_text)


def extract_data(url):
    """Extracts data from a single URL."""
    html = fetch_html(url)
    if not html:
        return None # Return None if HTML fetching failed.
    soup = BeautifulSoup(html, 'lxml')
    email_body = soup.find('div', class_='email-body-content')
    if not email_body:
        print(f"Email body not found for '{url}'.")
        return None

    date_text = email_body.find('date').get_text(strip=True) if email_body.find('date') else None
    title_text = email_body.find('h1', class_='subject').get_text(strip=True) if email_body.find('h1', class_='subject') else None
    
    news_period = None
    for bq in email_body.find_all('blockquote'):
        match = re.search(r'\b(\d{1,2}/\d{1,2}/\d{4}-\d{1,2}/\d{1,2}/\d{4})\b', bq.get_text())
        if match:
            news_period = match.group(1)
            break

    start_id = 'ai-twitter-recap'
    end_id = 'part-1-high-level-discord-summaries'
    content = parse_section(html, start_id, end_id, url)

    return {'url': url, 'title': title_text, 'date': date_text, 'news_period': news_period, 'content': content}

def main(urls):
    """Processes multiple URLs and returns a DataFrame."""
    all_data = []
    for url in urls:
        data = extract_data(url)
        if data: #Append if data extraction is successful
            all_data.append(data)
    return pd.DataFrame(all_data)


if __name__ == "__main__":
    urls = urls
    df = main(urls)
    display(df.head(3))


In [42]:
# return dataframe  ver 3

def fetch_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; browser; +http://effcon.com/)"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)  # Added timeout
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL '{url}': {e}")
        return None

def parse_section(html, start_id, end_id, url):
    """Parses HTML and extracts text between two <h1> tags by ID."""
    if not html:
        return None  # Handle cases where fetching HTML failed
    soup = bs(html, 'lxml')
    start_tag = soup.find('h1', id=start_id)
    end_tag = soup.find('h1', id=end_id)

    if not start_tag or not end_tag:
        print(f"Start or end tag not found for '{start_id}' and '{end_id}'.")
        print(f"Start or end tag not found for '{start_id}' and '{end_id}': {url}")
        return None

    extracted_text = []
    for sibling in start_tag.find_next_siblings():
        if sibling == end_tag:
            break
        if sibling.name and sibling.name not in ['script', 'style']:
            text = sibling.get_text(separator="\n", strip=True)
            if text:
                extracted_text.append(text)
        elif hasattr(sibling, 'strip'):
            text = sibling.strip()
            if text:
                extracted_text.append(text)
    return "\n".join(extracted_text)


def extract_data(url):
    """Extracts data from a single URL."""
    html = fetch_html(url)
    if not html:
        return None # Return None if HTML fetching failed.
    soup = bs(html, 'lxml')
    email_body = soup.find('div', class_='email-body-content')
    if not email_body:
        print(f"Email body not found for '{url}'.")
        return None

    date_text = email_body.find('date').get_text(strip=True) if email_body.find('date') else None
    title_text = email_body.find('h1', class_='subject').get_text(strip=True) if email_body.find('h1', class_='subject') else None
    
    news_period = None
    for bq in email_body.find_all('blockquote'):
        match = re.search(r'\b(\d{1,2}/\d{1,2}/\d{4}-\d{1,2}/\d{1,2}/\d{4})\b', bq.get_text())
        if match:
            news_period = match.group(1)
            break


    soup = bs(html.content, 'html.parser')
    # Get the main page text with tags
    content = soup.find_all(['title', 'h1', 'h2', 'p', 'li', 'ul', 'h3'])
    

    return {'url': url, 'title': title_text, 'date': date_text, 'news_period': news_period, 'content': content}

def main(urls):
    """Processes multiple URLs and returns a DataFrame."""
    all_data = []
    for url in urls:
        data = extract_data(url)
        if data: #Append if data extraction is successful
            all_data.append(data)
    return pd.DataFrame(all_data)


if __name__ == "__main__":
    urls = urls
    df = main(urls)
    display(df.head(3))


AttributeError: 'str' object has no attribute 'content'

In [8]:
# Iterate through all the UTL entities in the dataframe column df['Page_link']
for index, row in links.iterrows():
    page_link = row['urls']
    page = requests.get(page_link)
    # Parse the page content
    soup = bs(page.content, 'html.parser')
    # Get the page title and publication data
    title = soup.find('title').text
    # Get the main page text with tags
    page_text = soup.find_all(['title', 'h1', 'h2', 'p', 'li', 'ul', 'h3'])
    # Add page title and publication data
    links.at[index, 'Page_title'] = title
    # Write only text to the corresponding row of the dataframe
    links.at[index, 'Page_text'] = ' '.join([text.get_text() for text in page_text])

links

AttributeError: 'str' object has no attribute 'astype'

In [None]:
# Iterate through all the UTL entities in the dataframe column df['Page_link']
for index, row in links.iterrows():
    page_link = row['urls']
    page = requests.get(page_link)
    # Parse the page content
    soup = bs(page.content, 'html.parser')
    # Get the page title and publication data
    title = soup.find('title').text
    # Get the main page text with tags
    page_text = soup.find_all(['title', 'h1', 'h2', 'p', 'li', 'ul', 'h3'])
    # Add page title and publication data
    links.at[index, 'Page_title'] = title
    # Write only text to the corresponding row of the dataframe
    links.at[index, 'Page_text'] = ' '.join([text.get_text() for text in page_text])

links

# working version

In [77]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Return HTML content
def fetch_html(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; WebScraper/1.0; +http://effcon.com/)"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)  # Added timeout
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL '{url}': {e}")
        return None

# Parse all visible page text
def parse_page_text(html):
    """Parses HTML and returns all visible text on the page."""
    if not html:
        return None  # Handle cases where fetching HTML failed
    soup = BeautifulSoup(html, 'lxml')
    
    # Extract text from specific visible tags
    elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'li', 'ul'])
    visible_text = []
    
    for element in elements:
        text = element.get_text(separator="\n", strip=True)
        if text:
            visible_text.append(text)
    
    # Combine text content into a single string
    return "\n".join(visible_text)

# Extract data from a single URL
def extract_data(url):
    """Extracts data from a single URL."""
    html = fetch_html(url)
    if not html:
        return None  # Return None if HTML fetching failed
    
    soup = BeautifulSoup(html, 'lxml')
    
    # Extract email body content if available
    email_body = soup.find('div', class_='email-body-content')
    date_text = email_body.find('date').get_text(strip=True) if email_body and email_body.find('date') else None
    title_text = email_body.find('h1', class_='subject').get_text(strip=True) if email_body and email_body.find('h1', class_='subject') else None
    
    # Extract news period
    news_period = None
    if email_body:
        for bq in email_body.find_all('blockquote'):
            match = re.search(r'\b(\d{1,2}/\d{1,2}/\d{4}-\d{1,2}/\d{1,2}/\d{4})\b', bq.get_text())
            if match:
                news_period = match.group(1)
                break

    # Extract all visible page text
    content = parse_page_text(html)

    return {'url': url, 'title': title_text, 'date': date_text, 'news_period': news_period, 'content': content}

# Process multiple URLs and return a DataFrame
def main(urls):
    """Processes multiple URLs and returns a DataFrame."""
    all_data = []
    for url in urls:
        data = extract_data(url)
        if data:  # Append if data extraction is successful
            all_data.append(data)
    return pd.DataFrame(all_data)

# Execution
now1 = datetime.datetime.now()
print("start: ", now1)

if __name__ == "__main__":
    urls = urls
    raw_text = main(urls)
    display(raw_text.head(3))

now2 = datetime.datetime.now()
print("finish: ", now2)
print(now2-now1)

start:  2025-04-05 01:19:57.867639
Error fetching URL 'https://buttondown.com/ainews/archive/ainews-sxxx/': HTTPSConnectionPool(host='buttondown.com', port=443): Read timed out. (read timeout=10)
Error fetching URL 'https://buttondown.com/ainews/archive/ainews-talaria-apples-new-mlops-superweapon-4066/': HTTPSConnectionPool(host='buttondown.com', port=443): Read timed out. (read timeout=10)
Error fetching URL 'https://buttondown.com/ainews/archive/ainews-the-ultra-scale-playbook-training-llms-on/': HTTPSConnectionPool(host='buttondown.com', port=443): Read timed out. (read timeout=10)
Error fetching URL 'https://buttondown.com/ainews/archive/ainews-the-worlds-first-fully-autonomous-ai/': HTTPSConnectionPool(host='buttondown.com', port=443): Read timed out. (read timeout=10)
Error fetching URL 'https://buttondown.com/ainews/archive/ainews-tinyzero-reproduce-deepseek-r1-zero-for-30/': HTTPSConnectionPool(host='buttondown.com', port=443): Read timed out. (read timeout=10)


Unnamed: 0,url,title,date,news_period,content
0,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/10/2024: All the best papers for AI...,"January 11, 2024",,AI News\n[AINews] 1/10/2024: All the best pape...
1,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/11/2024: Mixing Experts vs Merging ...,"January 12, 2024",,AI News\n[AINews] 1/11/2024: Mixing Experts vs...
2,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/1/2024: How to start with Open Sour...,"January 3, 2024",,AI News\n[AINews] 1/1/2024: How to start with ...


finish:  2025-04-05 01:29:44.538359
0:09:46.670720


In [86]:
raw_text1 = raw_text.copy()
raw_text1['content'] = [bs(text).get_text() for text in raw_text1['content']]
raw_text1['content'] = [re.sub('\\n\\n+', '\n', ct) for ct in raw_text1['content']]
raw_text1['content'] = [re.sub('\ \ +', ' ', ct) for ct in raw_text1['content']]
raw_text1['content'] = raw_text1['content'].replace(r'\s+', ' ', regex=True)
raw_text1

Unnamed: 0,url,title,date,news_period,content
0,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/10/2024: All the best papers for AI...,"January 11, 2024",,AI News [AINews] 1/10/2024: All the best paper...
1,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/11/2024: Mixing Experts vs Merging ...,"January 12, 2024",,AI News [AINews] 1/11/2024: Mixing Experts vs ...
2,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/1/2024: How to start with Open Sour...,"January 3, 2024",,AI News [AINews] 1/1/2024: How to start with O...
3,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/12/2024: Anthropic coins Sleeper Ag...,"January 13, 2024",,AI News [AINews] 1/12/2024: Anthropic coins Sl...
4,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/13-14/2024: Don't sleep on #prompt-...,"January 16, 2024",,AI News [AINews] 1/13-14/2024: Don't sleep on ...
...,...,...,...,...,...
367,https://buttondown.com/ainews/archive/ainews-w...,[AINews] Welcome Interconnects and OpenRouter,"February 27, 2024",,AI News [AINews] Welcome Interconnects and Ope...
368,https://buttondown.com/ainews/archive/ainews-w...,[AINews] Welcome /r/LocalLlama!,"March 21, 2024",3/20/2024-3/21/2024,AI News [AINews] Welcome /r/LocalLlama! This i...
369,https://buttondown.com/ainews/archive/ainews-w...,[AINews] We Solved Hallucinations,"July 13, 2024",7/11/2024-7/12/2024,AI News [AINews] We Solved Hallucinations This...
370,https://buttondown.com/ainews/archive/ainews-x...,[AINews] X.ai Grok 3 and Mira Murati's Thinkin...,"February 18, 2025",2/17/2025-2/18/2025,AI News [AINews] X.ai Grok 3 and Mira Murati's...


In [88]:

now1 = datetime.datetime.now()
print("start: ", now1)


def remove_lines_starting_with_arrow(text):
    if not text:  # Handle empty or None content gracefully
        return text
    lines = text.split("\n")  # Split content into individual lines
    filtered_lines = [line for line in lines if not line.lstrip().startswith("▷")]  # Remove lines starting with '▷'
    return "\n".join(filtered_lines)  # Join the filtered lines back into a single string

# Apply the function to the content column
raw_text1['content'] = raw_text1['content'].apply(remove_lines_starting_with_arrow)


# Function to remove words containing unallowed symbols
def remove_unallowed_words(text):
    if not text:  # Handle empty or None content gracefully
        return text
    
    # Regex to keep words containing only allowed characters (letters, digits, punctuation, _, -, +, ·, \n)
    allowed_pattern = r"[a-zA-Z0-9._+\-·]+|\n"  # Allowed characters in a word or newline
    
    # Split text into words or lines, including retaining newline as a delimiter
    cleaned_text = []
    for part in text.split("\n"):  # Split by newline first to preserve "\n"
        words = part.split()  # Further split by whitespace to isolate words
        filtered_words = [word for word in words if re.fullmatch(allowed_pattern, word)]  # Retain only allowed words
        cleaned_text.append(" ".join(filtered_words))  # Reassemble lines
    
    # Rejoin lines separated by "\n"
    return "\n".join(cleaned_text)

# Apply the function to the content column
raw_text1['content'] = raw_text1['content'].apply(remove_unallowed_words)


# Function to clean the content
def clean_content(text):
    if not text:  # Handle empty/None content safely
        return text
    
    # Step 1: Delete text before 'Table of Contents' (case-insensitive)
    text = re.sub(r"^(.*?)Table of Contents", "Table of Contents", text, flags=re.IGNORECASE | re.DOTALL)
    
    # Step 2: Delete '\nthanks\n'
    text = text.replace("\nthanks\n", "\n")
    
    # Step 3: Delete '\nand\n'
    text = text.replace("\nand\n", "\n")
    text = text.replace("\n.\n", "\n")
    text = text.replace("\n. ", "\n")
    text = text.replace("\nin\n ", " in ")
    text = text.replace(" of\n ", " of ")
    text = text.replace(" by\n", " by ")
    text = text.replace("The\n", "The ")
    text = text.replace("A\n", "A ")
    text = text.replace("a\n", "a ")
    text = text.replace("Only 1 channel had so no need to summarize...", " ")
    
    

    # Step 4: Remove blank or nearly blank strings like '\n\n' or 'n \n'
    text = re.sub(r"(\n\s*\n)", "\n", text)  # Matches multiple newlines with optional spaces
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

# Apply the cleaning function to the 'content' column
raw_text1['content'] = raw_text1['content'].apply(clean_content)


# Function to remove line breaks before lowercase words
def remove_line_break_before_lowercase(text):
    if not text:  # Handle empty or None content safely
        return text
    # Use regex to remove '\n' before lowercase words
    return re.sub(r"\n([a-z])", r" \1", text)

# Apply the function to the 'content' column
raw_text1['content'] = raw_text1['content'].apply(remove_line_break_before_lowercase)



# Function to remove duplicate lines within each value
def remove_duplicate_lines(text):
    if not text:  # Handle empty/None content gracefully
        return text
    # Split the text into individual lines
    lines = text.split("\n")
    # Remove duplicate lines while preserving order, and join them back
    unique_lines = list(dict.fromkeys(lines))  # Use `dict.fromkeys` to preserve order and remove duplicates
    return "\n".join(unique_lines)

raw_text1['content'] = raw_text1['content'].apply(remove_duplicate_lines)

now2 = datetime.datetime.now()
print("finish: ", now2)
print(now2-now1)
raw_text1

start:  2025-04-05 02:05:48.080479
finish:  2025-04-05 02:06:15.004899
0:00:26.924420


Unnamed: 0,url,title,date,news_period,content
0,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/10/2024: All the best papers for AI...,"January 11, 2024",,Table of Contents GitHub - Notes from the Late...
1,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/11/2024: Mixing Experts vs Merging ...,"January 12, 2024",,Table of Contents Nous Research AI Discord Sum...
2,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/1/2024: How to start with Open Sour...,"January 3, 2024",,Table of Contents OpenAI Discord Summary Nous ...
3,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/12/2024: Anthropic coins Sleeper Ag...,"January 13, 2024",,Table of Contents Nous Research AI Discord Sum...
4,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/13-14/2024: Don't sleep on #prompt-...,"January 16, 2024",,Table of Contents OpenAI Discord Summary Nous ...
...,...,...,...,...,...
367,https://buttondown.com/ainews/archive/ainews-w...,[AINews] Welcome Interconnects and OpenRouter,"February 27, 2024",,Table of Contents PART Summary of Summaries of...
368,https://buttondown.com/ainews/archive/ainews-w...,[AINews] Welcome /r/LocalLlama!,"March 21, 2024",3/20/2024-3/21/2024,Table of Contents PART AI Twitter Recap PART S...
369,https://buttondown.com/ainews/archive/ainews-w...,[AINews] We Solved Hallucinations,"July 13, 2024",7/11/2024-7/12/2024,Table of Contents AI Twitter Recap AI Reddit R...
370,https://buttondown.com/ainews/archive/ainews-x...,[AINews] X.ai Grok 3 and Mira Murati's Thinkin...,"February 18, 2025",2/17/2025-2/18/2025,Table of Contents AI Twitter Recap AI Reddit R...


In [89]:
raw_text2 = raw_text1.copy()

raw_text2['date'] = pd.to_datetime(raw_text2['date'])

daymin = raw_text2['date'].min()
daymax = raw_text2['date'].max()

print(f"Minimum date: {daymin}")
print(f"Maximum date: {daymax}")
drange = (daymax-daymin).days
print(f'\nDate findings range from {daymin} to {daymax}, duration = {drange} days.')

raw_text2

Minimum date: 2023-11-04 00:00:00
Maximum date: 2025-04-03 00:00:00

Date findings range from 2023-11-04 00:00:00 to 2025-04-03 00:00:00, duration = 516 days.


Unnamed: 0,url,title,date,news_period,content
0,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/10/2024: All the best papers for AI...,2024-01-11,,Table of Contents GitHub - Notes from the Late...
1,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/11/2024: Mixing Experts vs Merging ...,2024-01-12,,Table of Contents Nous Research AI Discord Sum...
2,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/1/2024: How to start with Open Sour...,2024-01-03,,Table of Contents OpenAI Discord Summary Nous ...
3,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/12/2024: Anthropic coins Sleeper Ag...,2024-01-13,,Table of Contents Nous Research AI Discord Sum...
4,https://buttondown.com/ainews/archive/ainews-1...,[AINews] 1/13-14/2024: Don't sleep on #prompt-...,2024-01-16,,Table of Contents OpenAI Discord Summary Nous ...
...,...,...,...,...,...
367,https://buttondown.com/ainews/archive/ainews-w...,[AINews] Welcome Interconnects and OpenRouter,2024-02-27,,Table of Contents PART Summary of Summaries of...
368,https://buttondown.com/ainews/archive/ainews-w...,[AINews] Welcome /r/LocalLlama!,2024-03-21,3/20/2024-3/21/2024,Table of Contents PART AI Twitter Recap PART S...
369,https://buttondown.com/ainews/archive/ainews-w...,[AINews] We Solved Hallucinations,2024-07-13,7/11/2024-7/12/2024,Table of Contents AI Twitter Recap AI Reddit R...
370,https://buttondown.com/ainews/archive/ainews-x...,[AINews] X.ai Grok 3 and Mira Murati's Thinkin...,2025-02-18,2/17/2025-2/18/2025,Table of Contents AI Twitter Recap AI Reddit R...


In [97]:
# Add token count column
def num_tokens_from_string(string, encoding_name):
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string, disallowed_special=()))
    return num_tokens

raw_text2['token_count'] = raw_text2['content'].apply(lambda text:num_tokens_from_string(text, "cl100k_base"))
print("Number texts before duplecates cleaning: ",len(raw_text2))

# Drop duplicated posts
raw_text3 = raw_text2.drop_duplicates(subset=['title', 'url'])
raw_text3 = raw_text3.sort_values(by=['date'], ascending=False)

print("Number texts after duplecates cleaning: ",len(raw_text3))
print("tokens sum: ", raw_text3['token_count'].sum().sum())

Number texts before duplecates cleaning:  372
Number texts after duplecates cleaning:  372
tokens sum:  32988704


In [99]:
print("tokens max: ", raw_text3['token_count'].max())

tokens max:  8978109


In [None]:
print(raw_text2.iloc[201]['content'])

# Save result

In [98]:
# Save data 
file_path = r'C:\Users\Denis_Davydov2\OneDrive - EPAM\Prophet_AI_docs\Datasets\AI_skills\\'
file_name = f'ainews_{date.today()}.csv'

raw_text3.to_csv(file_path + file_name, index=False)
print(f"File {file_name} uploaded: ", date.today())

File ainews_2025-04-05.csv uploaded:  2025-04-05
