In [1]:
import requests # type: ignore
from bs4 import BeautifulSoup # type: ignore
import pandas as pd # type: ignore

In [2]:
def _element_text(parent, tag, css_class, default):
    """Return the stripped text of the first matching child, or ``default`` if there is none."""
    element = parent.find(tag, class_=css_class)
    return element.text.strip() if element else default


def scrape_page(page_url, timeout=10):
    """Download one news listing page and extract a record per article card.

    Args:
        page_url: URL of the listing page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``. Without
            one, a server that never answers would block the notebook
            indefinitely.

    Returns:
        A list of dicts with the keys 'Headline', 'Author' and 'Date'. A field
        whose element is not found in the card holds the placeholder
        'No Headline', 'No Author' or 'No Date' respectively.

    Raises:
        Whatever ``requests.get`` raises for connection failures or timeouts.
        The HTTP status code is not checked; the response body is parsed as-is.
    """
    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): the selectors below are the site's full utility-class
    # strings; presumably they stop matching if the page markup changes.
    news_articles = soup.find_all('div', class_='flex-1 h-full sm:mt-1 sm:pb-1.5 flex-shrink flex flex-col justify-between overflow-hidden')

    articles = []
    for article in news_articles:
        articles.append({
            'Headline': _element_text(
                article,
                'a',
                'my-1 sm:mb-0 text-sm sm:text-base font-semibold leading-snug sm:leading-5 line-clamp-3 sm:line-clamp-2 text-title',
                'No Headline',
            ),
            'Author': _element_text(article, 'a', 'text-xs truncate text-author', 'No Author'),
            'Date': _element_text(article, 'time', 'flex-shrink-0 text-xs text-time', 'No Date'),
        })

    return articles

def scrape_planetf1_articles(base_url, total_pages):
    """Scrape pages 1 through ``total_pages`` of a paginated news listing.

    Args:
        base_url: Listing URL without a query string; ``?page=N`` is appended
            for each page number.
        total_pages: Number of pages to fetch, starting at page 1. Zero or a
            negative value fetches nothing.

    Returns:
        A single flat list of the records returned by ``scrape_page`` for each
        page, in page order.
    """
    all_articles = []

    # range() excludes its upper bound, so +1 makes the last requested page
    # exactly ``total_pages``.
    for page in range(1, total_pages + 1):
        page_url = f"{base_url}?page={page}"
        all_articles.extend(scrape_page(page_url))

    return all_articles

# Listing to scrape and the page count handed to the scraper.
base_url = 'https://www.planetf1.com/news'
total_pages = 2
articles = scrape_planetf1_articles(base_url, total_pages)

# Pivot the per-article records into one list per column. The loop variable is
# named differently from the list so it does not shadow `articles`.
df = pd.DataFrame({
    column: [record[column] for record in articles]
    for column in ('Headline', 'Author', 'Date')
})

In [None]:
# Write the scraped table to disk without the index column, then confirm on stdout.
df.to_csv(path_or_buf='Articles_Output.csv', index=False)
print("Data is successfully saved to 'Articles_Output.csv'.")

In [None]:
# For each field, count the rows whose value differs from its placeholder
# string; summing the boolean mask gives the number of True entries.
Valid_Headlines = int((df['Headline'] != 'No Headline').sum())
Valid_Authors = int((df['Author'] != 'No Author').sum())
Valid_Dates = int((df['Date'] != 'No Date').sum())

# One print call; the newline separator keeps each total on its own line.
print(
    f"Total Valid Headlines: {Valid_Headlines}",
    f"Total Valid Authors: {Valid_Authors}",
    f"Total Valid Dates: {Valid_Dates}",
    sep='\n',
)

In [None]:
# Print the DataFrame's plain-text representation for a quick visual check.
print(df)