In [3]:
import csv
import datetime
import itertools
import os
import pickle

import requests
from bs4 import BeautifulSoup

# Sites to scrape: each entry carries the page URL and the display name that
# ends up in the "Website" column of the exported data.
websites = [
    {'url': url, 'name': name}
    for name, url in (
        ('Mirah', 'https://www.mirah.com/'),
        ('Owlhealth', 'https://www.owl.health/'),
        # Add more (name, url) pairs here
    )
]

# Words of interest (not referenced by the functions visible in this file).
target_words = [
    'a',
    'the',
    'develop',
    'an',
    'development',
    'progress',
]

# Default output locations for the CSV export and its pickled copy.
csv_file, pickle_file = 'scraped_data.csv', 'scraped_data.pickle'


def scrape_websites(sites=None, timeout=30):
    """Download each configured site and collect its headings and paragraphs.

    Every ``<h1>`` and ``<p>`` element of a page is extracted as stripped
    text, and the i-th heading is paired with the i-th paragraph.

    Args:
        sites: Iterable of dicts with ``'url'`` and ``'name'`` keys. Defaults
            to the module-level ``websites`` list.
        timeout: Seconds to wait for the server before giving up on a site.

    Returns:
        A list of ``(website name, title, text)`` tuples. When a page has
        more paragraphs than headings (or the reverse), the missing side is
        an empty string so that no extracted text is dropped.

    A site that cannot be fetched (connection failure, timeout, or an HTTP
    error status) is reported on stdout and skipped; the remaining sites are
    still processed.
    """
    if sites is None:
        sites = websites

    data = []

    for website in sites:
        # Keep the try body limited to the network call: only that can raise
        # a requests exception.
        try:
            # Without an explicit timeout a stalled server would block the
            # whole run indefinitely.
            response = requests.get(website['url'], timeout=timeout)
            # Turn 4xx/5xx answers into an exception so error pages are not
            # scraped as if they were real content.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error scraping {website['name']}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        titles_text = [title.get_text().strip() for title in soup.find_all('h1')]
        texts_text = [text.get_text().strip() for text in soup.find_all('p')]

        # zip() would stop at the shorter list and silently discard every
        # paragraph beyond the number of headings; zip_longest keeps them.
        data.extend(
            (website['name'], title, text)
            for title, text in itertools.zip_longest(
                titles_text, texts_text, fillvalue=''
            )
        )

    return data


def save_to_csv(data, filename):
    """Write the scraped rows to ``filename`` as UTF-8 CSV.

    A ``Website, Title, Text`` header row is written first, followed by one
    CSV row per item of ``data``. An existing file is overwritten. A short
    confirmation message is printed once the file has been written.
    """
    header = ['Website', 'Title', 'Text']

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(header)
        for record in data:
            csv_out.writerow(record)

    print(f"Scraped data saved in {filename} file.")


def convert_to_pickle(filename, output_file=None):
    """Re-serialise a CSV file as a pickled list of rows.

    Args:
        filename: Path of the UTF-8 CSV file to read.
        output_file: Path of the pickle file to write. When omitted, the
            module-level ``pickle_file`` is used, so callers that pass only
            ``filename`` keep writing to the same place as before.

    The pickle holds a list with one list of strings per CSV row, header row
    included. An existing destination file is overwritten. A confirmation
    message naming both files is printed afterwards.
    """
    destination = pickle_file if output_file is None else output_file

    # Read the CSV file
    with open(filename, 'r', newline='', encoding='utf-8') as file:
        data = list(csv.reader(file))

    # Convert the data to pickle format
    with open(destination, 'wb') as file:
        pickle.dump(data, file)

    print(f"Converted {filename} to {destination}.")


def run_weekly_scraping():
    """Run one full cycle: scrape, export to a timestamped CSV, then pickle it.

    The CSV name embeds the local date and time taken before scraping starts,
    so each run writes a new CSV file. The pickle conversion is delegated to
    ``convert_to_pickle``.
    """
    # Timestamp is captured up front, before any network activity.
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    new_csv_file = f"scraped_data_{timestamp}.csv"

    # Scrape and write the CSV in one step.
    save_to_csv(scrape_websites(), new_csv_file)

    # Mirror the freshly written CSV into pickle form.
    convert_to_pickle(new_csv_file)


# Run a single scrape-and-export cycle only when this file is executed as a
# script; importing it as a module triggers no scraping.
if __name__ == '__main__':
    run_weekly_scraping()


Scraped data saved in scraped_data_2023-07-14_12-24-49.csv file.
Converted scraped_data_2023-07-14_12-24-49.csv to scraped_data.pickle.
