In [3]:
import csv
import pickle
from datetime import date
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

# Sites to scrape. Each entry is a dict holding the page URL plus the
# metadata ('company', 'document_type', 'document_format') kept with it.
websites = [
    {
        'url': 'https://www.owl.health/',
        'company': 'Owl health',
        'document_type': 'News',
        'document_format': 'webpage',
    },
    # Append further sites here, using the same four keys.
]

# Keyword list; not referenced by the rest of the code visible in this cell.
target_words = ['a', 'an', 'measurement-based']

# Name of the CSV output file, stamped with the date of the run.
csv_file = 'scraped_data_{}.csv'.format(date.today())

# Accumulators for the scraped rows: one keyed by detected document type,
# one keyed by document format.
document_types = dict()
document_formats = dict()

# Function to determine document type based on title and text
def determine_document_type(title, text):
    """Classify a document by the first known keyword found in its title or text.

    Matching is a case-insensitive substring search, so a keyword also matches
    inside a longer word (for example 'team' matches 'steam'). Keywords are
    tried in the order listed below and the first hit wins.

    Args:
        title: Title string of the document.
        text: Body text string of the document.

    Returns:
        The label paired with the first matching keyword, or 'unknown' when
        no keyword occurs in either string.
    """
    # (keyword, label) pairs in priority order.
    keyword_labels = (
        ('blog', 'blog'),
        ('news', 'News'),
        ('whitepaper', 'whitepaper'),
        ('press release', 'press_release'),
        ('company info', 'company_info'),
        ('product info', 'product_info'),
        ('case study', 'case_study'),
        ('who we serve', 'who_we_serve'),
        ('faq', 'faq'),
        ('team', 'team'),
    )
    haystacks = (title.lower(), text.lower())
    for keyword, label in keyword_labels:
        if any(keyword in haystack for haystack in haystacks):
            return label
    return 'unknown'

# Function to determine document format based on website URL
def determine_document_format(url):
    """Return 'pdf' when the URL points at a PDF file, otherwise 'webpage'.

    The '.pdf' suffix is matched case-insensitively, both against the whole
    URL and against its path component. Checking the path means a link is
    still recognised when a query string or fragment follows the file name
    (for example 'report.pdf?download=1' or 'report.pdf#page=2').

    Args:
        url: URL string to inspect.

    Returns:
        'pdf' or 'webpage'.
    """
    path = urlsplit(url).path
    if url.lower().endswith('.pdf') or path.lower().endswith('.pdf'):
        return 'pdf'
    return 'webpage'

# Seconds to wait for a site before giving up on it.
REQUEST_TIMEOUT_SECONDS = 30

# Scrape every configured website and group the extracted rows by detected
# document type and by document format.
for website in websites:
    # A timeout keeps one unresponsive host from hanging the whole run, and
    # raise_for_status() stops an HTTP error page from being scraped as if it
    # were content. A failing site is reported and skipped.
    try:
        response = requests.get(website['url'], timeout=REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {website['url']}: {error}")
        continue

    # Parse the HTML and collect the heading and paragraph elements
    soup = BeautifulSoup(response.text, 'html.parser')
    titles = soup.find_all('h1')
    texts = soup.find_all('p')

    # Extract the stripped text from the elements
    titles_text = [title.get_text().strip() for title in titles]
    texts_text = [text.get_text().strip() for text in texts]

    company = website['company']
    document_format = website['document_format']

    # NOTE: headings and paragraphs are paired purely by position, and zip()
    # stops at the shorter list, so surplus <h1> or <p> elements are dropped.
    for title, text in zip(titles_text, texts_text):
        document_type = determine_document_type(title, text)

        # Field order matches the CSV header row:
        # Company, Document Type, Document Format, Title, Text.
        row = (company, document_type, document_format, title, text)

        document_types.setdefault(document_type, []).append(row)
        document_formats.setdefault(document_format, []).append(row)

# Column layout shared by every header row written to the CSV.
csv_header = ['Company', 'Document Type', 'Document Format', 'Title', 'Text']

# Write the scraped data to the CSV file: first the rows grouped by document
# type, then a separator, then the same rows grouped by document format.
# The encoding is fixed to UTF-8 so scraped text with non-ASCII characters is
# written the same way regardless of the platform's default encoding.
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(csv_header)
    for doc_type, rows in document_types.items():
        writer.writerow([])
        writer.writerow([f"Document Type: {doc_type}"])
        writer.writerow(csv_header)
        writer.writerows(rows)

    writer.writerow([])
    writer.writerow(['------------------------'])
    writer.writerow([])

    for doc_format, rows in document_formats.items():
        writer.writerow([])
        writer.writerow([f"Document Format: {doc_format}"])
        writer.writerow(csv_header)
        writer.writerows(rows)

print(f"Scraped data saved in {csv_file} file.")

pickle_file = f'scraped_data_{date.today()}.pickle'

# Read the CSV back in. The csv module expects files opened with newline='',
# and the encoding must match the one used for writing above.
with open(csv_file, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    next(reader)  # Skip the leading header row
    # Every remaining line is kept, including the blank rows, the section
    # title rows and the repeated per-section header rows.
    data = list(reader)

# Save the data as a pickle file
with open(pickle_file, 'wb') as file:
    pickle.dump(data, file)

print(f"Scraped data saved in {pickle_file} file.")


Scraped data saved in scraped_data_2023-07-14.csv file.
Scraped data saved in scraped_data_2023-07-14.pickle file.
