In [2]:
import requests
from bs4 import BeautifulSoup
import re
import argparse
import pandas as pd

In [3]:
def scrape_text(url):
    """
    Scrape text content from a given URL.
    
    Args:
        url (str): The URL to scrape
        
    Returns:
        str: The cleaned text content from the webpage
    """
    # Send a GET request to the URL
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.RequestException as e:
        print(f"Error fetching the webpage: {e}")
        return None

    # Create a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Remove script and style elements
    for script in soup(["script", "style"]):
        script.decompose()
    
    # Get text content
    text = soup.get_text()
    
    # Clean up the text
    text = re.sub(r'\n+', '\n', text)  # Replace multiple newlines with single newline
    text = re.sub(r'\s+', ' ', text)   # Replace multiple spaces with single space
    text = text.strip()                 # Remove leading/trailing whitespace
    
    return text

In [4]:
def process_csv(file_path):
    """
    Process a CSV file and save the results to a new CSV file.
    
    Args:
        file_path (str): The path to the input CSV file

    """

    data = pd.read_csv(file_path)
    for index, row in data.iterrows():
        url = row['Case study link']
        try:
            text = scrape_text(url)
            data.loc[index, 'text'] = text
            print(f"Scraped text from {url}")
        except Exception as e:
            print(f"Error scraping text from {url}: {e}")
            data.loc[index, 'text'] = ''
    
    data.to_csv('data/case_studies_scraped.csv', index=False)

In [5]:
path = 'data/case_studies_links.csv'
process_csv(path)

Scraped text from https://floatbot.ai/case-studies/utility-case-study
Scraped text from  https://www.assembled.com/case-study/autodesk
Error fetching the webpage: 405 Client Error: Not Allowed for url: https://content.aisera.com/case-studies/reputation
Scraped text from https://content.aisera.com/case-studies/reputation
Scraped text from https://www.assembled.com/case-study/brooks-running
Scraped text from https://www.salesloft.com/resources/case-studies/better-leads-with-bionic-chatbots
Scraped text from https://www.assembled.com/case-study/capital-on-tap
Scraped text from https://www.assembled.com/case-study/classpass
Scraped text from https://www.intercom.com/customers/code-for-america
Scraped text from https://www.iadvize.com/en/success-story/vanden-borre
Error fetching the webpage: 404 Client Error: Not Found for url: https://www.assembled.com/case-study/dailypay
Scraped text from https://www.assembled.com/case-study/dailypay
Scraped text from https://chatfuel.com/success-stories/