<a href="https://colab.research.google.com/github/PaolaMaribel18/RI_2024a/blob/main/week12/web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be opened
# by path from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Web Scraping Exercise

## 1. Introduction and Planning

### Objective:
The goal of this exercise is to build a web scraper that collects data from a chosen website. You will learn how to send HTTP requests, parse HTML content, extract relevant data, and store it in a structured format.

### Tasks:
1. Identify the data you want to scrape.
2. Choose the target website(s).
3. Plan the structure of your project.

### Example:
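For this exercise, we will scrape job listings from Indeed.com. We will extract job titles, company names, locations, and job descriptions. (Note: the code cells in this notebook apply the same workflow to recipe pages instead, extracting each recipe's title, description, ingredients, and steps.)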

## 2. Understanding the Target Website
### Objective:

Analyze the structure of the web pages to be scraped.
### Tasks:

* Inspect the target website using browser developer tools.
* Identify the HTML elements that contain the desired data.

### Instructions:

* Open your browser and navigate to the target website (e.g., Indeed.com).
* Right-click on the webpage and select "Inspect" or press Ctrl+Shift+I.
* Use the developer tools to explore the HTML structure of the webpage.
* Identify the tags and classes of the elements that contain the job titles, company names, locations, and descriptions.

## 3. Writing the Scraper
### Objective:

Develop the code to scrape data from the target website.
### Tasks:

* Send HTTP requests to the target website.
* Parse the HTML content and extract the required data.
* Handle pagination to scrape data from multiple pages.
* Implement error handling.

In [None]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import pandas as pd
import numpy as np


In [None]:
# HTTP request headers passed to every download call below; only a custom
# User-Agent string is set
headers = {
    "User-Agent": "My Web App"
}

In [None]:
# Read the raw HTML text of a local file (parsing is done separately)
def read_html_file(file_path):
    """Return the full text of the file at ``file_path``, decoded as UTF-8.

    Args:
        file_path: Path of the HTML file to read.

    Returns:
        The file's contents as a single string.
    """
    with open(file_path, mode="r", encoding="utf-8") as html_file:
        contents = html_file.read()
    return contents

In [None]:
# Load the locally saved HTML page from the mounted Drive folder and parse it
# with the "html.parser" backend
html_content = read_html_file("/content/drive/MyDrive/week12/allRecipes.html")
soup = BeautifulSoup(html_content, "html.parser")

In [None]:
# Fetch HTML content from a list of URLs
def fetch_html_from_urls(urls_list, headers, timeout=10):
    """Download every URL and return the successfully fetched pages, parsed.

    Args:
        urls_list: Iterable of URLs to request.
        headers: Dict of HTTP headers sent with each request.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        A list of BeautifulSoup objects, one per URL that was fetched
        successfully, in input order. URLs that time out, fail at the
        network level, or answer with an HTTP error status are reported with
        ``print`` and skipped, so the list can be shorter than ``urls_list``.
    """
    html_contents = []
    for url in tqdm(urls_list, desc="Downloading"):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            # Without this check a 4xx/5xx error page would be parsed and
            # stored as if it were real content. HTTPError is a
            # RequestException, so it is reported by the handler below.
            response.raise_for_status()
        except requests.Timeout:
            print(f"Timeout occurred for URL: {url}")
        except requests.RequestException as error:
            print(f"Request failed for URL: {url}. Error: {error}")
        else:
            # Parse only after the request is known to have succeeded.
            html_contents.append(BeautifulSoup(response.text, "html.parser"))
    return html_contents

In [None]:
# Extract links from an HTML page
def extract_links_from_page(html_elements):
    """Collect the link target of each link-list item on a parsed page.

    Args:
        html_elements: Parsed page (or tag) exposing ``find_all``.

    Returns:
        A list with the ``href`` of the first ``<a href=...>`` inside every
        ``<li class="comp mntl-link-list__item">``, in document order. Items
        without such an anchor are skipped.
    """
    list_items = html_elements.find_all("li", class_="comp mntl-link-list__item")
    first_anchors = (entry.find("a", href=True) for entry in list_items)
    return [anchor['href'] for anchor in first_anchors if anchor]

In [None]:
# Extract links from a specific page using CSS classes
def extract_links_from_list(page_elements):
    """Collect the ``href`` of every card anchor on a parsed listing page.

    Args:
        page_elements: Parsed page (or tag) exposing ``find_all``.

    Returns:
        A list of ``href`` values, in document order, taken from the ``<a>``
        tags matched by the card CSS class string below. Anchors without an
        ``href`` attribute are skipped.
    """
    card_class = "comp mntl-card-list-items mntl-document-card mntl-card card card--no-image"
    return [
        card['href']
        for card in page_elements.find_all("a", class_=card_class)
        if card.has_attr('href')
    ]

In [None]:
# Get titles from a list of BeautifulSoup objects
def get_titles_from_pages(pages):
    """Return the ``<title>`` text of each page, or NaN where a page has none.

    Args:
        pages: Iterable of parsed pages exposing ``find``.

    Returns:
        A list with one entry per page, in input order: the text of the
        page's ``<title>`` tag, or ``np.nan`` when the page has no such tag
        (the same missing-value marker used for recipe descriptions).
    """
    titles = []
    for page in pages:
        title_tag = page.find("title")
        # A page without <title> (e.g. an empty or non-HTML response) must
        # not abort the whole extraction with an AttributeError.
        titles.append(title_tag.text if title_tag is not None else np.nan)
    return titles


In [None]:
# Get recipe descriptions
def get_recipe_descriptions(pages_recipes):
    """Return each page's sub-heading text, or NaN where it is absent.

    Args:
        pages_recipes: Iterable of parsed recipe pages exposing ``find``.

    Returns:
        A list with one entry per page, in input order: the text of the
        first ``<p class="article-subheading type--dog">``, or ``np.nan``
        when no such element is found.
    """
    def _describe(page):
        subheading = page.find("p", class_="article-subheading type--dog")
        if not subheading:
            return np.nan
        return subheading.text

    return [_describe(page) for page in pages_recipes]

In [None]:
# Get recipe ingredients
def get_recipe_ingredients(pages_recipes):
    """Return one newline-separated ingredient string per recipe page.

    Args:
        pages_recipes: Iterable of parsed recipe pages exposing ``find_all``.

    Returns:
        A list with one string per page, in input order. Each string is the
        stripped text of every
        ``<li class="mm-recipes-structured-ingredients__list-item">``, each
        followed by a newline; a page with no such items yields ``''``.
    """
    item_class = "mm-recipes-structured-ingredients__list-item"
    return [
        "".join(
            entry.text.strip() + '\n'
            for entry in page.find_all("li", class_=item_class)
        )
        for page in pages_recipes
    ]

In [None]:
# Get recipe steps
def get_recipe_steps(pages_recipes):
    """Return one newline-separated string of preparation steps per page.

    Note that this modifies the given pages in place: every ``<figure>`` and
    ``<div>`` nested inside a step item is removed with ``decompose()``
    before the step's text is read.

    Args:
        pages_recipes: Iterable of parsed recipe pages exposing ``find_all``.

    Returns:
        A list with one string per page, in input order. Each string is the
        stripped text of every step ``<li>`` matched by the CSS class string
        below, each followed by a newline; a page with no steps yields ``''``.
    """
    step_class = "comp mntl-sc-block mntl-sc-block-startgroup mntl-sc-block-group--LI"

    def _clean_step_line(step_item):
        # Remove embedded figures/divs first so only the instruction text
        # is left when .text is read.
        for clutter in step_item.find_all(["figure", "div"]):
            clutter.decompose()
        return step_item.text.strip() + '\n'

    return [
        "".join(
            _clean_step_line(step_item)
            for step_item in page.find_all("li", class_=step_class)
        )
        for page in pages_recipes
    ]


In [None]:
# Collect the href of every link-list item found in the locally saved page
html_links = extract_links_from_page(soup)

# Download and parse each collected link; requests that fail are printed and
# skipped, so recipe_contents can be shorter than html_links
recipe_contents = fetch_html_from_urls(html_links, headers)

Downloading: 100%|██████████| 378/378 [03:29<00:00,  1.81it/s]


In [None]:
# Extract the card links from each downloaded listing page (one list per page)
recipe_links = [extract_links_from_list(page) for page in recipe_contents]
# Combine all per-page links into a single list, keeping only the first
# occurrence of each URL (dict keys preserve insertion order). A recipe that
# is linked from more than one listing page would otherwise be downloaded,
# and later stored as a row, once per occurrence.
all_links = list(dict.fromkeys(
    link for sublist in recipe_links for link in sublist
))


In [None]:
# Download and parse the page behind every collected recipe link; requests
# that fail are printed and skipped, so the result can be shorter than all_links
full_recipe_contents = fetch_html_from_urls(all_links, headers)


Downloading:  17%|█▋        | 2999/18122 [23:05<1:41:08,  2.49it/s]

In [None]:
# Run the four extractors over the downloaded recipe pages; each returns one
# entry per page, in the same order
recipe_titles = get_titles_from_pages(full_recipe_contents)
recipe_descriptions = get_recipe_descriptions(full_recipe_contents)
recipe_ingredients = get_recipe_ingredients(full_recipe_contents)
recipe_steps = get_recipe_steps(full_recipe_contents)

# Assemble the table: start from the titles, then attach the other columns
recipes_df = pd.DataFrame(recipe_titles, columns=['Title'])
recipes_df['Description'] = recipe_descriptions
recipes_df['Ingredients'] = recipe_ingredients
recipes_df['Steps'] = recipe_steps


In [None]:
# Save the DataFrame to a CSV file in the current working directory, without
# writing the row index as a column
recipes_df.to_csv('Complete_Recipes.csv', index=False)

# Read the CSV file back to verify its contents; the bare `df` expression on
# the last line makes the notebook display the table.
# NOTE(review): with pandas defaults, empty strings (e.g. a recipe with no
# ingredients found) come back as NaN after this round trip - confirm that is
# acceptable.
df = pd.read_csv('Complete_Recipes.csv')
df

In [None]:
# Show the (rows, columns) size of the table read back from the CSV
df.shape