## Web-Scraping For Travel Triangle Website

In [4]:
# Importing necessary libraries
import requests  # For sending HTTP requests
from bs4 import BeautifulSoup  # For parsing HTML content
import pandas as pd
import re  # used to perform various operations such as matching, searching, and replacing patterns in strings.
import requests_cache  # For caching HTTP requests

# URL of the TravelTriangle page for restaurants in San Francisco
URL = 'https://traveltriangle.com/blog/restaurants-in-san-francisco/'

# Setting up caching
requests_cache.install_cache('restaurant_cache', expire_after=432000)


# In the below function scrape_traveltriangle_restaurants, we initially fetch
# the webpage using requests.get() with a specified User-Agent to mimic a
# browser request.If the request fails, it catches the error and prints a message,
# returning an empty DataFrame. Further, if the request is successful, it parses
# the HTML content with BeautifulSoup for further extraction of restaurant
# details.Lastly, it initializes an empty list restaurants where the extracted
# restaurant data will be stored.

def scrape_traveltriangle_restaurants(limit=10):

    try:
        response = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
        response.raise_for_status()  # Check if the HTTP request was successful (status code 200)
    except requests.RequestException as e:
        print(f"Failed to retrieve the webpage: {e}")
        return pd.DataFrame()

    # Parse HTML content with BeautifulSoup for efficient extraction
    soup = BeautifulSoup(response.text, 'html.parser')

    # Initialize an empty list to store restaurant data
    restaurants = []


# Optimization:
# we have used the try-except for ensuring that the ensures that if there is
# an error with one restaurant (e.g., missing address or description), it
# doesn't interrupt the entire scraping process. Instead, the scraper moves
# on to the next restaurant, maintaining efficiency and resilience. Extract
# restaurant names and details
# We have also used limit to ensures that the loop will only search through
# the first three sibling <p> tags after the address, preventing unnecessary
# parsing of additional HTML content.


    # Extract restaurant names and details efficiently, limiting the result by the 'limit' parameter
    restaurant_headings = soup.find_all('h3')[:limit]  # Only fetch the first 'limit' restaurant headings

    # Loop through each restaurant heading to extract the details
    for heading in restaurant_headings:
        try:
            # Extract restaurant name from the <h3> tag
            name = heading.get_text(strip=True)
            # Extract description from the next sibling <p> tag if available
            description = heading.find_next_sibling('p')
            description_text = description.get_text(strip=True) if description else 'No details available'
            # Extract address from the next <p> tag after description
            address = description.find_next_sibling('p') if description else None
            address_text = address.get_text(strip=True) if address else 'NA'
            opening_hours = []
            if address:
                for sibling in address.find_next_siblings('p', limit=3):
                    text = sibling.get_text(strip=True)
                    if ':' in text:
                        opening_hours.append(text)

            # Clean up and format the address
            formatted_address = re.sub(r'\s+', ' ', address_text).strip()

            # Append the restaurant data to the list
            restaurants.append({
                'Restaurant Name': name,
                'Description': formatted_address,
                'Address': '\n'.join(opening_hours) if opening_hours else 'NA'
            })
        except AttributeError as e:
            # In case of missing data or structure changes, skip this entry and continue with the next
            print(f"Error processing restaurant entry: {e}")
            continue

    # Convert the list of dictionaries to a pandas DataFrame for easy handling
    df = pd.DataFrame(restaurants)

    # Return the resulting DataFrame
    return df

# Running the scraper with a limit of 17 restaurants
df_restaurants = scrape_traveltriangle_restaurants(limit=17)

# Display the resulting DataFrame if data is scraped successfully, else notify the user
if not df_restaurants.empty:
    print("Restaurants suggested based on scraping data from Traveltriangle:",df_restaurants)
else:
    print("No restaurant data found.")


Restaurants suggested based on scraping data from Traveltriangle:                                       Restaurant Name  \
0                                        1. Zuni Cafe   
1           Looking To Book An International Holiday?   
2                                      2. Cliff House   
3                                   3. Foreign Cinema   
4                                4. Swan Oyster Depot   
5                               5. House Of Prime Rib   
6   Planning your holiday but confused about where...   
7                        6. Brenda’s French Soul Food   
8                                 7. House Of Nanking   
9                                       8. Del Popolo   
10                           9. State Bird Provisions   
11                            10. Liholiho Yacht Club   
12                                    11. Petit Crenn   
13                                           12. Benu   
14                            13. Tartine Manufactory   
15                    

## Web-Scraping for Wikipedia Page

In [6]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Set up caching
requests_cache.install_cache('restaurant_cache', expire_after=432000)

# In the function scrap_top_restaurants, we initially send an HTTP GET request
# to the Wikipedia URL, to which the response.status_code checks if the HTTP
# request was successful or has failed to retrieve data. Finally uses
# BeautifulSoup,to parse the HTML content of the page.

def scrape_top_restaurants(wikipedia_url):
    # Send a request to the Wikipedia page
    response = requests.get(wikipedia_url)
    if response.status_code != 200:
        print("Failed to retrieve the page")
        return pd.DataFrame()

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

# This section of the code is designed to gather restaurant details, such as
# their name, cuisine, and address, from the rows of a table on the
# Wikipedia page. It first looks for any tables marked as wikitable, which
# are commonly used for structured data. Once it finds the relevant table,
# the code skips over the header row and begins processing each data row.
# For each row, it checks if there are enough columns (at least three) to
# ensure there’s sufficient information. If the row contains the right
# amount of data, it extracts the restaurant's name from the first column,
# the cuisine from the second, and the address from the third. Each set of
# extracted details is then added to a list, which will eventually be used
# to create a structured dataset

    # Find the first table (adjust if necessary based on the page structure)
    tables = soup.find_all('table', {'class': 'wikitable'})
    if not tables:
        print("No tables found on the page.")
        return pd.DataFrame()

    data = []
    for table in tables:
        rows = table.find_all('tr')[1:]  # Skip header row
        for row in rows:
            cols = row.find_all('td')
            if len(cols) >= 3:  # Ensure there are enough columns
                restaurant_name = cols[0].text.strip()  # First column: Restaurant Name
                Cuisines = cols[1].text.strip()  # Second column: Address
                Address = cols[2].text.strip()  # Third column: Michelin Star
                data.append([restaurant_name, Cuisines, Address])

    # Create DataFrame
    df = pd.DataFrame(data, columns=["Restaurant Name", "Cuisines", "Address"])
    return df

# Example usage
wikipedia_url = "https://en.wikipedia.org/wiki/List_of_Michelin-starred_restaurants_in_the_San_Francisco_Bay_Area_and_Northern_California"  # Change URL as needed
top_restaurants_df = scrape_top_restaurants(wikipedia_url)
print("Top Restaurants suggested from scraping Wikipedia's page are:")
print(top_restaurants_df)


Top Restaurants suggested from scraping Wikipedia's page are:
    Restaurant Name                 Cuisines  \
0           7 Adams              Californian   
1        Acquerello                  Italian   
2             Adega  Portuguese, Californian   
3        AL's Place              Californian   
4            Angler    Contemporary, Seafood   
..              ...                      ...   
177         Trevese             New American   
178          Ubuntu               Vegetarian   
179            Wako                 Japanese   
180        Wakuriya                 Japanese   
181            [18]                     [19]   

                                Address  
0             San Francisco – Japantown  
1            San Francisco – Polk Gulch  
2                              San Jose  
3      San Francisco – Mission District  
4    San Francisco – Financial District  
..                                  ...  
177                           Los Gatos  
178                      