# 1a. Get Recipe Links from WhatsGabyCooking.com

In [1]:
import re

import requests, bs4
from bs4 import BeautifulSoup as bs
import pandas as pd
from pprint import pprint
from fake_useragent import UserAgent

### Step 1. Pulling all of the links to recipe pages

In [2]:
# Request headers sent with every page download: a fixed desktop-browser
# User-Agent string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.36'
    ),
}

In [4]:
def check_response(url, timeout=30):
    '''
    Download a page and parse it into a BeautifulSoup object.

    Arguments:
    -url: the url of the page to download
    -timeout: seconds to wait for the server before giving up (default 30)

    Returns: the parsed page, or None when the server answers with any
    status code other than 200 (a message is printed in that case)

    Raises: any requests exception (connection error, timeout, ...) is
    propagated to the caller unchanged
    '''
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    status = response.status_code
    if status != 200:
        print(f"Oops! Received status code {status} for {url}")
        # Return an explicit None instead of falling through to an unbound
        # local variable.
        return None

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    return bs(response.text, 'html.parser')

In [37]:
def get_category_pages(category_url, category):
    '''
    Build the list of page links (page 1, page 2, ..., last page) for one
    recipe category.

    Arguments:
    -category_url: the url of a specific category of recipe
    -category: the label of the category, as used in the category url

    Returns: a list of page links for the category. The first entry is always
    category_url itself; it is the only entry when the category page could not
    be fetched or when no usable pagination is found on it.
    '''
    # Set up the response
    category_page_soup = check_response(category_url)

    # The first page is just the category link
    category_page_links = [category_url]

    # 'sauces' has only 1 page; a missing soup means there is nothing to parse
    if category_page_soup is None or category == 'sauces':
        return category_page_links

    pagination_links = category_page_soup.find_all('a', {'class': 'page-link'})
    if len(pagination_links) < 2:
        # No second-to-last pagination link to read, so treat the category
        # as a single page instead of raising IndexError.
        return category_page_links

    # The second-to-last pagination link is read as the final page number
    # (presumably the last link is a "next" control -- confirm on the site).
    final_page_str = pagination_links[-2].get_text()
    try:
        final_page = int(final_page_str)
    except ValueError:
        print(f"Oops! Could not read the last page number for {category_url}: {final_page_str!r}")
        return category_page_links

    category_page_links.extend(
        f'https://whatsgabycooking.com/category/categories/{category}/page/{i}'
        for i in range(2, final_page + 1)
    )

    return category_page_links

def get_category_recipes(url):
    '''
    Collect the recipe links shown on one page of a category.

    Arguments:
    -url: The url for a single page in specific category (e.g. page 2 of 14)
        - A single value in the list returned by function get_category_pages (defined above)

    Returns: a list of recipe links for that page; an empty list when the page
    could not be fetched
    '''
    recipe_soup = check_response(url)
    if recipe_soup is None:
        return []

    # Each recipe is listed under an <h2> heading that wraps the link
    recipe_headings = recipe_soup.find_all('h2', {'class': 'entry-title archive-entry-title'})

    recipe_links = []
    for heading in recipe_headings:
        anchor = heading.find('a')
        href = anchor.get('href') if anchor is not None else None
        # Skip headings that carry no link instead of raising on them
        if href is not None:
            recipe_links.append(href)

    return recipe_links

In [38]:
# Download and parse the top-level categories page; the next cell reads the
# category headings from it.
url = 'https://whatsgabycooking.com/category/categories/'
whats_gaby_cooking = check_response(url)

In [39]:
# Every category is listed under an <h2> heading that wraps its link.
# NOTE(review): this stores each link's href, while the output displayed below
# shows bare category labels -- confirm which form the later cells expect.
cat_html_links = whats_gaby_cooking.find_all('h2',{'class':'entry-title archive-entry-title'})
categories = [heading.find('a')['href'] for heading in cat_html_links]

In [40]:
# Display the collected categories (the notebook prints the list below)
categories

['appetizerssmall-bites',
 'avocado-central',
 'bowls',
 'bread1',
 'breakfast-and-brunch',
 'snacks',
 'main-course',
 'dessert',
 'beveragecocktail',
 'food-restrictions-allergies',
 'game-day',
 'grilling',
 'healthyfigure-friendly',
 'holiday',
 'lunch',
 'meat',
 'pastarisotto',
 'pizza',
 'poultry-categories',
 'salad',
 'sandwiches-burgers',
 'sauces',
 'seafood',
 'seasonal',
 'side-dish',
 'snacks-categories',
 'soupsandwichburger',
 'tacos',
 'vegetarian']

In [41]:
# Prefix every category label with the categories base url
category_urls = [
    'https://whatsgabycooking.com/category/categories/' + category_label
    for category_label in categories
]
category_urls

['https://whatsgabycooking.com/category/categories/appetizerssmall-bites',
 'https://whatsgabycooking.com/category/categories/avocado-central',
 'https://whatsgabycooking.com/category/categories/bowls',
 'https://whatsgabycooking.com/category/categories/bread1',
 'https://whatsgabycooking.com/category/categories/breakfast-and-brunch',
 'https://whatsgabycooking.com/category/categories/snacks',
 'https://whatsgabycooking.com/category/categories/main-course',
 'https://whatsgabycooking.com/category/categories/dessert',
 'https://whatsgabycooking.com/category/categories/beveragecocktail',
 'https://whatsgabycooking.com/category/categories/food-restrictions-allergies',
 'https://whatsgabycooking.com/category/categories/game-day',
 'https://whatsgabycooking.com/category/categories/grilling',
 'https://whatsgabycooking.com/category/categories/healthyfigure-friendly',
 'https://whatsgabycooking.com/category/categories/holiday',
 'https://whatsgabycooking.com/category/categories/lunch',
 'http

In [43]:
# One list of page links per category, in the same order as `categories`
category_pages = []
for category_label, category_link in zip(categories, category_urls):
    pages_for_category = get_category_pages(category_link, category_label)
    category_pages.append(pages_for_category)
    print(f'{category_link} completed')

https://whatsgabycooking.com/category/categories/appetizerssmall-bites completed
https://whatsgabycooking.com/category/categories/avocado-central completed
https://whatsgabycooking.com/category/categories/bowls completed
https://whatsgabycooking.com/category/categories/bread1 completed
https://whatsgabycooking.com/category/categories/breakfast-and-brunch completed
https://whatsgabycooking.com/category/categories/snacks completed
https://whatsgabycooking.com/category/categories/main-course completed
https://whatsgabycooking.com/category/categories/dessert completed
https://whatsgabycooking.com/category/categories/beveragecocktail completed
https://whatsgabycooking.com/category/categories/food-restrictions-allergies completed
https://whatsgabycooking.com/category/categories/game-day completed
https://whatsgabycooking.com/category/categories/grilling completed
https://whatsgabycooking.com/category/categories/healthyfigure-friendly completed
https://whatsgabycooking.com/category/categories

In [50]:
# category_pages holds one list per category - chain them into a single flat
# list of page links
import itertools

all_category_pages = list(itertools.chain.from_iterable(category_pages))
print(len(all_category_pages))

322


['https://whatsgabycooking.com/category/categories/appetizerssmall-bites',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/2',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/3',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/4',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/5',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/6',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/7',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/8',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/9',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/10',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/11',
 'https://whatsgabycooking.com/category/categories/appetizerssmall-bites/page/12',
 'https://whatsgabyc

In [51]:
# One list of recipe links per category page (still nested at this point)
all_links = [get_category_recipes(page_url) for page_url in all_category_pages]
print(len(all_links))

322


['https://whatsgabycooking.com/pao-de-queijo/',
 'https://whatsgabycooking.com/pizza-stuffed-mushrooms/',
 'https://whatsgabycooking.com/hangover-nachos/',
 'https://whatsgabycooking.com/bacon-cotija-guacamole/',
 'https://whatsgabycooking.com/baked-brie-fillo-shells-fig-preserves/',
 'https://whatsgabycooking.com/chipotle-sweet-potato-fries/',
 'https://whatsgabycooking.com/tabasco-bacon-cheddar-wings/',
 'https://whatsgabycooking.com/cheesy-pull-apart-bread/',
 'https://whatsgabycooking.com/mini-chicken-tostadas/',
 'https://whatsgabycooking.com/loaded-nachos/',
 'https://whatsgabycooking.com/whipped-feta/',
 'https://whatsgabycooking.com/spicy-cheesy-guacamole/',
 'https://whatsgabycooking.com/bell-pepper-and-mint-lentil-hummus/',
 'https://whatsgabycooking.com/cheesy-guacamole/',
 'https://whatsgabycooking.com/pepper-jack-stuffed-pretzel-bites/',
 'https://whatsgabycooking.com/party-nachos/',
 'https://whatsgabycooking.com/avocado-prosciutto-crostini/',
 'https://whatsgabycooking.c

In [52]:
# Flatten the per-page lists into one list of recipe links.
# NOTE(review): nothing here removes duplicates -- if a recipe is listed under
# several categories it will appear more than once; confirm that is intended.
all_recipe_links = list(itertools.chain.from_iterable(all_links))
print(len(all_recipe_links))

5551


In [55]:
import pickle

# Serialize the flat list of recipe links to 'all_recipe_links.pkl' (a path
# relative to the working directory); 'wb' because pickle writes bytes.
with open('all_recipe_links.pkl','wb') as f:
    pickle.dump(all_recipe_links, f)