# 1b. Gather all Recipe data from links

In [1]:
import re

import requests, bs4
from bs4 import BeautifulSoup as bs
import pandas as pd
from pprint import pprint
from fake_useragent import UserAgent

In [2]:
# Request headers passed to requests.get() in check_response; the User-Agent
# string is a fixed desktop Chrome identifier.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

In [3]:
def check_response(url, parser='html.parser', timeout=30):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    parser : str, optional
        Parser name handed to BeautifulSoup. Naming it explicitly keeps the
        parse result independent of which optional parsers are installed;
        pass e.g. ``'lxml'`` to use a different one.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error, so one unresponsive page cannot stall the whole crawl.

    Returns
    -------
    BeautifulSoup or None
        The parsed page when the server answers with status 200. For any
        other status a message is printed and ``None`` is returned.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    status = response.status_code

    if status != 200:
        print(f"Oops! Received status code {status} for {url}")
        # Previously this path fell through to `return soup` with `soup`
        # unbound, which raised UnboundLocalError.
        return None

    return bs(response.text, parser)

In [6]:
def _ingredient_field(ingredient, css_class):
    """Return the text of the first <span> with ``css_class`` inside ``ingredient``, or None if there is none."""
    span = ingredient.find('span', {'class': css_class})
    return None if span is None else span.get_text()


def get_ingredients(soup):
    """Collect the ingredients listed on a parsed recipe page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed recipe page.

    Returns
    -------
    list of dict or None
        One ``{'name', 'amount', 'unit'}`` dict per
        ``<li class="wprm-recipe-ingredient">`` element, in page order.
        ``amount`` and ``unit`` are ``None`` when the matching span is absent.
        ``None`` is returned instead of a list when ``soup`` cannot be
        searched (e.g. it is ``None``) or when an ingredient has no name span.
    """
    try:
        # Get list of all the html blocks with ingredients
        ing_list = soup.find_all('li', {'class': 'wprm-recipe-ingredient'})
    except AttributeError:
        # Only "soup is not a searchable document" is treated as "no data";
        # anything else (including KeyboardInterrupt) is allowed to propagate.
        return None

    ingredients = []

    # Get the individual amts, units, and name for each ingredient
    for ingredient in ing_list:
        ingredient_name = _ingredient_field(ingredient, 'wprm-recipe-ingredient-name')
        if ingredient_name is None:
            # An ingredient without a name invalidates the whole list,
            # matching the function's established None-on-failure contract.
            return None

        ingredients.append({
            'name': ingredient_name,
            'amount': _ingredient_field(ingredient, 'wprm-recipe-ingredient-amount'),
            'unit': _ingredient_field(ingredient, 'wprm-recipe-ingredient-unit'),
        })

    return ingredients

def get_instructions(soup):
    """Collect the numbered preparation steps from a parsed recipe page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed recipe page.

    Returns
    -------
    list of str or None
        The text of every ``<li class="wprm-recipe-instruction">`` element in
        page order, each prefixed with its 1-based position (``"1. ..."``).
        ``None`` is returned when ``soup`` cannot be searched (e.g. it is
        ``None``).
    """
    try:
        instructions = soup.find_all('li', {'class': 'wprm-recipe-instruction'})
        return [
            '{}. '.format(step_number) + row.get_text()
            for step_number, row in enumerate(instructions, start=1)
        ]
    except AttributeError:
        # Narrow on purpose: a bare `except:` would also hide KeyboardInterrupt
        # and genuine programming errors behind a None result.
        return None

def _find_text(soup, tag, css_class):
    """Return the text of the first ``tag`` element whose class is ``css_class``, or None if there is none."""
    element = soup.find(tag, {'class': css_class})
    return None if element is None else element.get_text()


def whats_gaby_cooking(url):
    """Scrape one recipe page and return its details as a dict.

    Parameters
    ----------
    url : str
        Address of the recipe page; it is fetched with ``check_response``.

    Returns
    -------
    dict
        Keys ``'title'``, ``'time'``, ``'course'``, ``'cuisine'``,
        ``'ingredients'`` and ``'instructions'``. ``'time'``, ``'course'`` and
        ``'cuisine'`` are ``None`` when the page has no matching element;
        ``'ingredients'`` and ``'instructions'`` hold whatever
        ``get_ingredients`` / ``get_instructions`` return.

    Raises
    ------
    AttributeError
        If the page has no ``<h1 class="entry-title">`` element, or if no
        parsed page is available to search.
    """
    soup = check_response(url)

    # Gathering Recipe Details
    # The title is the one mandatory field: a missing title is allowed to
    # raise so the caller can record the link as a problem page.
    recipe_title = soup.find('h1', {'class': 'entry-title'}).get_text()

    # Optional fields use an explicit None check instead of a bare `except:`,
    # so unrelated errors and KeyboardInterrupt are no longer swallowed.
    return {
        'title': recipe_title,
        'time': _find_text(soup, 'p', 'header-recipe-time'),
        'course': _find_text(soup, 'span', 'wprm-recipe-course wprm-block-text-normal'),
        'cuisine': _find_text(soup, 'span', 'wprm-recipe-cuisine wprm-block-text-normal'),
        'ingredients': get_ingredients(soup),
        'instructions': get_instructions(soup),
    }

In [4]:
import pickle

# Load the list of recipe URLs to scrape.
# NOTE: pickle.load can execute arbitrary code while loading, so only open
# pickle files that this project produced itself.
# `with` closes the file even if unpickling fails.
with open('all_recipe_links.pkl', "rb") as open_file:
    all_recipe_links = pickle.load(open_file)

print(len(all_recipe_links))
all_recipe_links[:10]

5551


['https://whatsgabycooking.com/spinach-dip/',
 'https://whatsgabycooking.com/lemon-pepper-parmesan-wings/',
 'https://whatsgabycooking.com/queso-fundido/',
 'https://whatsgabycooking.com/100-best-snacks-for-the-super-bowl/',
 'https://whatsgabycooking.com/parmesan-arancini/',
 'https://whatsgabycooking.com/baked-brie/',
 'https://whatsgabycooking.com/marinated-olives-and-feta/',
 'https://whatsgabycooking.com/cucumber-feta-salad/',
 'https://whatsgabycooking.com/zucchini-fritters-with-yogurt-feta-dipping-sauce/',
 'https://whatsgabycooking.com/spicy-roasted-castelvetrano-olives-with-feta/']

In [24]:
import os
import time

rec_per_page = 500
# Both values are derived from the number of links, not from the number of
# recipes actually scraped. They are no longer used to decide when to save
# (see the note below the loop) and are kept only in case other cells
# reference them.
overage = len(all_recipe_links) % rec_per_page
max_page_num = (len(all_recipe_links) // rec_per_page) + 1

recipe_list = []
page_num = 1

issue_links = []


def _save_recipe_page(recipes, page_num, directory='data'):
    """Pickle ``recipes`` to ``<directory>/recipes_<page_num>.pkl``, creating the directory if needed."""
    os.makedirs(directory, exist_ok=True)
    filename = 'recipes_{}'.format(page_num)
    with open('{}/{}.pkl'.format(directory, filename), 'wb') as f:
        pickle.dump(recipes, f)


for link in all_recipe_links:
    # Only the scrape itself is guarded. `except Exception` (rather than a
    # bare `except:`) lets Ctrl-C stop the crawl, and keeping the file writes
    # outside the `try` means a failed save is reported instead of being
    # silently filed as an "issue link".
    try:
        recipe_dict = whats_gaby_cooking(link)
    except Exception:
        issue_links.append(link)
    else:
        recipe_list.append(recipe_dict)

        if len(recipe_list) == rec_per_page:
            _save_recipe_page(recipe_list, page_num)

            del recipe_list[:]
            page_num += 1

    # Pause between requests, whether or not the scrape succeeded.
    time.sleep(0.1)

# saving file with final recipes
# Failed links never reach recipe_list, so the last partial page usually holds
# fewer than `overage` recipes; comparing against `overage` inside the loop
# therefore skipped this save. Flushing whatever is left after the loop always
# writes the remainder, and writes nothing when the count is an exact multiple
# of rec_per_page.
if recipe_list:
    _save_recipe_page(recipe_list, page_num)

In [31]:
# Links for which the scrape raised an exception. The loop appends a link once
# per occurrence in all_recipe_links, so a URL that appears more than once here
# is also duplicated in all_recipe_links.
issue_links

['https://whatsgabycooking.com/lemon-curd-mousse-tart/',
 'https://whatsgabycooking.com/daring-bakers-tiramisu/',
 'https://whatsgabycooking.com/daring-bakers-dobos-torta/',
 'https://whatsgabycooking.com/daring-bakers-mallows-chocolate-covered-marshmallow-cookies/',
 'https://whatsgabycooking.com/daring-bakers-tiramisu/',
 'https://whatsgabycooking.com/tuscan-chicken-pizza-sandwich/',
 'https://whatsgabycooking.com/tuscan-chicken-pizza-sandwich/']

Build `final_links` by dropping the links that failed to scrape (`issue_links`) from `all_recipe_links`, then save the result.

In [47]:
# Keep, in their original order, only the links that are not listed in issue_links.
final_links = list(filter(lambda url: url not in issue_links, all_recipe_links))

# Persist the cleaned list of links.
with open('data/final_recipe_links.pkl', 'wb') as f:
    pickle.dump(final_links, f)

#### Data Check

In [32]:
# Sanity check: reload the first saved page and count its recipes.
# `with` closes the file even if unpickling fails.
with open('data/recipes_1.pkl', "rb") as open_file:
    recipes_1 = pickle.load(open_file)
len(recipes_1)

500