# Ingreduce: Data compilation

The recipe data for our web application were collected from the website allrecipes.com. The data collection was done in two steps. First, I collected the urls of recipes from allrecipes.com. Then, I collected each recipe's information from the urls collected.

Note that the source of those recipe urls is a file that I created. It contains the addresses (which I had found beforehand) of files listing the website's recipe urls.

## Collecting the recipe urls

```Python
# First, run "jupyter notebook --NotebookApp.iopub_data_rate_limit=1e10" in Terminal.

import pickle
import requests
from bs4 import BeautifulSoup

def found_urls(soup):
    '''Returns a list of all recipe urls found.

    Reads the text of every <loc> element of the parsed document and keeps
    the ones that start with the allrecipes recipe prefix, in document order.
    Duplicates are not removed here.
    '''
    recipe_prefix = 'https://www.allrecipes.com/recipe/'
    candidate_urls = (loc.text for loc in soup.find_all('loc'))
    return [url for url in candidate_urls if url.startswith(recipe_prefix)]

# Loading a list of files containing the website's recipe urls.
# NOTE: pickle should only be used on trusted files; 'files_list' is a file
# created locally by the author.

filename = 'files_list'
with open(filename, 'rb') as infile:
    # The handle is closed as soon as the list is loaded, instead of staying
    # open for the whole download loop below.
    files_list = pickle.load(infile)

# Appending urls found in files to a list

recipe_urls = []

for file in files_list:
    source = requests.get(file).text
    soup = BeautifulSoup(source, 'xml')
    recipe_urls.extend(found_urls(soup))

# Removing duplicates (note: going through a set does not keep the order)
recipe_urls = list(set(recipe_urls))

# Saving the recipe urls list as pickle

filename = 'recipe_urls'
with open(filename, 'wb') as outfile:
    pickle.dump(recipe_urls, outfile)
```

The outcome is a list of recipe urls saved in a pickle file. Next, we will iterate through each element of that list (that is, each recipe url) to find the corresponding recipe information.

## Collecting the recipes' information

First, I defined the functions that would scrape the recipes' information.

In [None]:
import pickle
import pandas as pd
import requests
from bs4 import BeautifulSoup 
import time

In [None]:
def get_name(soup):
    '''Gets the name of recipe from a parsed url object.

    The name is the part of the page title text that precedes the first
    '|', with surrounding whitespace removed. Returns None when the title
    cannot be read (for example when the page has no title).
    '''
    try:
        page_title = soup.title.text
        before_separator = page_title.split('|')[0]
        return before_separator.strip()
    except Exception:
        # Best effort: any failure simply means "no name found".
        return None




def get_description(soup):
    '''Gets the description of recipe from a parsed url object.

    Looks for the recipe summary <div> and returns the stripped text of the
    <p class="margin-0-auto"> inside it. Returns None if either element is
    missing or cannot be read.
    '''
    try:
        summary = soup.find('div', class_='recipe-summary elementFont__dek--within')
        paragraph = summary.find('p', class_='margin-0-auto')
        return paragraph.text.strip()
    except Exception:
        # Best effort: any failure simply means "no description found".
        return None



def get_meta(soup):
    '''Gets some meta information of recipe from a parsed url object.

    Returns a 6-tuple (prep, cook, additional, total, servings, yield).
    Each entry is the stripped text of the matching meta item, or None when
    that item is not present. If reading the page fails part-way through,
    the values collected up to that point are still returned.
    '''
    # Header text exactly as compared against the page -> position in the result
    position_of = {
        'prep:': 0,
        'cook:': 1,
        'additional:': 2,
        'total:': 3,
        'Servings:': 4,
        'Yield:': 5,
    }
    collected = [None] * len(position_of)

    try:
        container = soup.find(
            'section',
            class_='recipe-meta-container two-subcol-content clearfix recipeMeta',
        )

        for item in container.find_all('div', class_='recipe-meta-item'):
            header = item.find(
                'div',
                class_='recipe-meta-item-header elementFont__subtitle--bold elementFont__transformCapitalize',
            ).text
            body = item.find(
                'div',
                class_='recipe-meta-item-body elementFont__subtitle',
            ).text.strip()

            if header in position_of:
                collected[position_of[header]] = body

    except Exception:
        # Best effort: keep whatever was collected before the failure.
        pass

    return tuple(collected)




def get_ingredients(soup):
    '''Gets the ingredient list of recipe from a parsed url object.

    Returns the stripped ingredient names joined with '; ' (a single
    ingredient is returned as is). Returns None when no ingredient is found
    or when the ingredients section cannot be read.
    '''
    try:
        section = soup.find(
            'section',
            class_='component recipe-ingredients recipeIngredients container interactive',
        )
        names = [
            span.text.strip()
            for span in section.find_all('span', class_='ingredients-item-name elementFont__body')
        ]
        # Joining a one-element list yields that element unchanged, so a
        # single join covers both the "one" and the "several" cases.
        return '; '.join(names) if names else None
    except Exception:
        # Best effort: any failure simply means "no ingredients found".
        return None




def get_instructions(soup):
    '''Gets the instructions for recipe from a parsed url object.

    Returns the text of every <p> inside the instructions list, joined with
    a single space (a single paragraph is returned as is; the text is not
    stripped). Returns None when no paragraph is found or when the
    instructions section cannot be read.
    '''
    try:
        section = soup.find('ul', class_='instructions-section')
        steps = [paragraph.text for paragraph in section.find_all('p')]
        # Joining a one-element list yields that element unchanged, so a
        # single join covers both the "one" and the "several" cases.
        return ' '.join(steps) if steps else None
    except Exception:
        # Best effort: any failure simply means "no instructions found".
        return None




def get_nutrition(soup):
    '''Gets the nutrition facts per serving of recipe from a parsed url object.

    Returns the text of the nutrition section body up to (not including)
    the first occurrence of 'Full', stripped of surrounding whitespace.
    Returns None when the section cannot be read.
    '''
    try:
        block = soup.find('div', class_='recipeNutritionSectionBlock')
        body_text = block.find('div', class_='section-body').text
        # Keep only what comes before the first 'Full'
        facts, _, _ = body_text.partition('Full')
        return facts.strip()
    except Exception:
        # Best effort: any failure simply means "no nutrition facts found".
        return None




def _first_div(parent_tag, class_names):
    '''Returns the first <div> found in parent_tag among class_names, else None.'''
    for class_name in class_names:
        div_tag = parent_tag.find('div', class_=class_name)
        if div_tag is not None:
            return div_tag
    return None


def get_images_url(soup):
    '''Gets the url of recipe's images from a parsed url object.

    Two places of the page are searched, each on a best-effort basis:

    1. the <aside> block, where one image is looked up (aspect 1x1 first,
       then aspect 3x2);
    2. the image filmstrip, where every slide except the first one is
       looked up (aspect 3x2 first, then 3x4, then 1x1).

    Returns the 'data-src' values found joined with '; ' (a single url is
    returned as is), or None when no image url was found.
    '''
    # Saving the urls found in a list first, and joining them at the end
    list_images_url = []

    # Searching images in location 1
    try:
        aside_tag = soup.find('aside', class_='recipe-tout-image recipe-info-items-3')
        div_tag = _first_div(aside_tag, (
            'component lazy-image lazy-image-udf aspect_1x1 cache-only align-default',
            'component lazy-image lazy-image-udf aspect_3x2 cache-only align-default',
        ))
        list_images_url.append(div_tag['data-src'])
    except Exception:
        # Best effort: a missing block or image just means no url from here.
        pass

    # Searching images in location 2
    try:
        image_filmstrip = soup.find('div', class_='component image-filmstrip')

        # The first slide is skipped.
        # NOTE(review): presumably because it repeats the main image - confirm.
        for image_slide in image_filmstrip.find_all('div', class_='image-slide')[1:]:
            a_tag = image_slide.find('a', class_='ugc-photos-link')
            div_tag = _first_div(a_tag, (
                'component lazy-image lazy-image-udf aspect_3x2',
                'component lazy-image lazy-image-udf aspect_3x4',
                'component lazy-image lazy-image-udf aspect_1x1',
            ))
            list_images_url.append(div_tag['data-src'])
    except Exception:
        # Best effort: the first slide that cannot be read ends the search;
        # urls collected before it are kept.
        pass

    # Joining a one-element list yields that element unchanged, so a single
    # join covers both the "one url" and the "several urls" cases.
    return '; '.join(list_images_url) if list_images_url else None

In [None]:
def recipeinfodict(recipe_url):
    '''Returns a dictionary of the recipe information found on a given allrecipes recipe webpage.

    The dictionary always has the keys 'name', 'description', 'prep', 'cook',
    'additional', 'total', 'servings', 'yield', 'ingredients', 'instructions',
    'nutrition' and 'images_url'. Every value is None when the preliminary
    HEAD request does not answer with status 200; otherwise each value is
    whatever the matching get_* function extracts from the page (None for
    anything that could not be found).
    '''
    fields = ('name', 'description', 'prep', 'cook', 'additional', 'total',
              'servings', 'yield', 'ingredients', 'instructions', 'nutrition',
              'images_url')
    recipe_info = dict.fromkeys(fields)

    # Proceeding with scraping only if the request works.
    # NOTE: requests.head does not follow redirects by default, so a moved
    # url answers with a 3xx status here and is left with all-None values.
    r = requests.head(recipe_url)
    if r.status_code != 200:
        return recipe_info

    source = requests.get(recipe_url).text
    soup = BeautifulSoup(source, 'lxml')

    # get_meta parses the whole meta section on each call, so call it once
    # and unpack the six values instead of calling it once per field.
    prep, cook, additional, total, servings, yield_ = get_meta(soup)

    # Combining all of the information into a dict
    recipe_info.update({
        'name': get_name(soup),
        'description': get_description(soup),
        'prep': prep,
        'cook': cook,
        'additional': additional,
        'total': total,
        'servings': servings,
        'yield': yield_,
        'ingredients': get_ingredients(soup),
        'instructions': get_instructions(soup),
        'nutrition': get_nutrition(soup),
        'images_url': get_images_url(soup),
    })

    return recipe_info

Now we can use the recipe urls list and start collecting the recipes' information in a DataFrame.

```Python

# Loading the recipe urls list
# NOTE: pickle should only be used on trusted files; 'recipe_urls' is the
# file written by the url collection step.

filename = 'recipe_urls'
with open(filename, 'rb') as infile:
    # The context manager closes the file; it was previously left open.
    recipe_urls = pickle.load(infile)


# Creating a DataFrame that will store the scraped information

headers = ['recipe_link', 'name', 'description', 'prep', 'cook', 'additional', 'total', 'servings', 'yield', 'ingredients', 'instructions', 'nutrition', 'images_url']
df = pd.DataFrame(columns=headers)

# One row per recipe url; the other columns are filled in by the scraping step
df['recipe_link'] = recipe_urls
```

While I could have run the following code block and gotten the entire dataset in one go (with `range(df.shape[0])`), I split the scraping into batches of 1000. There were two reasons for this:
- I wanted to frequently save the scraped data.
- Additionally, it allowed me to manually check whether my scraping code would work with all the recipe urls, as the website has been up for two decades and may contain inconsistent formatting, moved urls, etc. 

```Python

# Inputting recipe information into the DataFrame from the urls of the list

def scrape_batch(df, start, stop):
    '''Fills rows start..stop-1 of df in place with the scraped recipe information.

    For each row, the page at df.loc[index, 'recipe_link'] is scraped with
    recipeinfodict and every key of the returned dictionary is written to the
    column of the same name. The url is printed once its row is filled, and
    the loop then sleeps for one second before moving to the next recipe.
    '''
    for index in range(start, stop):
        recipe_link = df.loc[index, 'recipe_link']

        # One call per recipe: every call to recipeinfodict sends its own
        # HTTP requests, so calling it once per column multiplied the traffic
        # by twelve and could mix the results of different responses.
        recipe_info = recipeinfodict(recipe_link)

        for column, value in recipe_info.items():
            df.loc[index, column] = value

        print(recipe_link)
        time.sleep(1)


scrape_batch(df, 0, 1000)

# Saving the DataFrame of the data scraped

filename = 'df1000'
with open(filename, 'wb') as outfile:
    pickle.dump(df, outfile)

# Opening the DataFrame of the data scraped

filename = 'df1000'
with open(filename, 'rb') as infile:
    df = pickle.load(infile)

# Inputting recipe information into the DataFrame previously saved

scrape_batch(df, 1000, 2000)

# Saving the DataFrame of the data scraped

filename = 'df2000'
with open(filename, 'wb') as outfile:
    pickle.dump(df, outfile)
```

And so on, until all of the 54416 urls found had been scraped. Or at least this was the plan. Time constraints and a slow internet connection only allowed me to scrape **4840** recipes.