In [105]:
# Importing libraries

# import sys
# !{sys.executable} -m pip install --upgrade certifi

import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
import time
from IPython.core.display import clear_output
from warnings import warn
from urllib.request import Request, urlopen 


In [156]:
# A function to obtain the list of links to individual recipes depending on cuisine, 
# in order of appearance on the website.

def GetRecipeLinks(cuisine):
    lst = []
    # Scrape 4 pages, 24 recipes on each page
    pages = range(1, 5)
    for i in pages:
        basic_url = 'https://www.deliciousmagazine.co.uk/cuisine/'
        url = basic_url + cuisine + '-recipes/page/' + str(i) +'/'
        # Request, pretending to be Mozilla browser
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req, timeout=20).read()
        
        soup = BeautifulSoup(html, "lxml")
        sub_soup = soup.find_all('div', {'class': "recipe-card card-bg"})
        
        page_list = [i.find('a')['href'] for i in sub_soup]
        lst.append(page_list)

        # Pause the loop to be polite to the website
        sleep(randint(7,18))

    # Expand the list
    lst = [i for j in lst for i in j]
    return lst


In [51]:
lst_spanish = GetRecipeLinks('spanish')
print(len(lst_spanish))


120


In [52]:
lst_italian = GetRecipeLinks('italian')
print(len(lst_italian))


96


In [157]:
lst_moroccan = GetRecipeLinks('moroccan')
print(len(lst_moroccan))


96


In [158]:
lst_indian = GetRecipeLinks('indian')
print(len(lst_indian))


96


In [159]:
lst_mexican = GetRecipeLinks('mexican')
print(len(lst_mexican))


96


In [187]:
# Save the lists of links to csv files in case I need them, and to avoid scraping 
# the website again.

links = [lst_spanish, lst_italian, lst_moroccan, lst_indian, lst_mexican]
cuisine = ["Spanish", "Italian", "Morroccan", "Indian", "Mexican"]

for i in range(len(cuisine)):
    df = pd.DataFrame(links[i])
    name = "../data/links_"+cuisine[i]+".csv"
    df.to_csv(name)


In [None]:
# What if I want to go vegan during quarantine?

# https://www.deliciousmagazine.co.uk/dietary_requirements/vegan/
# By tweaking a bit the url formation I extracted the list of 96 vegan links and saved 
# them in a separate csv


In [115]:
# THE LOOP WITH RANDOM PAUSE TO SCRAPE POLITELY

# Sometimes the nutrition values list length is wrong when converting to dataframe, but I
# did not have time to fix this bug. The code is good enough to collect sufficient data for
# further analysis.

def ScrapeParse(list_to_use):
    # Redeclaring the lists to store data in
    titles = []
    ingred_lists = []
    nutrition_lists = []

    # Preparing to monitor the loop
    start_time = time.time()
    requestss = 0

    # Launching the loop
    for recipe in range(len(list_to_use)):
        url = list_to_use[recipe]
        # Make a request, pretend to be Mozilla browser
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req, timeout=20).read()

        soup = BeautifulSoup(html, "lxml")

        # Pause the loop
        sleep(randint(11,29))
        
        # Monitor the requests
        requestss += 1
        elapsed_time = time.time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requestss, requestss/elapsed_time))
        # clear_output(wait = True)

        # Break the loop if the number of requests is greater than expected
        if requestss > len(list_to_use):
            warn('Number of requests was greater than expected.')
            break

        # Parse the retrieved content
        soup = BeautifulSoup(html, "lxml")

        # NUTRITIONAL VALUES
        sub_soup1 = soup.find('dl', {'class':"list-two-column style-lined"})
        # TEST FOR EMPTY
        if sub_soup1 == None:
            print("Nutritional values not found")
            nutrition = ["NaN", "NaN", "NaN", "NaN", "NaN", "NaN", "NaN", "NaN"]
        else:
            sub_soup1 = soup.find('dl', {'class':"list-two-column style-lined"}).find_all("dd")

            nutr = [i.text for i in sub_soup1]

            # Columns should be: Kcal, Fat, SaturatedFat, Protein, Carbohydrates, Sugars, Fibre, Salt 
            # Split Fats and Carbs
            nutrition = [i.split('(') for i in nutr]

            # Removing unwanted characters
            nutrition = [i.replace(")", "") for j in nutrition for i in j]
            nutrition = [i.replace("kcals", "") for i in nutrition]
            nutrition = [i.replace(" sugars", "") for i in nutrition]
            nutrition = [i.replace(" protein", "") for i in nutrition]
            nutrition = [i.replace("sugar", "") for i in nutrition]
            nutrition = [i.replace("g", "") for i in nutrition]
            nutrition = [i.replace(" saturated", "") for i in nutrition]
            nutrition = [i.replace(" satrurated", "") for i in nutrition]
            nutrition = [i.replace(" fat", "") for i in nutrition]
            nutrition = [i.replace("Trace", "") for i in nutrition]
            if len(nutrition) < 8:
                nutrition = ["NaN", "NaN", "NaN", "NaN", "NaN", "NaN", "NaN", "NaN"]
            else:
                nutrition = [i.strip() for i in nutrition]
            
        # LIST OF INGREDIENTS
        sub_soup2 = soup.find('div', {'class':"recipe-ingredients text-standard"})
        # TEST FOR EMPTY
        if sub_soup2 == None:
            print("Ingredients not found")
            ingredients = "NaN"
        else:
            sub_soup2 = soup.find('div', {'class':"recipe-ingredients text-standard"}).find_all("li")
            ingred = [i.text for i in sub_soup2]
            ingredients = ", ".join(ingred)

        # RECIPE TITLE
        title = soup.find('h1').text
        
        ingred_lists.append(ingredients)
        titles.append(title)
        nutrition_lists.append(nutrition)

    return titles, ingred_lists, nutrition_lists


In [150]:
# Function to save the scraped data into files

# links = [lst_spanish, lst_italian, lst_moroccan, lst_indian, lst_mexican]
# cuisine = ["Spanish", "Italian", "Morroccan", "Indian", "Mexican"]

def SaveToCsv(cuisine, recipe_list):
    j = round(len(recipe_list))
    i = 0
    for k in range(j):
        titles, ingred_lists, nutrition_lists = ScrapeParse(recipe_list[i:i+20])
        df = pd.DataFrame()
        df["Title"] = titles
        df["Ingredients"] = ingred_lists
        df[["Kcal", "Fat", "SaturatedFat", "Protein", "Carbohydrates", "Sugars", "Fibre", "Salt"]] = pd.DataFrame(nutrition_lists)
        df["Country"] = cuisine
        # Saving dataframe to csv file
        name = "../data/recipes_"+cuisine+str(k)+".csv"
        df.to_csv(name)
        i += 20


In [152]:
# Creating the dataframes for Spanish data.
# Scraping and saving batches of 20 recipes at a time. It is easier to prevent already 
# scraped data loss, monitor and spot errors this way.
# 120 Spanish recipes

cuisine_name = "Spain"

SaveToCsv(cuisine_name, lst_spanish)


Request:1; Frequency: 0.0813509469483251 requests/s
Request:2; Frequency: 0.05281190382173646 requests/s
Request:3; Frequency: 0.05150055717624027 requests/s
Request:4; Frequency: 0.0525153796447619 requests/s
Request:5; Frequency: 0.0546449807462451 requests/s
Request:6; Frequency: 0.051795073890554974 requests/s
Request:7; Frequency: 0.05296305278369173 requests/s
Request:8; Frequency: 0.054175576510788984 requests/s
Nutritional values not found
Request:9; Frequency: 0.05172219330516067 requests/s
Nutritional values not found
Request:10; Frequency: 0.05015903723001508 requests/s
Request:11; Frequency: 0.04939791175105149 requests/s
Request:12; Frequency: 0.05020379389767321 requests/s
Request:13; Frequency: 0.04834764069401712 requests/s
Request:14; Frequency: 0.049574069338411554 requests/s
Request:15; Frequency: 0.04811676468767325 requests/s
Request:16; Frequency: 0.0492155251674045 requests/s
Nutritional values not found
Request:17; Frequency: 0.050362918917010405 requests/s
Requ

timeout: The read operation timed out

In [124]:
# Creating the dataframes for Italian data.
# Scraping and saving batches of 20 recipes at a time.
# 96 Italian recipes

cuisine_name = "Italy"

SaveToCsv(cuisine_name, lst_italian)


11

In [180]:
# Creating the dataframes for Indian data.
# Scraping and saving batches of 20 recipes at a time.
# 96 Indian recipes

cuisine_name = "India"

SaveToCsv(cuisine_name, lst_indian)



Request:1; Frequency: 0.08819782469545148 requests/s
Request:2; Frequency: 0.04938002538377884 requests/s
Request:3; Frequency: 0.043423476886998276 requests/s
Request:4; Frequency: 0.04936736145034327 requests/s
Request:5; Frequency: 0.04837521773786648 requests/s
Request:6; Frequency: 0.048752420212520506 requests/s
Request:7; Frequency: 0.04662254090015631 requests/s
Request:8; Frequency: 0.046317740198117865 requests/s
Request:9; Frequency: 0.046094186791386826 requests/s
Request:10; Frequency: 0.046714685325650054 requests/s
Request:11; Frequency: 0.0451029548628093 requests/s
Request:12; Frequency: 0.044431037631217536 requests/s
Request:13; Frequency: 0.04516069692708792 requests/s
Request:14; Frequency: 0.04512931724115179 requests/s
Request:15; Frequency: 0.04515931232593566 requests/s
Request:16; Frequency: 0.046180368615808806 requests/s
Request:17; Frequency: 0.04589387278265174 requests/s
Request:18; Frequency: 0.04670988554346093 requests/s
Request:19; Frequency: 0.047788

ValueError: Columns must be same length as key

In [182]:
# Creating the dataframes for Mexican data.
# Scraping and saving batches of 20 recipes at a time.
# 96 recipes

cuisine_name = "Mexican"

SaveToCsv(cuisine_name, lst_mexican)


Request:1; Frequency: 0.05168645204040834 requests/s
Request:2; Frequency: 0.047750228982980404 requests/s
Request:3; Frequency: 0.042959844678129626 requests/s
Request:4; Frequency: 0.045472724257004445 requests/s
Request:5; Frequency: 0.044105520064249996 requests/s
Request:6; Frequency: 0.042255965425130056 requests/s
Request:7; Frequency: 0.04441494621775857 requests/s
Request:8; Frequency: 0.04646252759334787 requests/s
Request:9; Frequency: 0.045551631875425797 requests/s
Request:10; Frequency: 0.044564083571838824 requests/s
Nutritional values not found
Request:11; Frequency: 0.0464468005462128 requests/s
Request:12; Frequency: 0.045245570075817615 requests/s
Request:13; Frequency: 0.046327533585380054 requests/s
Nutritional values not found
Request:14; Frequency: 0.04791138384326254 requests/s
Request:15; Frequency: 0.047293034834047 requests/s
Request:16; Frequency: 0.046090753283032075 requests/s
Request:17; Frequency: 0.045931187901703495 requests/s
Request:18; Frequency: 0.

ValueError: Columns must be same length as key

In [178]:
# Creating the dataframes for vegan data.
# Scraping and saving batches of 20 recipes at a time.
# 96 recipes

lst_vegan = pd.read_csv("../data/links_vegan.csv")
lst_vegan = lst_vegan.values.tolist()
lst_vegan = [i for j in lst_vegan for i in j]

cuisine_name = "vegan"

SaveToCsv(cuisine_name, lst_vegan)



Request:1; Frequency: 0.05308783503856044 requests/s
Request:2; Frequency: 0.04607589757463959 requests/s
Request:3; Frequency: 0.043502192661713394 requests/s
Nutritional values not found
Request:4; Frequency: 0.042328961395187395 requests/s
Request:5; Frequency: 0.041698949279143735 requests/s
Request:6; Frequency: 0.04158171987264154 requests/s
Request:7; Frequency: 0.04072230428484362 requests/s
Request:8; Frequency: 0.041821049097584884 requests/s
Nutritional values not found
Request:9; Frequency: 0.04263120821551376 requests/s
Request:10; Frequency: 0.04191620784254409 requests/s
Request:11; Frequency: 0.04129183007643905 requests/s
Request:12; Frequency: 0.04258736994327269 requests/s
Request:13; Frequency: 0.04372520621609999 requests/s
Request:14; Frequency: 0.04330213925322046 requests/s
Request:15; Frequency: 0.041878967329772435 requests/s
Request:16; Frequency: 0.0429402804023077 requests/s
Request:17; Frequency: 0.042290093653568006 requests/s
Request:18; Frequency: 0.042

ValueError: Columns must be same length as key

In [None]:
# Function to concatenate the dataframes from all csv files into one (by cuisine)

def ToDataFrame(cuisine, file_count, file_name):
    # Creating empty dataframe    
    result = pd.DataFrame(columns=["Title", "Ingredients", "Kcal", "Fat", "SaturatedFat", "Protein", "Carbohydrates", "Sugars", "Fibre", "Salt", "Country"])
    
    # Loop to concatenate dataframes from separate files    
    for i in range(file_count):
        df = pd.read_csv("../data/recipes_"+cuisine="_"str(i)+".csv", index_col="Unnamed: 0").reset_index(drop=True)
        result = pd.concat([result, df], ignore_index=True)
    
    # Saving result to csv file
    result.to_csv("../data/recipes_"+file_name+"_df.csv")
    

In [146]:
# Concatenate the dataframes from all csv files into one

files = 5

ToDataFrame("Italy", files, "Italian")
ToDataFrame("Spain", files, "Spanish")
ToDataFrame("vegan", files, "vegan")
ToDataFrame("India", files, "Indian")
ToDataFrame("Mexican", files, "Mexican")
