In [549]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import pickle

# First we are going to scrape `findmeglutenfree.com` as `gf`

In [329]:
# This is the way we connect to the webpage we are going to scrape
data_gf = requests.get("https://www.findmeglutenfree.com/es/madrid")

In [330]:
# Response code of 200 is good
data_gf.status_code

200

In [331]:
# We are now putting the website throught the BS parser
soup = BeautifulSoup(data_gf.text)

In [332]:
# Here we grabbed all the "div" tags to explore what we get
div = soup.select("div")
type(div)

bs4.element.ResultSet

In [333]:
# the 5th Index of this list contains what we are looking for
len(div)

153

In [334]:
# This is a little more precise but we still don't have exactly what we want
ul = soup.select("ul.list-unstyled.mt-4.mb-4")

In [335]:
# Here is the precise location of the names of our restaruants saved to a variable "name"
name = soup.select("a.align-middle")

In [336]:
# We have to extract the contents contained in these tags and save them
# This is our list of restaurant names
grest = [n.contents for n in name]
len(grest)

30

In [413]:
# This converts our list of lists to a list of items
gf_rest = [name[0] for name in grest]
gf_rest[:5]   

['Taberna La Concha',
 'El Arrozal',
 'Da Nicola Gran Vía',
 'Rosa Negra Madrid',
 'Sana Locura Gluten Free Bakery']

In [343]:
# Here we have the addresses of out restaurants
address = soup.select("span.sl-addr.mt-2")

In [344]:
# Now they have been extracted we will need to combine the names and the addresses in the database
gaddr = [a.contents for a in address]
len(gaddr)

30

In [414]:
# This converts our list of lists to a list of items
gf_addr = [name[0] for name in gaddr]
gf_addr[:5]

['Calle Cava Baja, 7, 28005 Madrid, Spain',
 'Calle Segovia, 13, 28005 Madrid, Spain',
 'Plaza de los Mostenses, 11, 28015 Madrid, Community of Madrid, Spain',
 'Calle del León, 16, 28014 Madrid, Madrid, Spain',
 'Calle del Gral Oraá, 49, 28006 Madrid, Spain']

# Next we are going to scrape `happyceliac.com` as `hc`
**First the name**

In [234]:
# We start the process over again for happyceliac.com
data_hc = requests.get("https://www.happyceliac.com/gluten-free-madrid/")

In [235]:
data_hc

<Response [200]>

In [436]:
soup = BeautifulSoup(data_hc.text)

In [None]:
name = soup.select("h3")

In [301]:
# We had one name at the end of the list that was still nesting bewteen tags 
# We are extracting it from the tags here
rest = [n.contents for n in name]
rest = rest[:-9]
r = [n.contents for n in rest[-1]]

[['Asador Maribel']]

In [419]:
# We are replacing that extracted item back into the list where is was
rest[-1]=r[0]
rest[-5:]

[['La Oriental Sin Gluten ★'],
 ['Celikatessen ★'],
 ['Sana Local ★'],
 ['Bico de Xeado\xa0'],
 ['Asador Maribel']]

In [None]:
# We create a list of items
hc_rest = []
for n in rest:
    line = re.sub(r"^\s|\s$|\s[★]", "", n[0])
    hc_rest.append(line)
hc_rest

**Now the Address**

In [None]:
address = soup.select("em")
address

In [None]:
# There were some unwanted items in this list so we selected the index of the desired lines
addr = [a.contents for a in address[1:-9]]
addr

In [442]:
# and we selected the first index of each list-type item because that contained the address
new = [line[0] for line in addr]
new[:5]

['Calle de Ventura de la Vega, 4, 28014 Madrid, Spain | ',
 'Calle de Echegaray, 5, 28014 Madrid, Spain | ',
 'Calle de Castilla, 62, 28039 Madrid, Spain | ',
 'Plaza Mostenses, 11, 28015 Madrid, Spain | ',
 'Calle de Segovia, 13, 28005 Madrid, Spain | ']

In [None]:
# Here we cleaned the addresses and they look good
hc_addr = []
for l in new:
    line = re.sub(r"\s[|]\s.*", "", l)
    hc_addr.append(line)

In [443]:
print(hc_addr[:5])
len(hc_addr)

['Calle de Ventura de la Vega, 4, 28014 Madrid, Spain', 'Calle de Echegaray, 5, 28014 Madrid, Spain', 'Calle de Castilla, 62, 28039 Madrid, Spain', 'Plaza Mostenses, 11, 28015 Madrid, Spain', 'Calle de Segovia, 13, 28005 Madrid, Spain']


18

# We are going to scrape the `viajarsingluten` as `vsg`

In [484]:
vsg_rest_tot = []
vsg_addr_tot = []

# These are all the next page links
url_list = ["https://www.viajarsingluten.com/restaurantes-sin-gluten-madrid-1_6047/",
            "https://www.viajarsingluten.com/restaurantes-sin-gluten-madrid-1_6047/2/",
            "https://www.viajarsingluten.com/restaurantes-sin-gluten-madrid-1_6047/3/",
            "https://www.viajarsingluten.com/restaurantes-sin-gluten-madrid-1_6047/4/"]


# This loops through each url and grabs what we need and converts it to a list
for url in url_list:
    # Empty nested lists 
    vsg_rest = []
    vsg_addr = []

    # Get url
    print(url)
    data_vsg = requests.get(url)
    soup = BeautifulSoup(data_vsg.text)
    
    # Select specific tags
    names = soup.select("div.nombre")
    address = soup.select("div.dir")
    
    # Select "a" tags within "nombre" tags
    name = [n.select("a") for n in names]

    # Extract contents from tags
    vrest = [n[0].contents for n in name]
    vaddr = [a.contents for a in address]

    # Create a list of items from list of lists
    vsg_rest = [name[0] for name in vrest] 
    vsg_addr = [name[0] for name in vaddr]
    
    # Append alll findings to master list
    vsg_rest_tot.append(vsg_rest)
    vsg_addr_tot.append(vsg_addr)       

In [516]:
# That worked! We need to flatten this list of lists.  
# Not a very elegant solution but...quick...while no one's looking! 
vsg_flat_rest = [y for x in vsg_rest_tot for y in x]
vsg_flat_rest[20:30]

['Vips Méndez Álvaro',
 'Ginos Palacio de Hielo',
 'Hotel Meliá Castilla',
 'Telepizza',
 'Telepizza',
 "Tommy Mel's ",
 'Telepizza',
 'Rodilla',
 'VIPS Moncloa',
 'VIPS Smart Manoteras',
 'Hotel Axor Barajas',
 'Rodilla',
 'Telepizza',
 'Distrito Vegano',
 'Fridays Vaguada',
 'Fit Fat Food',
 'Telepizza',
 'Ginos Caleruega',
 ' Viva Burger',
 'Mc Donald´s']

In [None]:
# There we have eliminated that space in case it causes us problems when searching in Google's API
for index, name in enumerate(vsg_flat_rest): 
    clean = re.sub(r"^\s", "", name)
    vsg_flat_rest[index] = clean    

In [519]:
vsg_flat_rest[20:30]

['Hotel Axor Barajas',
 'Rodilla',
 'Telepizza',
 'Distrito Vegano',
 'Fridays Vaguada',
 'Fit Fat Food',
 'Telepizza',
 'Ginos Caleruega',
 'Viva Burger',
 'Mc Donald´s']

In [523]:
# Now do the same with the addresses: flatten and trim that extra white space
vsg_flat_addr = [y for x in vsg_addr_tot for y in x]
vsg_flat_addr[30:35]

['Princesa 89, Intercambiador Moncloa - 28009 - Madrid (Madrid)',
 'C.E. PARQUE NORTE / C/ Serrano Galvache, 56 - 28033 - Madrid (Madrid)',
 ' Paseo de las Yeserías, 7 dcha - 28005 - Madrid (Madrid)',
 'Montera, 47 - 28013 - Madrid (Madrid)',
 'Pº de la Castellana, 89 - 28046 - Madrid (Madrid)']

In [524]:
for index, addr in enumerate(vsg_flat_addr): 
    clean = re.sub(r"^\s", "", addr)
    vsg_flat_addr[index] = clean    

In [525]:
vsg_flat_addr[30:35]

['Princesa 89, Intercambiador Moncloa - 28009 - Madrid (Madrid)',
 'C.E. PARQUE NORTE / C/ Serrano Galvache, 56 - 28033 - Madrid (Madrid)',
 'Paseo de las Yeserías, 7 dcha - 28005 - Madrid (Madrid)',
 'Montera, 47 - 28013 - Madrid (Madrid)',
 'Pº de la Castellana, 89 - 28046 - Madrid (Madrid)']

# Finally, we need to combine these lists into dictionaries

In [None]:
# These are lists of items
gf_rest
gf_addr
hc_rest
hc_addr

# These are much longer lists that we might use
vsg_flat_rest
vsg_flat_addr

In [532]:
# Lets let's make a function to do the work for us.
gf_list = []

def add_n_a_lists_to_dict(namelist, addresslist, list_dict):
    results = set(zip(namelist,addresslist))
    for tup in results:
        dic = {}
        dic["name"] = tup[0]
        dic["address"] = tup[1]
        list_dict.append(dic)
    return list_dict

In [None]:
add_n_a_lists_to_dict(gf_rest,gf_addr, gf_list)
add_n_a_lists_to_dict(hc_rest,hc_addr, gf_list)

In [539]:
first_places = gf_list

In [541]:
len(first_places)

48

In [528]:
first_places[0]


{'name': 'New York Burger',
 'address': 'Calle del Gral Yagüe, 5, 28020 Madrid, Madrid, Spain'}

In [529]:
list(first_places[1].values())

["Alfredo's Barbacoa Restaurant",
 'Calle de Juan Hurtado de Mendoza, 11, 28036 Madrid, Madrid, Spain']

In [454]:
" ".join(list(first_places[1].values()))

"Alfredo's Barbacoa Restaurant Calle de Juan Hurtado de Mendoza, 11, 28036 Madrid, Madrid, Spain"

In [542]:
gf_list = []

def add_n_a_lists_to_dict(namelist, addresslist, list_dict):
    results = set(zip(namelist,addresslist))
    for tup in results:
        dic = {}
        dic["name"] = tup[0]
        dic["address"] = tup[1]
        list_dict.append(dic)
    return list_dict

In [543]:
second_places = add_n_a_lists_to_dict(vsg_flat_rest,vsg_flat_addr, gf_list)

In [544]:
len(second_places)

244

In [None]:
%store first_places
%store second_places
# ERROR: maximum recursion depth exceeded while pickling an object

In [547]:
# The %store magic works fine here
from sklearn import datasets
dataset = datasets.load_iris()
%store dataset

Stored 'dataset' (Bunch)


-------------

# For some reason pickling didn't work with `first_places` so we cut and paste... 

In [559]:
pickle_out = open("/Users/claytonlouden/projects/final_project/pickled/first_places.pickle","wb")
pickle.dump(first_places, pickle_out)
pickle_out.close()
# maximum recursion depth exceeded while calling a Python object

RecursionError: maximum recursion depth exceeded while calling a Python object

# Save `second_places` 

In [560]:
pickle_out = open("/Users/claytonlouden/projects/final_project/pickled/second_places.pickle","wb")
pickle.dump(second_places, pickle_out)
pickle_out.close()

-----------