In [1]:
import requests
from bs4 import BeautifulSoup

## General Experimentation

In [2]:
# Download the sample page; the bare name on the last line displays the Response repr.
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [3]:
# Parse the raw response bytes with the built-in 'html.parser' backend and display the tree.
soup = BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [4]:
# Count the top-level nodes of the parsed document.
len(soup.contents)

3

In [5]:
# First top-level node of the document.
soup.contents[0]

'html'

In [6]:
# Second top-level node of the document.
soup.contents[1]

'\n'

In [7]:
# Third top-level node of the document.
soup.contents[2]

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [8]:
# Show the bs4 class of each top-level node.
[type(node) for node in soup.children]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

- The `Doctype` object contains information about the type of the document.
- The `NavigableString` object represents text found in the HTML document.
- The `Tag` object represents an HTML element and can contain other nested tags and strings.

In [9]:
# Keep the third top-level node (the one displayed as the <html> element above).
tags = soup.contents[2]

In [10]:
# List the direct children of `tags`.
tags.contents

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [11]:
# Pick the fourth direct child of `tags` (index 3 in the listing above).
body = tags.contents[3]

In [12]:
# Check which bs4 class the node stored in `body` has.
type(body)

bs4.element.Tag

In [13]:
# List the direct children of `body`.
body.contents

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [14]:
# Extract the text of the second child of `body`.
body.contents[1].get_text()

'Here is some simple content for this page.'

In [15]:
# Keep only the entries that contain the substring "abc".
some_list = ['abc-123', 'def-456', 'ghi-789', 'abc-456']
matching = list(filter(lambda entry: "abc" in entry, some_list))
matching

['abc-123', 'abc-456']

## Applying to Publix

In [16]:
# Product page used by the cells in this section.
publixProductUrl = "http://publix.com/pd/grape-tomatoes/RIO-PCI-108357"

### Experimenting

In [17]:
# Store location for the search.
# The value is percent-encoded JSON; decoded it reads:
# {"StoreName":"Butler Plaza West","StoreNumber":1312,"Option":"ACDFJNORU","ShortStoreName":"Butler Plaza West"}
storeValue = "%7B%22StoreName%22%3A%22Butler%20Plaza%20West%22%2C%22StoreNumber%22%3A1312%2C%22Option%22%3A%22ACDFJNORU%22%2C%22ShortStoreName%22%3A%22Butler%20Plaza%20West%22%7D"

In [18]:
# Set location by sending a cookie through a session.
# NOTE(review): requests documents that cookies passed to a single call are not
# persisted on the Session, so later requests only carry `Store` if the server
# sets it in its response - confirm.
publixSession = requests.Session()
publixSession.post(publixProductUrl, cookies = {'Store': storeValue})

<Response [200]>

In [19]:
# Fetch the product page through the same session and parse the response body.
# Despite its name, `publixPageRequest` holds the value returned by `get()`.
publixPageRequest = publixSession.get(publixProductUrl)
publixPageParsed = BeautifulSoup(publixPageRequest.content, 'html.parser')

In [20]:
# Collect every <li> element carrying the CSS class "location".
productLocationList = publixPageParsed.find_all("li", class_="location")

In [21]:
# Text of the first <span> inside the first location entry.
productLocationList[0].span.get_text()

'Produce'

### Function

In [24]:
def getPublixLocation(storeCookie, productURL):
    """Return the in-store location label shown on a product page.

    Everything is derived from the arguments; no notebook-level variables are
    read, so the function works on a fresh kernel and for any product URL.

    Args:
        storeCookie: Value sent as the ``Store`` cookie to set the store
            location for the lookup.
        productURL: URL of the product page to scrape.

    Returns:
        The text of the first ``<span>`` inside the first
        ``<li class="location">`` element of the page, or ``None`` when the
        page has no such element.
    """
    storeCookies = {'Store': storeCookie}

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        # Set location by sending a cookie through the session.
        session.post(productURL, cookies=storeCookies)

        # requests does not persist per-call cookies on the Session, so the
        # store cookie is supplied again for the page request itself.
        response = session.get(productURL, cookies=storeCookies)

    page = BeautifulSoup(response.content, 'html.parser')

    # Extract location from the page that was just fetched.
    locationTag = page.find("li", class_="location")
    if locationTag is None:
        return None

    labelTag = locationTag.find("span")
    if labelTag is None:
        return None

    return labelTag.get_text()

In [25]:
# Look up the in-store location of the product for the configured store.
getPublixLocation(storeCookie=storeValue, productURL=publixProductUrl)

'Produce'