In [1]:
import requests
from bs4 import BeautifulSoup

## General Experimentation

In [2]:
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
page

<Response [200]>

In [3]:
soup = BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE html>

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [4]:
len(list(soup.children))

3

In [5]:
list(soup.children)[0]

'html'

In [6]:
list(soup.children)[1]

'\n'

In [7]:
list(soup.children)[2]

<html>
<head>
<title>A simple example page</title>
</head>
<body>
<p>Here is some simple content for this page.</p>
</body>
</html>

In [8]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

- The `Doctype` object contains information about the type of the document.
- The `NavigableString` object represents text found in the HTML document.
- The `Tag` object contains other nested tags.

In [9]:
tags = list(soup.children)[2]

In [10]:
list(tags)

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [11]:
body = list(tags.children)[3]

In [12]:
type(body)

bs4.element.Tag

In [13]:
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [14]:
list(body.children)[1].get_text()

'Here is some simple content for this page.'

In [15]:
some_list = ['abc-123', 'def-456', 'ghi-789', 'abc-456']
matching = [s for s in some_list if "abc" in s]
matching

['abc-123', 'abc-456']

## Applying to Publix

In [2]:
publixProductUrl = "http://publix.com/pd/grape-tomatoes/RIO-PCI-108357"

### Experimenting

In [8]:
# Store location for the search
butlerStoreCookie = "%7B%22StoreName%22%3A%22Butler%20Plaza%20West%22%2C%22StoreNumber%22%3A1312%2C%22Option%22%3A%22ACDFJNORU%22%2C%22ShortStoreName%22%3A%22Butler%20Plaza%20West%22%7D"

In [9]:
# Set location by sending a cookie through a session
publixSession = requests.Session()
publixSession.post(publixProductUrl, cookies = {'Store': butlerStoreCookie})

<Response [200]>

In [10]:
publixPageRequest = publixSession.get(publixProductUrl)
publixPageParsed = BeautifulSoup(publixPageRequest.content, 'html.parser')

In [11]:
productLocationList = publixPageParsed.findAll("li", {"class" : "location"})

In [12]:
productLocationList[0].find("span").get_text()

'Produce'

In [13]:
storeJSON = {"StoreName":"Butler Plaza West","StoreNumber":1312,"Option":"ACDFJNORU","ShortStoreName":"Butler Plaza West"}

In [14]:
session = requests.Session()
session.post(publixProductUrl, json = storeJSON)

<Response [200]>

In [15]:
request = session.get(publixProductUrl)
page = BeautifulSoup(request.content, 'html.parser')

In [16]:
page.findAll("li", {"class" : "location"})

[]

### Searching for an item

In [51]:
PUBLIX_API_URL = "https://services.publix.com/api/v3/product/Search?storeNumber=537&keyword=tomato"

In [53]:
publix_api_response = requests.get(PUBLIX_API_URL)

In [67]:
products = publix_api_response.json()["Products"]

In [69]:
products[0]

{'Productid': 'RIO-PCI-108400',
 'itemcode': '25470',
 'title': 'Campari Tomatoes',
 'shortdescription1': None,
 'shortdescription2': None,
 'limitedquantitymsg': None,
 'priceline1': None,
 'priceline2': None,
 'displaytype': 'UQ1',
 'savingmsg': None,
 'validthrumsg': None,
 'onsalemsg': None,
 'rssinfo': '205',
 'rsslocation': 'Produce',
 'productimages': 'https://cutpcdnwimages.azureedge.net/images/products/25000/025470-75x75-A.jpg',
 'productimagesxl': 'https://cutpcdnwimages.azureedge.net/images/products/25000/025470-600x600-A.jpg',
 'activationstatus': 'A',
 'productmoreinfo': None,
 'specialpromotion': None,
 'age': 0,
 'sizeDescription': '16 oz Container',
 'nutritionalDescription': None,
 'advertising': False,
 'MarketingImages': [{'ImageURL': 'https://cutpcdnwimages.azureedge.net/images/static/MarketingIndicator/Natural-96x65.png',
   'Title': 'Made w/o artificial flavors, colors, preservatives &ndash; may contain colors from natural sources.',
   'Url': None}],
 'NoImageUrl

In [59]:
len(publix_api_response.json()["Products"])

20

In [73]:
for item in publix_api_response.json()["Products"]:
    print(item['fauxTaxonomy'][0] + item["title"] + " " + item["itemcode"])

Produce/TomatoesCampari Tomatoes 25470
Produce/TomatoesPlum Tomatoes 23332
Produce/TomatoesBeefsteak Tomato 25319
Produce/TomatoesGrape Tomatoes 25436
Produce/TomatoesTomatoes on the Vine 25320
Grocery/Canned Vegetables/TomatoesHunts Tomatoes, Puree 3404
Grocery/Canned Vegetables/TomatoesGia Russa Tomato 1192
Produce/TomatoesVine-Ripe Tomatoes 23325
Produce/Organic/TomatoesOrganic Tomatoes on the Vine 24112
Produce/Diced and Prepared VegetablesIncredibleFresh Tomato, Diced 23733
Produce/Diced and Prepared VegetablesIncredibleFresh Tomato Trinity 25263
Produce/Organic/TomatoesSignature Brand Organics Grape Tomatoes, Organic 25698
Produce/TomatoesVillage Farms Tomatoes, Heavenly Villagio Marzano 955156
Grocery/Canned Vegetables/TomatoesCento Tomato Paste, Organic, Double Concentrated 977443
Grocery/Canned Vegetables/TomatoesMuir Glen Tomatoes, Organic, Fire Roasted, Crushed 919345
Grocery/Canned Vegetables/TomatoesTuttorosso Tomatoes, in Puree, San Marzano Style, Chopped 238555
Grocery/R

In [15]:
def getPublixProductID(itemName):
    url = "https://services.publix.com/api/v3/product/Search?storeNumber=537&keyword=" + itemName
    
    api_response = requests.get(url).json()
    
    if(len(api_response["Products"]) == 0):
        return None
    
    return api_response["Products"][0]["Productid"]

In [16]:
getPublixProductID("cheerios")

'RIO-PCI-126971'

In [17]:
def getPublixLocation(itemName):
    # Store location for the search
    STORE_COOKIE = "%7B%22StoreName%22%3A%22Butler%20Plaza%20West%22%2C%22StoreNumber%22%3A1312%2C%22Option%22%3A%22ACDFJNORU%22%2C%22ShortStoreName%22%3A%22Butler%20Plaza%20West%22%7D"
    
    # Get product ID from Publix
    productID = getPublixProductID(itemName)
    url = "http://publix.com/pd/" + productID
    
    # Set location by sending a cookie through a session.
    session = requests.Session()
    session.post(url, cookies = {'Store': STORE_COOKIE})
    
    # Make get request for product page information.
    request = session.get(url)
    page = BeautifulSoup(request.content, 'html.parser')
    
    # Extract location from page.
    locationTags = page.findAll("li", {"class" : "location"})
    
    return locationTags[0].find("span").get_text()

In [18]:
getPublixLocation("cheerios")

'Aisle 5 - Cereals'

## Extract location from string

In [1]:
testLocation = 'Aisle 5 - Cereals'

In [7]:
def extractLocation(location) :
    split = location.split(' - ')
    if len(split) > 1:
        return {
            "location": split[0],
            "section": split[1],
        }
    else:
        return {
            "location": split[0]
        }

In [8]:
extractLocation(testLocation)

{'location': 'Aisle 5', 'section': 'Cereals'}

In [9]:
extractLocation("Produce")

{'location': 'Produce'}

## Pandas

In [4]:
import json
import pandas as pd

In [34]:
data = [
  {
    "id": 1,
    "name": "tortillas",
    "obtained": False,
    "publixLocation": "Aisle 4",
    "publixSection": "Tortillas/Pita"
  },
  {
    "id": 4,
    "name": "other",
    "obtained": False,
    "publixLocation": "Aisle 4",
    "publixSection": "Tortillas/Pita"
  },
  {
    "id": 2,
    "name": "tomato",
    "obtained": False,
    "publixLocation": "Produce",
    "publixSection": None
  },
  {
    "id": 3,
    "name": "caprisun",
    "obtained": False,
    "publixLocation": "Aisle 2",
    "publixSection": "Fruit Drinks"
  },
    {
    "id": 4,
    "name": "other2",
    "obtained": False,
    "publixLocation": "Aisle 4",
    "publixSection": "Tortillas/Pita"
  },
]

In [35]:
data

[{'id': 1,
  'name': 'tortillas',
  'obtained': False,
  'publixLocation': 'Aisle 4',
  'publixSection': 'Tortillas/Pita'},
 {'id': 2,
  'name': 'tomato',
  'obtained': False,
  'publixLocation': 'Produce',
  'publixSection': None},
 {'id': 3,
  'name': 'caprisun',
  'obtained': False,
  'publixLocation': 'Aisle 2',
  'publixSection': 'Fruit Drinks'}]

In [43]:
def groupByLocation(data, key):
  dataframe = pd.DataFrame(data)
  return dataframe.groupby(key).apply(lambda x: x.to_dict(orient='records'))

In [44]:
groupByLocation(data, 'publixLocation').to_dict()

{'Aisle 2': [{'id': 3,
   'name': 'caprisun',
   'obtained': False,
   'publixLocation': 'Aisle 2',
   'publixSection': 'Fruit Drinks'}],
 'Aisle 4': [{'id': 1,
   'name': 'tortillas',
   'obtained': False,
   'publixLocation': 'Aisle 4',
   'publixSection': 'Tortillas/Pita'}],
 'Produce': [{'id': 2,
   'name': 'tomato',
   'obtained': False,
   'publixLocation': 'Produce',
   'publixSection': None}]}