# Imports

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas
import os
import csv

# Variables

In [5]:
# Table of pages to scrape, read from a path relative to the current working directory.
link_list = pandas.read_csv('../data/clean_urls_list.csv')
# The 'urls' column is what the scraping functions below iterate over (one request per entry).
urls = link_list['urls']
# HTTP headers sent with every request: a desktop Chrome User-Agent string.
# NOTE(review): presumably needed because the site rejects the default client User-Agent — confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Functions

In [6]:
def get_name(urls_to_iterate, timeout=10):
    """Scrape the city name from the breadcrumb navigation of each page.

    Every URL is downloaded with the module-level ``headers`` and parsed; the
    text of the ``span`` inside the third ``a.breadcrumb_link`` of
    ``nav.breadcrumb`` is taken as the city name. A progress dot is printed
    after each page.

    Parameters
    ----------
    urls_to_iterate : iterable of str
        Page addresses to fetch, in the order the names should be returned.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a stalled
        connection would block the whole run indefinitely.

    Returns
    -------
    list of str
        One city name per URL, in input order.

    Raises
    ------
    requests.RequestException
        On network failures, timeouts, or a 4xx/5xx response.
    ValueError
        If a page does not contain the expected breadcrumb structure; the
        message names the offending URL.
    """
    print('getting city names .', end=' ')
    cities_names = []
    for url in urls_to_iterate:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Stop on error responses instead of parsing an error page as a city page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        breadcrumb_nav = soup.find("nav", class_="breadcrumb")
        if breadcrumb_nav is None:
            raise ValueError(f"No breadcrumb navigation found on {url}")

        breadcrumb_items = breadcrumb_nav.find_all("a", class_="breadcrumb_link")
        # The city is read from the third breadcrumb link (index 2).
        if len(breadcrumb_items) < 3 or breadcrumb_items[2].span is None:
            raise ValueError(
                f"Breadcrumb on {url} has no third link with a <span> to read the city name from"
            )

        cities_names.append(breadcrumb_items[2].span.text)
        print('.', end=' ', flush=True)
    print()
    return cities_names

In [7]:
def get_price(urls_to_iterate, food, timeout=10):
    """Scrape the price of one item from each page.

    On every page the first ``td`` whose text contains ``food`` is located;
    the price is read from the ``span.first_currency`` inside the next
    sibling ``td``. The label match is a substring match and only the first
    matching cell is used, so ``food`` must be specific enough to identify
    the intended row. A progress dot is printed after each page.

    Parameters
    ----------
    urls_to_iterate : iterable of str
        Page addresses to fetch, in the order the prices should be returned.
    food : str
        Label text to search for in the table.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a stalled
        connection would block the whole run indefinitely.

    Returns
    -------
    list
        One entry per URL, in input order: the price as ``float``, or the
        string ``'Null'`` when the page shows ``'?'`` instead of a price.

    Raises
    ------
    requests.RequestException
        On network failures, timeouts, or a 4xx/5xx response.
    ValueError
        If the item row or its price cell is missing (the message names the
        URL), or if the price text cannot be converted to ``float``.
    """
    print(f'getting {food} prices .', end=' ')
    food_price = []
    for url in urls_to_iterate:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Stop on error responses instead of parsing an error page as a price table.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        food_td = soup.select_one(f"td:-soup-contains('{food}')")
        if food_td is None:
            raise ValueError(f"No table cell containing {food!r} found on {url}")

        next_td = food_td.find_next_sibling("td")
        price_span = None
        if next_td is not None:
            price_span = next_td.find('span', class_='first_currency')
        if price_span is None:
            raise ValueError(f"No price cell found next to {food!r} on {url}")

        price_text = price_span.text
        if price_text == '?':
            # The page uses '?' for an unknown price; keep the 'Null' placeholder.
            food_price.append('Null')
        else:
            # NOTE(review): float() raises on text with thousands separators — confirm the site's number format.
            food_price.append(float(price_text.replace('$', '').strip()))
        # Emit the progress dot for every page, including those without a price.
        print('.', end=' ', flush=True)
    print()
    return food_price

In [8]:
def get_price_per_city(urls, food):
    """Build a two-column table of city names and prices for one item.

    Calls ``get_name(urls)`` and ``get_price(urls, food)`` and pairs their
    results positionally.

    Parameters
    ----------
    urls : iterable of str
        Page addresses handed to both scraping functions.
    food : str
        Item label; also used to name the price column (``'<food> Price'``).

    Returns
    -------
    pandas.DataFrame or None
        A frame with the columns ``'City'`` and ``'<food> Price'``; ``None``
        (after printing an error message) when the two lists differ in length.
    """
    city_column = get_name(urls)
    price_column = get_price(urls, food)

    # Rows are matched by position, so both lists must be the same length.
    if len(city_column) == len(price_column):
        return pandas.DataFrame({'City': city_column, f'{food} Price': price_column})

    print("Error, number of cities and prices not equal.")
    return None

# Data collection

In [9]:
# Item labels searched for in each page's price table, keyed by their
# position in this list (0 = "Milk", 8 = "Banana", 18 = "Cigarettes 20 Pack").
items = dict(enumerate([
    "Milk",
    "Loaf of Fresh White Bread",
    "Rice",
    "Eggs",
    "Local Cheese",
    "Chicken Fillets",
    "Beef Round",
    "Apples",
    "Banana",
    "Oranges",
    "Tomato",
    "Potato",
    "Onion",
    "Lettuce",
    "Water",
    "Bottle of Wine",
    "Domestic Beer",
    "Imported Beer",
    "Cigarettes 20 Pack",
]))

The cell below can be adapted with a custom variable name and a custom item index to get the data you're interested in. <br>
*(the example below is for banana and milk)*

In [None]:
# Scrape one price table per item (items[8] is "Banana", items[0] is "Milk").
# Each call downloads every URL twice — once for the city name and once for
# the price — so this cell is network-bound and slow on a long URL list.
banana_price = get_price_per_city(urls, items[8])
milk_price = get_price_per_city(urls, items[0])

# Save Data

In [11]:
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is run from a first-level subfolder of the project — TODO confirm).
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
output_path = os.path.join(project_root, 'data', 'banana_price.csv')

# Write the banana table to <project_root>/data/banana_price.csv without the index column.
banana_price.to_csv(output_path, index=False)

In [12]:
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is run from a first-level subfolder of the project — TODO confirm).
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
output_path = os.path.join(project_root, 'data', 'milk_price.csv')

# Write the milk table to <project_root>/data/milk_price.csv without the index column.
milk_price.to_csv(output_path, index=False)