# Imports

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
from tqdm import tqdm

from selenium import webdriver

Selenium notes:

- https://stackoverflow.com/questions/40555930/selenium-chromedriver-executable-needs-to-be-in-path
- https://able.bio/rhett/web-scraping-with-python-using-beautiful-soup-and-selenium--44jqsra
- https://dirtycoder.net/2018/02/07/using-chrome-in-headless-mode-with-selenium-and-python/
- https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python


# done

## URL UTILS

In [3]:
def url_gen(year, week):
    """Return the enhver.no price URL in the form ``<base>/<week>/<year>``."""
    parts = ['https://enhver.no/priser', str(week), str(year)]
    return '/'.join(parts)

In [4]:
# Week number -> year lookups to loop over: weeks 1-52 of 2019, weeks 1-44 of 2020
y2019 = dict.fromkeys(range(1, 53), 2019)
y2020 = dict.fromkeys(range(1, 45), 2020)

In [5]:
# One URL per (week, year) entry of y2019, in dict order
urls_2019 = [url_gen(year = y, week = w) for w, y in y2019.items()]

In [6]:
# One URL per (week, year) entry of y2020, in dict order
urls_2020 = [url_gen(year = y, week = w) for w, y in y2020.items()]

In [7]:
# Full list of URLs: all 2019 entries followed by all 2020 entries
urls = [*urls_2019, *urls_2020]

## SINGLE URL FETCH

In [8]:
def fetch_single_url_selenium(url, executeable_path = r'C:\Users\wschupp\Desktop\NOMNCA - retail price scrape\chromedriver.exe'):
    """
    Load one page in headless Chrome and return its product rows as a DataFrame.

    Parameters
    ----------
    url : str
        Page address. The year is read from the last four characters and the
        week from the first ``/<digits>/`` segment, i.e. the layout produced
        by ``url_gen``.
    executeable_path : str
        Location of the chromedriver binary passed to ``webdriver.Chrome``.

    Returns
    -------
    pandas.DataFrame
        One row per element with class ``product``: the fields produced by
        ``_contents`` plus ``year`` and ``week``. Empty when the page has no
        such elements.
    """
    # Initialising driver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('headless')
    chrome_options.add_argument('--no-sandbox')

    # Start exactly one browser. Creating a second driver and rebinding the
    # name would leave the first Chrome process running with nothing to quit it.
    driver = webdriver.Chrome(executable_path=executeable_path, options = chrome_options)
    try:
        driver.get(url)

        #fetching name, id, prices
        contents = _contents(driver.find_elements_by_class_name('product'))
    finally:
        # closing browser, also when loading or parsing fails
        driver.quit()

    #adding dates to each content in contents
    if contents:
        # Both values depend only on the url, so parse them once, not per row
        year = int(url[-4:])
        week = int(re.search(r'/(\d+)/', string = url).group(1))
        for row in contents:
            row['year'] = year
            row['week'] = week

    # into df
    return pd.DataFrame(contents)
    

In [9]:
def _contents(webelements):
    '''
    Loops over each row
    '''
    container = []
    
    for webelement in webelements:
        container.append(_row_contents(webelement))
    
    return container
    

In [10]:
def _row_contents(webelement):
    
    container = {}
    
    #id
    container['product_id'] = webelement.get_attribute('data-product-id')
    
    #long str with all content
    content_str = webelement.text
    content_list = content_str.split('\n')
    
    #names and type
    container['product_name'] = content_list[0].strip()
    container['product_type'] = content_list[1].strip()
    
    #Prices
    try:
        price_list = content_list[2].split(' ')
        price_list = [float(i) for i in price_list]
    except:
        price_list = [None for i in range(0,6)]
    
    container['price_KIWI'] = price_list[0]
    container['price_MENY'] = price_list[1]
    container['price_OBS'] = price_list[2]
    container['price_REMA 1000'] = price_list[3]
    container['price_SPAR'] = price_list[4]
    container['price_EXTRA'] = price_list[5]

    
    return container
    
    
    

## Loop over all urls

In [11]:
def fetch_multiple_url_selenium(urls, sleep = 5, **kwargs):
    """
    Scrape several pages one after another and stack the results.

    Parameters
    ----------
    urls : iterable of str
        Pages to scrape, processed in order.
    sleep : int or float
        Seconds to pause between two consecutive requests.
    **kwargs
        Forwarded to ``fetch_single_url_selenium`` (e.g. ``executeable_path``).

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-page frames; an empty DataFrame when ``urls``
        is empty.
    """
    urls = list(urls)
    if not urls:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame()

    container = []
    last_position = len(urls) - 1

    for position, url in enumerate(tqdm(urls)):

        container.append(fetch_single_url_selenium(url, **kwargs))

        # The pause only spaces out requests; nothing follows the last one
        if position < last_position:
            time.sleep(sleep)

    return pd.concat(container)
        
    

In [12]:
# Spot-check a single generated URL (list position 13)
urls[13]

'https://enhver.no/priser/14/2019'

In [13]:
# Scrape every URL in `urls` with the default pause and keep the combined result in `test`
test = fetch_multiple_url_selenium(urls = urls)

100%|██████████████████████████████████████████████████████████████████████████████████| 96/96 [27:55<00:00, 17.45s/it]


In [14]:
# Write the scraped table to disk as UTF-8 CSV.
# NOTE(review): no `index` argument is passed, so pandas' default applies and the
# index is written as well — confirm that column is wanted downstream.
test.to_csv('data_2019_2020.csv', encoding= 'utf-8')

# dev

## URL gens

In [None]:
def url_gen(year, week):
    """Compose ``https://enhver.no/priser/<week>/<year>`` from the arguments."""
    base_url = 'https://enhver.no/priser'
    return f'{base_url}/{week}/{year}'

In [None]:
# Week number -> year lookups to loop over: weeks 1-52 of 2019, weeks 1-43 of 2020
y2019 = dict.fromkeys(range(1, 53), 2019)
y2020 = dict.fromkeys(range(1, 44), 2020)

In [None]:
# One URL per (week, year) entry of y2019, in dict order
urls_2019 = [url_gen(year = y, week = w) for w, y in y2019.items()]

In [None]:
# One URL per (week, year) entry of y2020, in dict order
urls_2020 = [url_gen(year = y, week = w) for w, y in y2020.items()]

In [None]:
# Full list of URLs: all 2019 entries followed by all 2020 entries
urls = [*urls_2019, *urls_2020]

## fetch for single url

In [None]:
# Pick the second-to-last URL as the working example for the cells below and show it
url = urls[-2]
print(url)

In [None]:
def fetch_single_url(url, timeout = 30):
    """
    Download ``url`` with requests and parse the body with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : int or float
        Seconds passed to ``requests.get`` so a stalled server cannot block
        the call indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response text parsed with the ``lxml`` parser.

    Raises
    ------
    requests.HTTPError
        If the response status code is anything other than 200. Without this
        check the function would fail on an unassigned ``soup`` instead.
    """
    #Sending request -> into soup
    response = requests.get(url, timeout = timeout)
    if response.status_code != 200:
        raise requests.HTTPError(
            f'Unexpected status {response.status_code} for {url}',
            response = response,
        )

    return BeautifulSoup(response.text, 'lxml')
    

### SOUP

In [None]:
# Fetch and parse the example page with requests + BeautifulSoup
soup = fetch_single_url(url)

In [None]:
# List every <tr> element present in the statically fetched HTML
soup.find_all('tr')

In [None]:
# Print every <script> tag preceded by its position in the document
c = -1  # stays -1 when the page has no <script> tags
for c, i in enumerate(soup.find_all('script')):
    print(c)
    print(i)

In [None]:
# Show the <script> tag at position 12 as a plain string
str(soup.find_all('script')[12])

In [None]:
# List every <tr> element present in the statically fetched HTML
soup.find_all('tr')

### SELENIUM

In [None]:
# Headless Chrome session used by the exploration cells below
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('headless')
# chrome_options.add_argument('window-size=1920x1080')

# Start a single browser. Creating a driver twice and rebinding `driver` would
# leave the first Chrome process running with no handle left to quit it.
driver = webdriver.Chrome(executable_path=r'C:\Users\wschupp\Desktop\NOMNCA - retail price scrape\chromedriver.exe', options = chrome_options)
driver.get(url)

In [None]:
# Raw text of the first 'product' element — the string that _row_contents
# splits into name, type and prices
driver.find_elements_by_class_name('product')[0].text

In [None]:
# Shut down the exploration browser session
driver.quit()

In [None]:
def fetch_single_url_selenium(url, executeable_path = r'C:\Users\wschupp\Desktop\NOMNCA - retail price scrape\chromedriver.exe'):
    """
    Load one page in headless Chrome and return its parsed product rows.

    Parameters
    ----------
    url : str
        Page address. The year is read from the last four characters and the
        week from the first ``/<digits>/`` segment, so one- and two-digit
        week numbers both work.
    executeable_path : str
        Location of the chromedriver binary passed to ``webdriver.Chrome``.

    Returns
    -------
    list of dict
        The output of ``_contents`` for every element with class ``product``,
        each dict extended with ``year`` and ``week``.
    """
    # Initialising driver
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('headless')

    # Start exactly one browser; a second, rebinding creation would orphan the first
    driver = webdriver.Chrome(executable_path=executeable_path, options = chrome_options)
    try:
        driver.get(url)

        #fetching name, id, prices
        contents = _contents(driver.find_elements_by_class_name('product'))
    finally:
        # closing browser, also when loading or parsing fails
        driver.quit()

    #adding dates to each content in contents
    if contents:
        year = int(url[-4:])
        # A fixed slice such as url[-7:-5] yields '/1' for week 1 and breaks
        # int(); match the digits between the slashes instead.
        week = int(re.search(r'/(\d+)/', string = url).group(1))
        for row in contents:
            row['year'] = year
            row['week'] = week

    return contents
    

In [None]:
# Try the selenium fetcher on the example URL and display what it returns
fetch_single_url_selenium(url)

In [None]:
# NOTE(review): fetch_single_url_selenium returns the parsed rows (not a WebDriver)
# and quits its browser before returning, so `driver` is not a live session here —
# TODO confirm this cell is a leftover from an earlier variant.
driver = fetch_single_url_selenium(url = url)

In [None]:
# Parse every 'product' element of the page currently loaded in `driver`
# (requires `driver` to be a live WebDriver)
_contents(driver.find_elements_by_class_name('product'))

In [None]:
def _contents(webelements):
    '''
    Loops over each row
    '''
    container = []
    
    for webelement in webelements:
        container.append(_row_contents(webelement))
    
    return container
    

In [None]:
def _row_contents(webelement):
    
    container = {}
    
    #id
    container['product_id'] = webelement.get_attribute('data-product-id')
    
    #long str with all content
    content_str = webelement.text
    content_list = content_str.split('\n')
    
    #names and type
    container['product_name'] = content_list[0].strip()
    container['product_type'] = content_list[1].strip()
    
    #Prices
    price_list = content_list[2].split(' ')
    price_list = [float(i) for i in price_list]
    
    container['price_KIWI'] = price_list[0]
    container['price_MENY'] = price_list[1]
    container['price_OBS'] = price_list[2]
    container['price_REMA 1000'] = price_list[3]
    container['price_SPAR'] = price_list[4]
    container['price_EXTRA'] = price_list[5]

    
    return container
    
    
    

In [None]:
# Parse only the first 'product' element to inspect the resulting dict
_row_contents(driver.find_elements_by_class_name('product')[0])

In [None]:
# Product id: value of the 'data-product-id' attribute on the first 'product' element
driver.find_elements_by_class_name('product')[0].get_attribute('data-product-id')

In [None]:
# Third text line of the first 'product' element, split on spaces:
# the raw price tokens that _row_contents converts to floats
driver.find_elements_by_class_name('product')[0].text.split('\n')[2].split(' ')