In [None]:
from urllib.error import HTTPError, URLError
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import os
import os.path

In [None]:
def scrap(link, category, tries, max_tries=5):
    """Fetch a category page through the ScraperAPI proxy, retrying on failure.

    The page is only requested when ``excel_dir/{category}.csv`` does not exist
    yet; an existing file is treated as "already scraped".

    Args:
        link: URL of the page to fetch. Any ``https://`` in it is removed
            before the URL is appended to the proxy address.
        category: Category name, used to locate the csv file.
        tries: Number of failed attempts already counted (callers pass 0).
        max_tries: Failed-attempt count at which the function gives up.

    Returns:
        The response object returned by ``urlopen``, or ``None`` when the csv
        file already exists or every attempt failed.
    """
    if 'https://' in link:
        link = link.replace('https://', '')
    # Prefer a key supplied through the SCRAPERAPI_KEY environment variable so
    # it does not have to live in the notebook; the literal is kept only as a
    # fallback so existing setups keep working.
    key = os.environ.get('SCRAPERAPI_KEY', '1783319260d70f84da21868ce0fd6207')
    # NOTE(review): link is embedded unescaped, so a '&name=value' part of it is
    # presumably read by the proxy as a separate parameter - confirm, and
    # consider percent-encoding the link.
    api_link = f"http://api.scraperapi.com?api_key={key}&url={link}"
    path = f'excel_dir/{category}.csv'
    if os.path.exists(path):
        print(f'\t{category} has already been scrapped')
        return None

    # Retry in one flat loop. Recursing from inside the loop would throw away
    # the recursive result and then retry again, multiplying the requests.
    while True:
        try:
            return urlopen(api_link)
        except URLError:
            # URLError is the base class of HTTPError, so this covers both bad
            # HTTP statuses and connection-level failures.
            tries += 1
            print(f'\tScrap: Failed - Attempt: {tries}')
            if tries >= max_tries:
                print('\tScrap: Failed!')
                return None

In [None]:
def write(page, category):
    """Save the body of a fetched page as ``html_dir/{category}.html``.

    Args:
        page: Response object to read from; ``read()`` must return UTF-8
            encoded bytes. It is closed once its body has been read.
        category: Category name, used as the file name.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Read data on page, then release the response even if reading fails
    try:
        html_bytes = page.read()
    finally:
        close = getattr(page, 'close', None)
        if close is not None:
            close()
    # Decode data from page
    html = html_bytes.decode("utf-8")
    # Write html file from data scrapped
    with open(f"html_dir/{category}.html", "w", encoding="utf-8") as html_file:
        html_file.write(html)

In [None]:
def _digits_to_int(text):
    """Return the decimal digits found in ``text`` joined into one integer."""
    # Keep decimal digits only: spaces, ',-' and words such as 'Før' are
    # dropped instead of reaching int() and making it fail.
    return int(''.join(char for char in text if char.isdecimal()))


def read(category):
    """Parse ``html_dir/{category}.html`` into a dictionary of products.

    Args:
        category: Category name; selects the html file and is stored with
            every product.

    Returns:
        A dict keyed by item number. Each value is the list
        ``[name, price_now, price_before, sale, available, stats, image,
        link, 'komplett', category]`` where ``price_before`` and ``sale`` are
        ``None`` when no previous price is shown, and ``available`` is ``None``
        when the product has no stock element.

    Raises:
        ValueError: If a price text contains no digits.
    """
    # Read html file from scrapped data
    with open(f"html_dir/{category}.html", "r", encoding='utf-8') as html_file:
        output = html_file.read()

    soup = BeautifulSoup(output, 'lxml')
    # These two categories are looked up with a longer class string
    if category in ('skjermer', 'skjermkort'):
        products = soup.find_all('div', class_='product-list-item subscription-price-visible')
    else:
        products = soup.find_all('div', class_='product-list-item')

    product_dict = {}
    for product in products:
        href = product.a['href']
        link = str('komplett.no' + href)
        # The third '/'-separated piece of the href is used as the image name
        image = str('komplett.no/img/p/800/' + href.split('/')[2] + '.jpg')

        price_now = _digits_to_int(product.find('span', class_='product-price-now').text)

        # A previous price is only present on some products
        sale = None
        price_before = product.find('div', class_='product-price-before')
        if price_before is not None:
            price_before = _digits_to_int(price_before.text)
            sale = True

        name = product.h2.text.replace('å', 'aa').replace('ø', 'o')
        stats = product.p.text.replace(',', '').split()

        # First word of the stock text; the two known "none" markers become 0.
        # A missing stock element yields None rather than an exception object.
        stock = product.find('span', class_='stockstatus-stock-details')
        if stock is None:
            available = None
        else:
            available = stock.text.replace('å', 'aa').replace('ø', 'o').split(' ')[0].replace('+', '')
            if available == 'Ikke' or available == 'Bestillingsvare.':
                available = 0

        # Item number: the text after the first ':' in the part before the first '/'
        itemnumber = product.find('div', class_='product-data').text.replace(' ', '').replace('\n', '').split('/')[0].split(':')[1]

        product_dict[itemnumber] = [name, price_now, price_before, sale, available, stats, image, link, 'komplett', category]

    return product_dict

In [None]:
# Listing pages handed to komplett_scrap(). The last path segment of each link
# (the text before any '?') becomes the category name, with '-' turned into '_'.
links = [
        'https://www.komplett.no/category/12831/hvitevarer/stekeovner?nlevel=10639%C2%A712831&hits=48',
        'https://www.komplett.no/category/12839/hvitevarer/kjoeleskap?',
        'https://www.komplett.no/category/21169/mobiler-klokker/smartklokker?nlevel=10444%C2%A721169&hits=120',
        'https://www.komplett.no/category/21064/mobiler-klokker/mobiltelefoner?nlevel=10444%C2%A721064&hits=216',
        'https://www.komplett.no/category/10088/datautstyr/lagring/harddiskerssd?nlevel=10000%C2%A728001%C2%A710088&hits=288',
        'https://www.komplett.no/category/12840/hvitevarer/frysere',
        'https://www.komplett.no/category/12821/hvitevarer/toerketromler',
        'https://www.komplett.no/category/753/hvitevarer/mikroboelgeovner?',
        'https://www.komplett.no/category/12824/hvitevarer/oppvaskmaskiner?nlevel=10639%C2%A712824&hits=48',
        'https://www.komplett.no/category/11157/tv-lyd-bilde/tv-video/tv-er?nlevel=10719%C2%A730000%C2%A711157&hits=240',
        'https://www.komplett.no/category/11158/datautstyr/skjermer/skjermer?nlevel=10000%C2%A710392%C2%A711158&hits=264',
        'https://www.komplett.no/category/21635/gaming/gaming-utstyr/gaming-tastatur?nlevel=10431%C2%A721603%C2%A721635&hits=168',
        'https://www.komplett.no/category/199889/hjem-fritid/stoevsugere-rengjoering?nlevel=10560%C2%A7199889&hits=120',
        'https://www.komplett.no/category/21605/gaming/gaming-utstyr/gaming-mus?nlevel=10431%C2%A721603%C2%A721605&hits=168',
        'https://www.komplett.no/category/21640/gaming/gaming-utstyr/gaming-headset?nlevel=10431%C2%A721603%C2%A721640&hits=192',
        'https://www.komplett.no/category/21607/gaming/gaming-utstyr/gaming-musematte?nlevel=10431%C2%A721603%C2%A721607&hits=72',
        'https://www.komplett.no/category/21650/gaming/gaming-utstyr/spillkontrollere?hits=120',
        'https://www.komplett.no/category/11204/datautstyr/pc-komponenter/prosessorer?nlevel=10000%C2%A728003%C2%A711204&hits=144',
        'https://www.komplett.no/category/10111/datautstyr/pc-komponenter/hovedkort?nlevel=10000%C2%A728003%C2%A710111&hits=240',
        'https://www.komplett.no/category/10412/datautstyr/pc-komponenter/skjermkort?nlevel=10000%C2%A728003%C2%A710412&hits=360',
        ]

In [None]:
def komplett_scrap(links):
    """Scrape every listing page in ``links`` and store each one as a csv file.

    The category name of a link is its last path segment (the text before any
    ``?``) with ``-`` replaced by ``_``. For every link where ``scrap`` hands
    back a page, the page is saved with ``write``, parsed with ``read`` and the
    products are written to ``excel_dir/{category}.csv``. Links for which
    ``scrap`` returns ``None`` are left alone.

    Args:
        links: Iterable of listing-page URLs.
    """
    # Both output folders have to be present before anything is saved
    for folder in ('html_dir', 'excel_dir'):
        if not os.path.exists(folder):
            os.mkdir(folder)

    # Column headers of the csv, in the order read() lists the product fields
    header = ['name', 'price_now', 'price_before', 'sale', 'available', 'stats', 'image', 'link', 'site', 'category']

    for url in links:
        # Category of the items behind the current link
        path_only = url.partition('?')[0]
        category = path_only.rsplit('/', 1)[-1].replace('-', '_')
        print(category)

        # Each link starts with zero failed attempts
        response = scrap(url, category, 0)
        if response is None:
            continue

        print('\tScrap: Success!')
        print('\tWriting...')
        write(response, category)
        print('\tReading...')
        products = read(category)

        # One row per product, saved as a csv file
        table = pd.DataFrame(products.values(), columns=header)
        table.to_csv(f'excel_dir/{category}.csv')
        print(f'\tScrapped: {category}')

In [None]:
# Run the scraper over every link in the list defined above
komplett_scrap(links)