# Web scraper to extract initial specifications and product links

## DRAFT - currently under development

### Important Details:
Website: https://www.appliancesonline.com.au/

In [18]:
# Import modules
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

from time import sleep
import pandas as pd
import pickle

In [19]:
# find element : Helper function
def find_element(element, element_tag, element_type):
    """Look up a child of ``element`` and return one piece of data from it.

    Args:
        element: Selenium element (or driver) to search within.
        element_tag: CSS selector when ``element_type`` is
            ``'css_selector'``; class name when it is ``'href'``.
        element_type: ``'css_selector'`` to get the matched child's visible
            text, or ``'href'`` to get the matched child's ``href``
            attribute.

    Returns:
        The text or ``href`` value, or ``None`` when the child cannot be
        found or ``element_type`` is neither of the two supported values.
    """
    try:
        if element_type == 'css_selector':
            return element.find_element_by_css_selector(element_tag).text

        if element_type == 'href':
            link = element.find_element_by_class_name(element_tag)
            return link.get_attribute('href')

    except NoSuchElementException:
        # A missing field is expected for some products; report it as None.
        return None

    # Unsupported element_type: nothing was looked up.
    return None

In [20]:
# get_product_data : Helper function
def get_product_data(driver, data_dict):
    """Append one record per product cell on the current page to ``data_dict``.

    Records are keyed by a running integer that continues from the number of
    entries already in ``data_dict``, so repeated calls across pages produce
    consecutive keys. Each field is read with ``find_element`` and is
    ``None`` when the corresponding element is absent from the cell.

    Args:
        driver: Selenium WebDriver positioned on a product listing page.
        data_dict: dict of records collected so far; it is updated in place.

    Returns:
        The same ``data_dict`` object, with the new records added.
    """
    grid_cells = driver.find_elements_by_css_selector(
        'aol-product[trackinglistname="grid"] div.inner'
    )

    # Start numbering where the previous page left off.
    for row_index, cell in enumerate(grid_cells, start=len(data_dict)):
        record = {"ID": str(row_index).zfill(5)}
        record["Product_Name"] = find_element(cell, 'div.product-name', 'css_selector')
        record["Product_Link"] = find_element(cell, 'body-link', 'href')
        record["Price"] = find_element(cell, 'div.price', 'css_selector')
        record["Review_Count"] = find_element(cell, 'span.label', 'css_selector')
        record["Review_Score"] = find_element(cell, 'span.avg-rating', 'css_selector')
        record["Original_Price"] = find_element(cell, 'div.amount', 'css_selector')
        data_dict[row_index] = record

    return data_dict

In [21]:
# openPage : Helper function
def openPage(url_prefix, url_suffix, driver, page_size=24):
    """Scrape every results page of a product listing.

    The URL of page ``n`` is ``url_prefix + str(n) + url_suffix``. Page 1 is
    loaded first to read the total product count, which determines how many
    further pages are requested.

    Args:
        url_prefix: URL text that precedes the page number.
        url_suffix: URL text that follows the page number.
        driver: Selenium WebDriver used to load the pages.
        page_size: Number of products listed per page. Defaults to 24, the
            value this function previously hard-coded.

    Returns:
        dict mapping a running integer index to one product record, as
        built by ``get_product_data``.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError(f'page_size must be at least 1, got {page_size}')

    product_data = {}

    # Load the first page; pause afterwards (presumably so the page can
    # finish rendering before it is queried).
    driver.get(url_prefix + '1' + url_suffix)
    sleep(2)

    # The count label starts with the number of products; drop any
    # thousands separators before converting.
    count_text = driver.find_element_by_css_selector(
        'div.products-bar div.products-count'
    ).text
    product_count = int(count_text.split(" ")[0].replace(',', ''))
    print(f'There were {product_count} products detected')

    # Get data from the first page
    product_data = get_product_data(driver, data_dict=product_data)

    # Ceiling division: an exact multiple of page_size must not trigger a
    # request for a page beyond the last one.
    total_pages = (product_count + page_size - 1) // page_size

    # Get data from the rest of the pages
    for page_number in range(2, total_pages + 1):
        sleep(2)
        driver.get(url_prefix + str(page_number) + url_suffix)
        sleep(2)
        product_data = get_product_data(driver, data_dict=product_data)

    return product_data


In [22]:
# save_DataFrame : helper function
def save_DataFrame(df, filename):
    """Pickle ``df`` to ``filename`` using the highest available protocol.

    Args:
        df: Object to serialise (a DataFrame in this notebook).
        filename: Path of the file to write; an existing file is overwritten.
    """
    with open(filename, mode='wb') as out_file:
        writer = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(df)


In [23]:
# load_DataFrame : helper function     
def load_DataFrame(filename):
    """Read back an object previously written with ``pickle``.

    Unpickling can execute arbitrary code, so only load files that come
    from a trusted source.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(filename, mode='rb') as in_file:
        return pickle.Unpickler(in_file).load()

# Main program

In [24]:
# Product page that lists the products
page_prefix = 'https://www.appliancesonline.com.au/filter/consumer-electronics/tvs/?currentpage='
page_suffix = '&sortkey=highestrated'

# incognito mode
chrome_options = Options()
chrome_options.add_argument('--incognito')
driverPATH = 'C:\\University\\Master Class\\chromedriver_win32\\chromedriver.exe'
chrome_Browser = webdriver.Chrome(driverPATH, options=chrome_options)

# Get the product names and links; always close the browser afterwards,
# even if scraping fails part-way, so no Chrome session is left running.
try:
    product_dict = openPage(page_prefix, page_suffix, chrome_Browser)
finally:
    chrome_Browser.quit()

# Convert to a dataframe (one row per product)
product_df = pd.DataFrame(product_dict).T

# Display the product_df
product_df.head(3)

There were 145 products detected


Unnamed: 0,ID,Product_Name,Product_Link,Price,Review_Count,Review_Score,Original_Price
0,0,Samsung 85 Inch Q70A 4K UHD QLED Smart TV QA85...,https://www.appliancesonline.com.au/product/sa...,"$5,771",from 1320 reviews,4.8,"$58 off RRP of $5,829"
1,1,Samsung 75 Inch Q70A 4K UHD QLED Smart TV QA75...,https://www.appliancesonline.com.au/product/sa...,"$3,464",from 1320 reviews,4.8,"$35 off RRP of $3,499"
2,2,Samsung 65 Inch Q70A 4K UHD QLED Smart TV QA65...,https://www.appliancesonline.com.au/product/sa...,"$2,085",from 1320 reviews,4.8,"$714 off RRP of $2,799"


In [25]:
# Fix some columns
# Every comprehension yields exactly one value per row (None for a missing
# field) so the result always matches the dataframe's length. Filtering the
# missing prices out instead would produce a shorter list and make the
# column assignment fail.
product_df['Price'] = [
    int(price.replace('$', '').replace(',', '')) if price is not None else None
    for price in product_df['Price']
]
# Review count text looks like "from 1320 reviews": the number is the second word.
product_df['Review_Count'] = [
    int(review_count.split(" ")[1]) if review_count is not None else None
    for review_count in product_df['Review_Count']
]
# Original price text looks like "$58 off RRP of $5,829": the RRP follows the second "$".
product_df['Original_Price'] = [
    int(orig_price.replace(',', '').split("$")[2]) if orig_price is not None else None
    for orig_price in product_df['Original_Price']
]

# Derived variable => Discount calculation
product_df['Discount'] = product_df['Original_Price'] - product_df['Price']
product_df

Unnamed: 0,ID,Product_Name,Product_Link,Price,Review_Count,Review_Score,Original_Price,Discount
0,00000,Samsung 85 Inch Q70A 4K UHD QLED Smart TV QA85...,https://www.appliancesonline.com.au/product/sa...,5771,1320.0,4.8,5829.0,58.0
1,00001,Samsung 75 Inch Q70A 4K UHD QLED Smart TV QA75...,https://www.appliancesonline.com.au/product/sa...,3464,1320.0,4.8,3499.0,35.0
2,00002,Samsung 65 Inch Q70A 4K UHD QLED Smart TV QA65...,https://www.appliancesonline.com.au/product/sa...,2085,1320.0,4.8,2799.0,714.0
3,00003,Samsung 55 Inch Q70A 4K UHD QLED Smart TV QA55...,https://www.appliancesonline.com.au/product/sa...,1774,1320.0,4.8,2209.0,435.0
4,00004,Samsung 65 Inch Q80A 4K UHD QLED Smart TV QA65...,https://www.appliancesonline.com.au/product/sa...,2950,1126.0,4.8,3489.0,539.0
...,...,...,...,...,...,...,...,...
140,00140,TCL 55 Inch C825 Mini LED 4K UHD HDR Smart QLE...,https://www.appliancesonline.com.au/product/tc...,1999,,,,
141,00141,TCL 50 Inch P725 4K UHD HDR Smart Android TV 5...,https://www.appliancesonline.com.au/product/tc...,605,,,799.0,194.0
142,00142,"LG 49LF5100 49"" 123cm Full HD LED LCD TV",https://www.appliancesonline.com.au/product/lg...,1088,,,1099.0,11.0
143,00143,TCL 43 Inch P725 4K UHD HDR Smart Android TV 4...,https://www.appliancesonline.com.au/product/tc...,685,,,699.0,14.0


In [26]:
# Persist product_df under a user-supplied base name (".pickle" is appended)
filename = input("SAVE FILENAME: Please enter the name of the file:")
pickle_path = f'{filename}.pickle'
save_DataFrame(product_df, pickle_path)

# Blank line, then confirmation of where the file went
print()
print(f'Dataframe has been saved to : {filename}.pickle')

SAVE FILENAME: Please enter the name of the file: TV_names_and_links_df



Dataframe has been saved to : TV_names_and_links_df.pickle
