# Mine Diamond Data

## Introduction
In this notebook we will mine our data from as many sources as possible to prepare our dataset. We will target diamond merchants on the web, starting with Brilliant Earth (sorry Brilliant Earth... But thanks for the data).

We'll start by importing our packages for scraping and regular expression, then quickly making a function to get the page content of a link. 

In [1]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time

In [2]:
def get_page_content(page_link):
    page_response = requests.get(page_link, timeout=5)
    page_content = BeautifulSoup(page_response.content)
    return(page_content)

In [3]:
def cleanhtml(raw_html):
    """
    Remove HTML tags from string.
    """
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return(cleantext)

## Blue Nile

In [4]:
# For conciseness, Blue Nile we will denote as 'bn'
bn_link = 'https://www.bluenile.com/uk/diamond-search'

In [5]:
def get_bn_headers(page_content):
    """
    Retrieves the headers for our Blue Nile dataframe
    """
    headers_grid = page_content.find('div',{'class':'grid-header normal-header'})
    headers_row = headers_grid.find('div', {'class':'row'})
    
    # Find all headers, and remove the tags from the string
    headers_containers = []
    for div in headers_row.find_all('div'):
        headers_containers.append(cleanhtml(str(div.find('span'))))
    
    # Remove all 'None' string values from list
    headers = list(filter(('None').__ne__, headers_containers))
    headers.remove('Compare')
    
    return(headers)

In [6]:
bn_page_content = get_page_content(bn_link)
bn_headers = get_bn_headers(bn_page_content)
print(bn_headers)

['Shape', 'Price', 'Carat', 'Cut', 'Colour', 'Clarity', 'Polish', 'Symmetry', 'Fluorescence', 'Depth', 'Table', 'L/W', 'Price/Ct', 'Culet', 'Stock No.', 'Dispatch Date']


In [80]:
browser = webdriver.Chrome('C:/Users/Edward Sims/Downloads/chromedriver.exe')
browser.get('https://www.bluenile.com/uk/diamond-search?track=NavDiaSea')

In [81]:
# Uncheck the 360 view option for more data
view_checkbox = browser.find_element_by_class_name('bn-checkbox')
view_checkbox.click()
time.sleep(2)

# Open more filters
more_filters = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[13]')
more_filters.click()
time.sleep(2)

# Open polish option
polish_add = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[12]/div[1]/div[1]/div/div/div')
polish_add.click()
time.sleep(2)

# Open symmetry option
symmetry_add = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[12]/div[2]/div[1]/div/div/div')
symmetry_add.click()
time.sleep(2)

# Open fluorescence option
fluorescence_add = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[12]/div[3]/div[1]/div/div')
fluorescence_add.click()
time.sleep(2)

# Open depth % option
depth_add = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[12]/div[4]/div[1]/div/div')
depth_add.click()
time.sleep(2)

# Open table % option
table_add = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[12]/div[5]/div[1]/div/div')
table_add.click()
time.sleep(2)

# Open L/W Ratio option
lw_add = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[12]/div[6]/div[1]/div/div')
lw_add.click()
time.sleep(2)

# Add culet column
culet_add = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[12]/div[8]/div[2]/button')
culet_add.click()
time.sleep(2)

In [82]:
princess_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[2]/div[3]')
emerald_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[3]/div[3]')
asscher_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[4]/div[3]')
cushion_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[5]/div[3]')
marquise_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[6]/div[3]')
radiant_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[7]/div[3]')
oval_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[8]/div[3]')
pear_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[9]/div[3]')
heart_details = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[6]/div[2]/div/div[10]/div[3]')

princess_details.click()
time.sleep(0.5)
emerald_details.click()
time.sleep(1)
asscher_details.click()
time.sleep(1)
cushion_details.click()
time.sleep(0.5)
marquise_details.click()
time.sleep(0.5)
radiant_details.click()
time.sleep(1)
oval_details.click()
time.sleep(0.5)
pear_details.click()
time.sleep(1)
heart_details.click()
time.sleep(0.5)

In [148]:
def get_bn_data():
    """
    Loops through all the price values in tens, scrapes
    """
    bn_headers = []

    # Get the headers for our table
    headers_data = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/section/div/div/div[1]/div')
    headers_html = BeautifulSoup(headers_data.get_attribute('innerHTML'))
    
    for div in headers_html.find_all('div'):
        for header in div.find_all('span'):
            bn_headers.append(cleanhtml(str(header)))
    bn_headers = list(filter(('').__ne__, bn_headers))
    bn_headers.remove('Compare')
    
    bn_df = pd.DataFrame(columns=headers)
    
    min_price = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[7]/div[2]/div/div[1]/input[1]')
    max_price = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[7]/div[2]/div/div[1]/input[2]')

    # Get the min and max values (minus £ and comma values)
    min_price_value = int(min_price.get_attribute('value')[1:].replace(',', ''))
    max_price_value = int(max_price.get_attribute('value')[1:].replace(',', ''))
    
    # Find a neutral zone to click on
    neutral = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/div[1]/div[2]/div[3]/div[7]/div[1]/h3')
    
    # Loop through prices to limit numbers displayed - ISSUE WITH DUPES/OVERLAPS?   
    for min_val in range(min_price_value, 231, 10):
        
        # Edit min price
        min_price.click()
        min_price.send_keys(Keys.BACKSPACE)
        time.sleep(1)
        min_price.send_keys(str(min_val))
        neutral.click()
        #time.sleep(1.4)
        
        # Edit max price
        max_price.click()
        max_price.send_keys(Keys.BACKSPACE)
        time.sleep(2)
        max_price.send_keys(str(min_val+10))
        neutral.click()
        #time.sleep(1.2)
        
        # Scrape the table! First get the raw table html
        for a in browser.find_elements_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/section/div/div/div[2]'):
            table_web_source = browser.find_element_by_xpath('//*[@id="react-app"]/div/div/div/section[1]/section/div/div/div[2]')
            table_html = BeautifulSoup(table_web_source.get_attribute('innerHTML'))
            table_rows_html = table_html.find_all('a',{'class':'grid-row row '})
            #time.sleep(1.8)
            
            
            # Loop through each row 
            for row in table_rows_html:
                bn_data = []
                # Loop through each value and store in list
                for value in row.find_all('span'):
                    bn_data.append(cleanhtml(str(value)))
                bn_data = list(filter(('').__ne__, bn_data)) # Remove all empty values
                bn_data = []
                print(bn_data)
        #bn_df = bn_df.append(dict(zip(bn_headers,bn_data)),ignore_index=True)


    
    return()

In [149]:
get_bn_data()

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]


KeyboardInterrupt: 

In [105]:
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [106]:
test_headers = ['a','b','c']
test_data = [1,2,3]

In [108]:
data.append(dict(zip(test_headers,test_data)), ignore_index=True)

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9
3,1,2,3


In [135]:
data = []

In [136]:
data.append(1)