# Mine Diamond Data

## Introduction
In this notebook we will mine our data from as many sources as possible to prepare our dataset. We will target diamond merchants on the web, starting with Blue Nile and then Brilliant Earth (sorry to both... but thanks for the data).

We'll start by importing our packages for scraping and regular expressions, then quickly define a function to get the page content of a link.

In [1]:
import os
import re

import requests
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

In [2]:
def get_page_content(page_link, timeout=5, parser='html.parser'):
    """
    Download a web page and parse it with BeautifulSoup.

    Parameters
    ----------
    page_link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 5).
    parser : str, optional
        Name of the BeautifulSoup parser to use. Naming it explicitly keeps
        the parse tree the same on every machine instead of depending on
        which optional parsers happen to be installed.

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    page_response = requests.get(page_link, timeout=timeout)
    # Fail loudly on an error response rather than parsing an error page.
    page_response.raise_for_status()
    return BeautifulSoup(page_response.content, parser)

In [3]:
def cleanhtml(raw_html):
    """
    Remove HTML tags from string.

    Everything between a '<' and the next '>' is dropped, including tags
    whose attributes wrap over several lines. Text outside of tags is
    returned unchanged (HTML entities are not decoded).

    Parameters
    ----------
    raw_html : str
        Markup to strip.

    Returns
    -------
    str
        The input with every tag removed.
    """
    # DOTALL lets '.' match newlines, so a tag that spans more than one line
    # is still removed; the non-greedy '?' stops at the first closing '>'.
    return re.sub('<.*?>', '', raw_html, flags=re.DOTALL)

## Blue Nile

In [4]:
# For conciseness, we abbreviate Blue Nile as 'bn' in variable and function names.
bn_link = 'https://www.bluenile.com/uk/diamond-search'

In [5]:
def get_bn_headers(page_content):
    """
    Retrieves the headers for our Blue Nile dataframe

    Parameters
    ----------
    page_content : bs4.BeautifulSoup
        Parsed Blue Nile search page.

    Returns
    -------
    list of str
        Text of the first <span> found in each <div> of the header row,
        in document order. Divs without a <span> are skipped.

    Raises
    ------
    ValueError
        If the header grid or its row cannot be found in `page_content`.
    """
    headers_grid = page_content.find('div', {'class': 'grid-header normal-header'})
    if headers_grid is None:
        raise ValueError("No 'grid-header normal-header' div found in the page content")

    headers_row = headers_grid.find('div', {'class': 'row'})
    if headers_row is None:
        raise ValueError("No 'row' div found inside the header grid")

    headers = []
    for div in headers_row.find_all('div'):
        span = div.find('span')
        # Skip divs without a span explicitly, instead of stringifying None
        # and filtering out the text 'None' afterwards (which would also
        # drop a genuine header reading "None").
        if span is None:
            continue
        headers.append(cleanhtml(str(span)))
    return headers

In [38]:
# Fetch the Blue Nile search page and extract the column headers from it.
bn_page_content = get_page_content(bn_link)
bn_headers = get_bn_headers(bn_page_content)
print(bn_headers)

['Compare', 'Shape', 'Price', 'Carat', 'Cut', 'Colour', 'Clarity', 'Polish', 'Symmetry', 'Fluorescence', 'Depth', 'Table', 'L/W', 'Price/Ct', 'Culet', 'Stock No.', 'Dispatch Date']


In [55]:
def find_bn_default_filters(page_content):
    """
    Locate the <input> elements of the default Blue Nile search filters.

    Parameters
    ----------
    page_content : bs4.BeautifulSoup
        Parsed Blue Nile search page.

    Returns
    -------
    dict
        Maps a filter name to what was found for it:

        - 'image_view': first <input> of the image-type filter, or None
        - 'astor': first <input> of the ASTOR filter (a checkbox), or None
        - 'price': list of every <input> of the price filter
        - 'carat': list of every <input> of the carat filter

        A filter whose container div is absent from the page yields None
        (single-input filters) or an empty list (multi-input filters).
    """
    def _filter_container(css_class):
        # The container div carrying exactly this class attribute, or None.
        return page_content.find('div', {'class': css_class})

    def _first_input(css_class):
        container = _filter_container(css_class)
        return None if container is None else container.find('input')

    def _all_inputs(css_class):
        container = _filter_container(css_class)
        return [] if container is None else container.find_all('input')

    return {
        'image_view': _first_input('filter-container image-type-filter toggled'),
        'astor': _first_input('astor-filter-layout inline-filter'),
        'price': _all_inputs('filter-container price-filter toggled'),
        'carat': _all_inputs('filter-container carat-filter toggled with-tooltip'),
    }
    

In [73]:
# Inspect the first <input> inside the 'astor-filter-layout inline-filter' div.
bn_page_content.find('div',{'class':'astor-filter-layout inline-filter'}).find('input')

<input type="checkbox"/>

In [150]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [159]:
# Location of the ChromeDriver executable. Set the CHROMEDRIVER_PATH
# environment variable to run this notebook on another machine; the fallback
# is the path that was previously hard-coded here.
chromedriver_path = os.environ.get(
    'CHROMEDRIVER_PATH',
    'C:/Users/Edward Sims/Downloads/chromedriver.exe',
)
browser = webdriver.Chrome(chromedriver_path)

In [160]:
# Load the Brilliant Earth loose-diamond search page in the Selenium-driven browser.
browser.get('https://www.brilliantearth.com/loose-diamonds/search/')

In [161]:
# Look up the '<shape>-details' element for each diamond shape.
# find_element(By.CLASS_NAME, ...) replaces the find_element_by_class_name
# helper, which newer Selenium releases no longer provide.
oval_details = browser.find_element(By.CLASS_NAME, 'oval-details')
cushion_details = browser.find_element(By.CLASS_NAME, 'cushion-details')
princess_details = browser.find_element(By.CLASS_NAME, 'princess-details')
pear_details = browser.find_element(By.CLASS_NAME, 'pear-details')
emerald_details = browser.find_element(By.CLASS_NAME, 'emerald-details')
marquise_details = browser.find_element(By.CLASS_NAME, 'marquise-details')
asscher_details = browser.find_element(By.CLASS_NAME, 'asscher-details')
radiant_details = browser.find_element(By.CLASS_NAME, 'radiant-details')
heart_details = browser.find_element(By.CLASS_NAME, 'heart-details')

In [162]:
# Click each of the '<shape>-details' elements, one after another.
shape_elements = (
    oval_details,
    cushion_details,
    princess_details,
    pear_details,
    emerald_details,
    marquise_details,
    asscher_details,
    radiant_details,
    heart_details,
)
for shape_element in shape_elements:
    shape_element.click()

In [176]:
# Empty both price fields. find_element(By.ID, ...) replaces the
# find_element_by_id helper, which newer Selenium releases no longer provide.
browser.find_element(By.ID, 'min_price_display').clear()
browser.find_element(By.ID, 'max_price_display').clear()

In [193]:
# Handle on the minimum-price field. find_element(By.ID, ...) replaces the
# find_element_by_id helper, which newer Selenium releases no longer provide.
min_price = browser.find_element(By.ID, 'min_price_display')

In [194]:
# Type the lower price bound into the 'min_price_display' field.
min_price.send_keys('420')

In [197]:
# Send the Return key to the minimum-price field (presumably to commit the typed value).
# NOTE(review): the recorded output of this cell is an
# ElementNotInteractableException, so the field could not receive the key
# press at that point — TODO confirm how the page expects the value to be committed.
min_price.send_keys(Keys.RETURN)

ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=74.0.3729.169)
  (Driver info: chromedriver=74.0.3729.6 (255758eccf3d244491b8a1317aa76e1ce10d57e9-refs/branch-heads/3729@{#29}),platform=Windows NT 10.0.18362 x86_64)


In [188]:
# Handle on the maximum-price field. find_element(By.ID, ...) replaces the
# find_element_by_id helper, which newer Selenium releases no longer provide.
max_price = browser.find_element(By.ID, 'max_price_display')

In [190]:
# Type the upper price bound into the 'max_price_display' field.
max_price.send_keys('5000')

In [7]:
# Select the div with class 'grid-body' from the parsed Blue Nile page
# (presumably the body of the results grid — TODO confirm).
page_table = bn_page_content.find('div',{'class':'grid-body'})

In [39]:
# Print the raw markup of every 'shape-cell-wrapper' div nested in a link of the grid body.
for anchor in page_table.find_all('a'):
    shape_cells = anchor.find_all('div', {'class': 'shape-cell-wrapper'})
    for shape_cell in shape_cells:
        print(shape_cell)

<div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div>
<div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div>
<div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div>
<div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div>
<div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div>
<div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div>
<div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div>
<div class="shape-ce

In [37]:
# Collect the tag-stripped text of every 'shape-cell-wrapper' div found under
# a link of the grid body. Each cell is visited exactly once, so the list
# holds one entry per shape cell.
shapes = [
    cleanhtml(str(div))
    for a in page_table.find_all('a')
    for div in a.find_all('div', {'class': 'shape-cell-wrapper'})
]

# Show the list built in this cell.
print(shapes)

['Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round', 'Round']


In [22]:
# NOTE(review): `rows_containers` is not assigned in any cell visible in this
# part of the notebook, so this display relies on leftover kernel state —
# TODO restore the cell that defines it.
rows_containers

[<div class="cell-border-right compare" style="display:table-cell"><label class="comparison-diamonds-wrapper"><span class="comparison-star"></span></label></div>,
 <div class="row-cell shape"><div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div></div>,
 <div class="shape-cell-wrapper"><span><span class="diamond-search-sprite RD-360"></span></span><span class="single-cell">Round</span></div>,
 <div class="row-cell price"><span class="">£277.20</span></div>,
 <div class="row-cell carat"><span class="">0.23</span></div>,
 <div class="row-cell cut"><div class="cell-wrapper"><span class="label">Good</span><span class="label-small-view">Good</span></div></div>,
 <div class="cell-wrapper"><span class="label">Good</span><span class="label-small-view">Good</span></div>,
 <div class="row-cell color"><span class="">I</span></div>,
 <div class="row-cell clarity"><span class="">SI1</span></div>,
 <div class="row-c

In [None]:
# Strip the markup from the fourth entry of `rows`.
# NOTE(review): `rows` is not assigned in any cell visible in this part of
# the notebook — TODO define it before this call.
cleanhtml(str(rows[3]))