In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import chromedriver_binary  # Adds chromedriver binary to path
import json

import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'chromedriver_binary'

### Getting single-product info from a product-specific URL

In [None]:
def single_product(url, brand_name = 'Reformation'):
    """Scrape one product page and return its attributes as a dictionary.

    Parameters
    ----------
    url : str
        Address of the product page; it is fetched with ``urlopen`` and
        decoded as UTF-8.
    brand_name : str, optional
        Value stored under the ``"brand"`` key (default ``'Reformation'``).

    Returns
    -------
    dict
        Keys ``description``, ``color``, ``fabric``, ``sustainability``,
        ``product_detail``, ``image`` and ``brand``. ``description`` is a
        string and the next five are lists of strings; each of them is
        ``None`` when the page contains no matching element.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the page cannot be fetched.
    """
    # Fetch the page; the context manager closes the HTTP response even if
    # reading or decoding fails.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    # description -- None when the block is absent, consistent with the other
    # fields, instead of raising AttributeError on ``None.text``
    description_tag = soup.find('div', {'itemprop': "description"})
    description = description_tag.text if description_tag is not None else None

    # color (text is kept unstripped, as before)
    color_list = soup.find_all('div', {'class': "simple-tooltip simple-tooltip--color simple-tooltip--bottom"})
    color = [div.text for div in color_list] or None

    # fabric
    fabric_list = soup.find_all('li', {'class': 'pdp-product-data__padded'})
    fabric = [elem.text.strip() for elem in fabric_list] or None

    # sustainability
    sus_list = soup.find_all('span', {'class': 'pdp-product-data__text'})
    sus = [elem.text.strip() for elem in sus_list] or None

    # product details
    detail_list = soup.find_all('li', {'class': 'pdp-product-data__hyphen'})
    detail = [elem.text.strip() for elem in detail_list] or None

    # image -- the address is read from the 'data-src' attribute of each tag
    img_list = soup.find_all('img', {'class': 'snap-slider__media lazyload'})
    img = [elem['data-src'] for elem in img_list] or None

    return {"description": description,
            "color": color,
            "fabric": fabric,
            "sustainability": sus,
            "product_detail": detail,
            "image": img,
            "brand": brand_name}

### Getting all products from a landing page (e.g. All Clothing, Dresses)

In [None]:
def get_all_from_landing_page(landing_url):
    """Scrape every product listed on a category landing page.

    Opens ``landing_url`` in Chrome through Selenium and keeps scrolling to
    the bottom until the page height stops growing. On every pass, each
    product link whose id has not been seen yet is scraped with
    ``single_product`` and stored.

    Parameters
    ----------
    landing_url : str
        Address of the landing page (e.g. All Clothing, Dresses).

    Returns
    -------
    pandas.DataFrame
        One row per product id: the ``single_product`` fields plus ``id``,
        ``product_name``, ``data_analytics`` and ``url``.
    """
    # Function-scope imports keep this cell self-contained.
    from html import unescape
    from selenium.webdriver.common.by import By

    # Seconds to wait after each scroll so newly loaded products can appear.
    SCROLL_PAUSE_TIME = 5.0

    res = []
    seen_ids = set()  # constant-time "already scraped?" check

    wd = webdriver.Chrome()
    try:
        wd.get(landing_url)
        wd.execute_script("window.scrollTo(0,document.body.scrollHeight)")

        # Get scroll height
        last_height = wd.execute_script("return document.body.scrollHeight")

        while True:
            # Each matched element is expected to wrap one product link, e.g.:
            # <h2 class="product-summary__name">
            # <a data-analytics='{"event":"productClick","domEvent":"click","payload":{"id":"1307406","name":"Roe Top",...}}' href="/products/roe-top?color=Ivory&amp;via=...">Roe Top</a>
            # </h2>
            prod_sum_class = wd.find_elements(By.CLASS_NAME, 'product-summary__name')

            for elem in prod_sum_class:

                # for every product
                atag = elem.find_element(By.TAG_NAME, "a")

                try:
                    data_analytics = json.loads(atag.get_attribute('data-analytics'))
                    id_ = data_analytics['payload']['id']

                    if id_ not in seen_ids:
                        # extracting attributes (core url and name of the product)
                        url = atag.get_attribute('href')
                        # innerHTML is HTML-escaped ("&amp;" for "&"); decode it
                        # so the stored name is plain text
                        name = unescape(atag.get_attribute('innerHTML'))

                        # product information from beautifulsoup (url as input)
                        product_dict = single_product(url)

                        # adding attributes to the product dictionary
                        product_dict.update({'id': id_,
                                             'product_name': name,
                                             'data_analytics': data_analytics,
                                             'url': url,
                                            })

                        res.append(product_dict)
                        seen_ids.add(id_)

                        print(name, len(res))
                except Exception as err:
                    # Best effort: one broken product must not abort the whole
                    # scrape, but report it instead of hiding it. Catching
                    # Exception (not a bare except) lets Ctrl-C stop the run.
                    print('Skipping product:', repr(err))
                    continue

            # Scroll down to bottom
            wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)

            # Calculate new scroll height and compare with last scroll height;
            # an unchanged height means nothing more was loaded
            new_height = wd.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
    finally:
        # Always end the browser session, even when scraping fails part-way.
        wd.quit()

    # now returns a dataframe
    return pd.DataFrame(res)

### Scrape and save to csv

In [None]:
def scrape_save(url, filename):
    """Scrape a landing page, write the result to a CSV file and return it.

    Parameters
    ----------
    url : str
        Landing page passed to ``get_all_from_landing_page``.
    filename : str
        Destination of the CSV file; it is printed first as a progress marker.

    Returns
    -------
    pandas.DataFrame
        The scraped table, written to ``filename`` without the index column.
    """
    print(filename)
    scraped = get_all_from_landing_page(url)
    scraped.to_csv(filename, index=False)
    return scraped

### All Clothing, Dresses, Jeans, Basics, Sweaters

In [None]:
# Scrape the "All Clothing" category.
all_clothing = get_all_from_landing_page(
    landing_url="https://www.thereformation.com/categories/all-clothing"
)

Kim Dress 1
Virgo Dress 2
Cowboy High Rise Straight Jeans 3
Varenne Cashmere Tank And Cardi Set 4
Sigmund Dress 5
Candace Top 6
Antoinette  Dress 7
Roo Dress 8
Francine Skirt 9
Taylor Two Piece 10
Gem Dress 11
Osteria Open Back Sweater 12
Hart Cashmere Sweater 13
Jorja Two Piece 14
Boyfriend Sweatpant 15
Gavin Dress Es 16
Sorley Short 17
Westlake Dress 18
Rudi Dress 19
Evalina Dress 20
Cashmere Sweatsuit 21
Greenwich Coat 22
Jocelyn Dress 23
Elisabetta Dress 24
Lupine Dress 25
Luisa Cropped Cashmere Sweater 26
Foret Cable Knit Cardigan 27
Embry Dress 28
Mica Dress 29
Hari Dress 30
Cait Dress 31
Violet Dress 32
Prosecco Dress 33
Cynthia High Rise Straight Jeans 34
Marsha Dress 35
Emme Top 36
Kenmare Dress 37
Nell Top 38
Jax Dress 39
Mandi Dress 40
Chestnut Dress 41
Kailey Dress 42
Kylie Dress 43
Hunter Crop Sweatshirt 44
Relaxed Cropped Cashmere Crew 45
Cashmere Sweatpant 46
Julietta Top 47
Cashmere Polo Sweater 48
Juliette Dress 49
Radlee Dress 50
Lexington Two Piece 51
Roe Top 52
Boyf

In [None]:
# Wrap the scrape result in a DataFrame and save it without the index column.
all_clothing_df = pd.DataFrame(data=all_clothing)
all_clothing_df.to_csv(
    path_or_buf='Reformation_all_clothing.csv',
    index=False,
)

In [None]:
# Scrape the "Dresses" category.
dresses = get_all_from_landing_page(
    landing_url="https://www.thereformation.com/categories/dresses"
)

Kim Dress 1
Sigmund Dress 2
Antoinette  Dress 3
Noa Dress 4
Jourdan Dress 5
Juliette Dress 6
Christine Dress 7
Embry Dress 8
Galena Dress 9
Radlee Dress 10
Marlowe Dress 11
Cassatt Dress 12
Amethyst Dress 13
Westlake Dress 14
Paprika Dress 15
Roo Dress 16
Virgo Dress 17
Elisabetta Dress 18
Jax Dress 19
Prosecco Dress 20
Violet Dress 21
Gavin Dress 22
Nikita Dress 23
Gem Dress 24
Jocelyn Dress 25
Kylie Dress 26
Rudi Dress 27
Evalina Dress 28
Rou Dress 29
Cameron Dress 30
Kailey Dress 31
Hari Dress 32
Kenmare Dress 33
Mochi Dress 34
Mica Dress 35
Breslin Dress 36
Marsha Dress 37
Lupine Dress 38
Mandi Dress 39
Moonlight Dress 40
Maya Dress 41
Lexington Two Piece 42
Cassi Dress 43
Petites Nikita Dress 44
Lunar Dress 45
Navy Dress 46
Chestnut Dress 47
Tam Dress 48
Celina Dress 49
Kenna Dress 50
Disco Dress 51
Cait Dress 52
Zion Two Piece 53
Rahm Dress 54
Paprika Dress Es 55
Rayne Dress 56
Bronti Dress 57
Gaston Dress 58
Midnight Dress 59
Heath Dress 60
Livie Dress 61
Marcy Dress 62
Tally Dr

In [None]:
# Wrap the scrape result in a DataFrame and save it without the index column.
dresses_df = pd.DataFrame(data=dresses)
dresses_df.to_csv(
    path_or_buf='Reformation_dresses.csv',
    index=False,
)

In [None]:
# Scrape the "Jeans" category.
jeans_df = get_all_from_landing_page(
    landing_url="https://www.thereformation.com/categories/jeans"
)

Harper High Rise Skinny Jeans 1
Cowboy High Rise Straight Jeans 2
Cynthia High Rise Straight Jeans 3
Peyton High Rise Bootcut Jeans 4
Kayo High &amp; Skinny 5
Liza High Straight Jean 6
Liza High Straight Crop 7
Harper Ultra High Rise Skinny Jeans 8
Cynthia High Rise Straight Cropped Jeans 9
Harper Mid Rise Skinny Jeans 10
Peyton Slit Hem High Rise Bootcut Jeans 11
Kris High Rise Relaxed Curve Jeans 12
Harper High Rise Skinny Cropped Jeans 13
Marine Jean 14
Vintage High Straight Crop 15
Liza Button Fly High Rise Straight Jeans 16
Star Jean 17
Emma High Rise Wide Leg Jeans 18
Eloise High Rise Wide Leg Jeans 19
Newsprint High Rise Straight Long Jeans 20
Liza Ultra High Rise Straight Jeans 21
Julia Crop High Cigarette Jean 22
Saddle Jean 23
Serena High Skinny Jean 24
Cynthia Patch High Rise Straight Jeans 25
Petites Eloise Jean 26
Petites Cynthia High Relaxed Jean 27
Petites Liza High Straight Jean 28
Serena High Skinny Crop 29
Hailey Trouser Jean 30
Cynthia Quilted Jean 31
Petites High &a

In [None]:
# Save the jeans table without the index column.
jeans_df.to_csv(
    path_or_buf='Reformation_jeans.csv',
    index=False,
)

In [None]:
# Scrape the "Basics" category and save it in one step.
basics_df = scrape_save(
    url='https://www.thereformation.com/categories/basics',
    filename='Reformation_basics.csv',
)

Reformation_basics.csv
Kylie Dress 1
Hunter Crop Sweatshirt 2
Boyfriend Sweatpant 3
Cashmere Sweatsuit 4
Radlee Dress 5
Boyfriend Tee 6
Nisa Bodysuit 7
Classic Sweatpant 8
Boyfriend Longsleeve Tee 9
Dixon Waffle Short 10
Rikki Two Piece 11
Joplin Relaxed Tee 12
Marla Zip Sweatshirt 13
Alex Slim Tee 14
Marion Two Piece 15
Whitney Pant 16
Taylor Two Piece 17
Yale Waffle Zip 18
Boyfriend Knit Short 19
Hunter Classic Sweatshirt 20
Paige Top 21
Cashmere Henley Sweater 22
Brooks Classic Sweat Short 23
Tanner Classic Hoodie 24
Virgil Top 25
Keisha Top 26
Davis Waffle Top 27
Laguna Top 28
Riley Short 29
Manny Oversized Sweatshirt 30
Villa Two Piece Set 31
Stella Slim Scoop Tee 32
Davy Top 33
Oversized Cashmere Crew 34
Eddie Long Sleeve Tee 35
Kai Crop Tee 36
Relaxed V-Neck Tee 37
Cashmere Boyfriend Sweater 38
Maya Dress 39
Jackie Top 40
Amira Top 41
Bardot Top 42
Kendy Dress 43
Canyon Top 44
Cashmere Polo Sweater 45
Tam Dress 46
Muse Tee 47
Carrie Top 48
Delia Top 49
Rochelle Top 50
Giselle To

In [None]:
# Scrape the "Sweaters & Sweatshirts" category and save it in one step.
sweaters_df = scrape_save(
    url='https://www.thereformation.com/categories/sweaters-sweatshirts',
    filename='Reformation_sweaters.csv',
)

Reformation_sweaters.csv
Cashmere Sweatsuit 1
Varenne Cashmere Tank And Cardi Set 2
Cashmere Polo Sweater 3
Osteria Open Back Sweater 4
Luisa Cropped Cashmere Sweater 5
Hart Cashmere Sweater 6
Tille Tank And Cardi Set 7
Cashmere Crew 8
Cashmere Henley Sweater 9
Foret Cable Knit Cardigan 10
Relaxed Cropped Cashmere Crew 11
Villa Two Piece Set 12
Cashmere Boyfriend Sweater 13
Oversized Cashmere Crew 14
Cashmere Crew Cardigan 15
Sanzo Short 16
Sami Cropped Sweater 17
Cashmere Boyfriend Turtleneck 18
Fossi Cashmere Cardigan 19
Faro Deep V Cardigan 20
Relaxed Cashmere Wrap 21
Montaigne Sleeveless Sweater 22
Georges Oversized Cashmere Sweater 23
Germaine Cropped Cardigan 24
Dita Cable Knit Sweater 25
Hunter Classic Sweatshirt 26
Basilica Cashmere Sweater 27
Dauphine Cable Knit Sweater 28
Slim Cashmere Turtleneck 29
Champs Pointelle Sweater 30
Cashmere Crew Puff Sleeve 31
Lemartine Cable Knit Cardigan 32
Cashmere Boyfriend Cardigan 33
Piazza Cashmere Sweater 34
Via Puff Sleeve Cardigan 35
Vec

### Combined

In [None]:
# Stack the five category tables into one frame with a fresh 0..n-1 index.
aggregated = pd.concat(
    objs=[
        all_clothing_df,
        dresses_df,
        jeans_df,
        basics_df,
        sweaters_df,
    ],
    ignore_index=True,
)

In [None]:
# Keep the first row seen for each product id; ids are compared by their
# string form. A boolean mask on the id column selects the rows directly, so
# the result no longer depends on index labels coinciding with row positions,
# and only one column (not the whole frame) is cast to str.
agg_unique = (
    aggregated
    .loc[~aggregated['id'].astype(str).duplicated(keep="first")]
    .reset_index(drop=True)
)

agg_unique.to_csv('Reformation_table.csv', index=False)

### Reading in Reformation_table.csv and adding a price column from the 'data_analytics' payload

In [None]:
import ast

import pandas as pd

# Load the combined table; 'data_analytics' is read as text so that it can be
# parsed back into a dictionary below.
d = pd.read_csv(
    'Reformation_table.csv',
    dtype={'data_analytics': object},
)

# Turn each stored dictionary literal back into a dict, then pull the price
# out of its 'payload' entry.
d['data_analytics'] = d['data_analytics'].map(ast.literal_eval)
d['price'] = d['data_analytics'].map(lambda analytics: analytics['payload']['price'])

In [None]:
# Display the table to check the newly added 'price' column.
d

Unnamed: 0,description,color,fabric,sustainability,product_detail,image,brand,id,product_name,data_analytics,url,price
0,How dreamy. The Kim has a slim fitting bodice ...,['Buff'],"['Sustainably made in China', 'This is a light...","['8.0 lbs. of carbon dioxide savings', '4.0 ga...","['3 buttons at cuff', 'Center back cut out', '...",['https://media.thereformation.com/image/uploa...,Reformation,1307576,Kim Dress,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/kim-dr...,278.0
1,"Have we met. The Virgo is a vintage inspired, ...",['Almond'],"['Sustainably made in Los Angeles', 'This is a...","['9.0 lbs. of carbon dioxide savings', '4.0 ga...","['Bra friendly', 'Bustier seam detail', 'Cente...",['https://media.thereformation.com/image/uploa...,Reformation,1307231,Virgo Dress,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/virgo-...,298.0
2,Smart ass jeans. The Cowboy is slim fitting th...,['Chocolate'],"['Sustainably made in Los Angeles', 'This is a...","['22.0 lbs. of carbon dioxide savings', '8112....","['Back pockets', 'Center front button', 'Extra...",['https://media.thereformation.com/image/uploa...,Reformation,1306025,Cowboy High Rise Straight Jeans,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/cowboy...,128.0
3,Slip into something warmer. The Varenne is a s...,"['Oatmeal', 'Black', 'Dark Grey', 'Parisian Bl...","['Sustainably made in China', 'This is a light...","['822.0 lbs. of carbon dioxide savings', '3973...","['Oatmeal is a natural solid print', 'Boxy fit...",['https://media.thereformation.com/image/uploa...,Reformation,1307041,Varenne Cashmere Tank And Cardi Set,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/varenn...,228.0
4,Show some leg. The Sigmund is slim fitting in ...,"['Blitz', 'Black', 'Splatter', 'Boudoir']",['This is a lightweight drapey crepe fabric wi...,,"['Center back zipper', 'Elastic at cuff', 'Ela...",['https://media.thereformation.com/image/uploa...,Reformation,1305735,Sigmund Dress,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/sigmun...,248.0
...,...,...,...,...,...,...,...,...,...,...,...,...
545,Hope you're ready to get cozy. The Cesina is a...,,,,"['Long sleeve', 'Relaxed fitting', 'Ribbed fab...",['https://media.thereformation.com/image/uploa...,Reformation,1307047,Cesina Cashmere Sweater,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/cesina...,228.0
546,Stay warm out there. The Poste is a slightly o...,['Charcoal'],"['Sustainably made in China', 'This is an alpa...","['8.0 lbs. of carbon dioxide savings', '5.0 ga...","['Charcoal is a grey solid print', 'Hits below...",['https://media.thereformation.com/image/uploa...,Reformation,1307142,Poste Turtleneck Sweater,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/poste-...,228.0
547,Comfy. This is a classic pullover hoody with a...,"['Black', 'Gravel Heather']",,,['Gravel Heather is a grey solid print'],['https://media.thereformation.com/image/uploa...,Reformation,1303233,Patagonia Men's P6 Logo Uprisal Hoody,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/patago...,79.0
548,You probably need some Patagonia to go with th...,"['Oatmeal', 'Molten Lava']",['This is a medium weight polyester blend fabr...,,['Oatmeal is a natural solid print'],['https://media.thereformation.com/image/uploa...,Reformation,1303232,Patagonia Woolyester Fleece Pullover,"{'event': 'productClick', 'domEvent': 'click',...",https://www.thereformation.com/products/patago...,139.0


In [None]:
# Write the table, now including 'price', back to the file it was read from.
d.to_csv(
    path_or_buf='Reformation_table.csv',
    index=False,
)