In [None]:
import requests
from bs4 import BeautifulSoup as Soup
import pandas as pd
import hashlib
import datetime


def parse_sitemap( url,headers):
    """Download a sitemap and flatten it, plus any nested sitemaps, into one DataFrame.

    Parameters
    ----------
    url : str
        Address of a sitemap (``<urlset>``) or sitemap index (``<sitemapindex>``).
    headers : list of str
        Names of the child tags to extract from every ``<url>`` entry
        (for example ``["loc", "lastmod"]``). These become column names; they
        are NOT HTTP headers.

    Returns
    -------
    pandas.DataFrame or bool
        A frame with the columns ``["Source"] + headers`` where ``Source`` is the
        MD5 hex digest of the sitemap URL the row came from. ``False`` is
        returned when the sitemap cannot be fetched (network error or non-200
        status) or contains neither ``<url>`` nor ``<sitemap>`` entries.
    """
    columns = ["Source"] + list(headers)

    # A hung connection would otherwise block the whole crawl, so bound the
    # request and treat transport errors like any other failed fetch.
    try:
        resp = requests.get(url, timeout=30)
    except requests.RequestException:
        return False
    if resp.status_code != 200:
        return False

    soup = Soup(resp.content, "xml")
    url_tags = soup.find_all('url')
    sitemap_tags = soup.find_all('sitemap')
    if not url_tags and not sitemap_tags:
        return False

    # Hash the sitemap's own address. The loops below use their own variable
    # names so the `url` parameter is never rebound before it is hashed.
    hash_sitemap = hashlib.md5(str(url).encode('utf-8')).hexdigest()

    rows = []
    for url_tag in url_tags:
        row = [hash_sitemap]
        for head in headers:
            field = url_tag.find(head)
            # Missing tags are recorded as the literal string "None".
            row.append("None" if field is None else field.string)
        rows.append(row)
    frames = [pd.DataFrame(rows, columns=columns)]

    # Recurse into nested sitemaps; a child that fails to load is skipped
    # instead of aborting the whole crawl.
    for sitemap_tag in sitemap_tags:
        loc = sitemap_tag.find('loc')
        if loc is None or not loc.string:
            continue
        child = parse_sitemap(loc.string.strip(), headers)
        if child is not False:
            frames.append(child)

    # Rows of this sitemap come first, followed by rows from nested sitemaps.
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return pd.DataFrame([], columns=columns)
    return pd.concat(non_empty, ignore_index=True)

In [None]:
# Do the web scraping for each url 

def get_product_info(results_page, product_url=None):
    """Extract one product's details from a parsed page and append them to the global ``df``.

    Parameters
    ----------
    results_page : bs4.BeautifulSoup
        Parsed HTML of a single product page.
    product_url : str, optional
        Address of the page. When omitted, the module-level variable ``url``
        is used if it exists (this mirrors the original implicit behaviour).

    Returns
    -------
    None
        The new row is appended to the module-level DataFrame ``df``; if no
        such frame exists yet, one is created from the row.

    Notes
    -----
    Elements that are not found on the page are stored as ``"N/A"`` (or an
    empty list for images) instead of raising ``AttributeError``.
    """
    global df

    def _find_text(name, default="N/A", **criteria):
        # Text of the first matching node, or `default` when nothing matches.
        node = results_page.find(name, **criteria)
        return default if node is None else node.text

    if product_url is None:
        product_url = globals().get('url')

    # The page exposes no material information.
    product_material = "NaN"

    display_name = _find_text('h1', class_='product__title')

    # Colour and price are stored with all spaces removed.
    color = str(_find_text('span', class_='product__swatch-text')).replace(" ", "")
    price = str(_find_text('span', class_='product__price')).replace(" ", "")

    # Collect every image source inside the gallery; <img> tags without a
    # `src` attribute are skipped rather than raising KeyError.
    image_link_color = []
    images_html = results_page.find('div', class_='product-images')
    if images_html is not None:
        image_link_color = [im.get('src') for im in images_html.find_all('img')
                            if im.get('src')]

    # All offered sizes, or ['N/A'] when the page has no size swatch.
    size_html = results_page.find('div', class_='swatch size')
    if size_html is None:
        sizes = ['N/A']
    else:
        sizes = [option.get('value') for option in size_html.find_all('input')
                 if option.get('value') is not None]

    brand_name = "Mate the label"

    description = _find_text('div', class_="product-accordions__single-content",
                             id="product-accordions__details")

    # No category is available, so the product name doubles as the low-level
    # category until it is refined later on.
    low_level = display_name
    second_hand = 'No'
    gender = "female"
    # `datetime` is imported as a module at the top of the file, so the date
    # class has to be reached through it.
    scrapped_date = datetime.date.today().strftime('%Y-%m-%d')

    new_row = {'display_name': display_name,
               'product_material': product_material,
               'color': color, 'size': sizes,
               'price': price, 'product_url': product_url,
               'image_link_color': image_link_color,
               'brand_name': brand_name,
               'description': description,
               'scrapped_date': scrapped_date,
               'low_level': low_level,
               'gender': gender,
               'secondhand': second_hand}

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    new_frame = pd.DataFrame([new_row])
    existing = globals().get('df')
    if existing is None:
        df = new_frame
    else:
        df = pd.concat([existing, new_frame], ignore_index=True)


In [None]:
# Crawl the Primark sitemap and keep the <loc> entry of every listed page.
result = parse_sitemap("https://www.primark.com/en-us/sitemap.xml", ["loc"])
if result is False:
    # parse_sitemap signals an unreachable or empty sitemap with False.
    raise RuntimeError("The Primark sitemap could not be downloaded or parsed")

# 'Unnamed: 0' only exists when the frame was round-tripped through a CSV that
# was written with its index; a fresh crawl does not have it, so ignore its absence.
df = result.drop(columns='Unnamed: 0', errors='ignore')
result_test = df
result_test

In [None]:
import re

# Column layout of the final product table.
PRODUCT_COLUMNS = ['id_style','display_name', 'materials', 'color', 'size', 'price', 'currency', 'product_url', 'image_links', 'brand_name', 'retailer', 'description', 'scrapped_date', 'high_level', 'low_level', 'gender', 'secondhand', 'shipping_from', 'style']

# Selector that identifies a product page.
# Refer to Category Links EXCEL for html_element and class_id.
PRODUCT_MARKER_ELEMENT = 'button'
PRODUCT_MARKER_CLASS = 'product-panel__collapse-button js-collapse-button'


def _infer_gender(page_url):
    """Guess the target gender from the words in a URL; defaults to 'women'.

    The URL is split into alphabetic words so that 'men' is only recognised as
    a whole word. A plain substring test would also match 'women'.
    """
    for word in re.split(r'[^a-z]+', str(page_url).lower()):
        if word in ('women', 'womens'):
            return 'women'
        if word in ('men', 'mens'):
            return 'men'
    return 'women'


def _scrape_primark_product(results_page, page_url):
    """Build one product record (a dict keyed by PRODUCT_COLUMNS) from a parsed page."""
    title = results_page.find('h1', class_='product-panel__name heading heading--h5')
    price_html = results_page.find('p', class_='heading heading--h5')
    # Currency and category are data attributes of the same container element.
    detail = results_page.find('div', 'product-detail product-view-item')
    image_meta = results_page.find("meta", property="og:image")

    image_links = []
    if image_meta is not None and image_meta.get('content'):
        image_links.append(image_meta['content'])

    return {'id_style': "NaN",
            'display_name': 'NaN' if title is None else title.text,
            'materials': "NaN",   # the page lists no materials
            'color': "NaN",
            'size': 'NaN',
            # The raw price text is padded with newlines and tabs.
            'price': 'NaN' if price_html is None else price_html.text.strip(),
            'currency': 'NaN' if detail is None else detail.get('data-currency', 'NaN'),
            'product_url': page_url,
            'image_links': image_links,
            'brand_name': 'Primark',
            'retailer': 'Primark',
            'description': "NaN",
            'scrapped_date': str(datetime.date.today()),
            'high_level': "NaN",
            'low_level': 'NaN' if detail is None else detail.get('data-category', 'NaN'),
            'gender': _infer_gender(page_url),
            'secondhand': 'No',
            'shipping_from': 'NaN',
            'style': 'NaN'}


product_urls = []
non_product_urls = []
records = []

# The page addresses live in the 'loc' column; fall back to the first column
# for frames that do not carry that name.
url_column = 'loc' if 'loc' in result_test.columns else result_test.columns[0]

for url in result_test[url_column]:
    # Sitemap values may be BeautifulSoup strings; normalise them to plain str.
    url = str(url)
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        response = None

    if response is None or response.status_code != 200:
        # Do not parse error pages; record the address as a non-product page.
        print("Failure")
        non_product_urls.append(url)
        continue
    print("Success")

    results_page = Soup(response.content,'html')
    if results_page.find(PRODUCT_MARKER_ELEMENT, class_ = PRODUCT_MARKER_CLASS) is None:
        non_product_urls.append(url)
        continue

    product_urls.append(url)
    records.append(_scrape_primark_product(results_page, url))

# Build the table once instead of appending row by row
# (DataFrame.append was removed in pandas 2.0 and was quadratic in a loop).
df = pd.DataFrame(records, columns=PRODUCT_COLUMNS)
    













<class 'str'>
Success
hello
<class 'str'>
Failure
hello
<class 'str'>
Failure
hello
<class 'str'>
Failure
hello
<class 'str'>
Failure
hello
<class 'str'>
Failure
hello
<class 'str'>
Success
['https://primedia.primark.com/s/primark/120905409_ms?locale=en-US,en-US,en-*,*&$zoom$?$poi$&w=1200&sm=aspect&aspect=1200:628']
women
Toiletries: Soaps, Scrubs & Bath Essentials
<class 'str'>
Success
['https://primedia.primark.com/s/primark/125272268_ms?locale=en-US,en-US,en-*,*&$zoom$?$poi$&w=1200&sm=aspect&aspect=1200:628']
women
Makeup Tools: Brushes, Curlers & More
<class 'str'>
Success
['https://primedia.primark.com/s/primark/127855821_ms?locale=en-US,en-US,en-*,*&$zoom$?$poi$&w=1200&sm=aspect&aspect=1200:628']
women
Foundation, Concealer, Primers, Powders & More
<class 'str'>
Success
['https://primedia.primark.com/s/primark/127855854_ms?locale=en-US,en-US,en-*,*&$zoom$?$poi$&w=1200&sm=aspect&aspect=1200:628']
women
Foundation, Concealer, Primers, Powders & More
<class 'str'>
Success
['https://

In [None]:
# Show the scraped product table; long frames are rendered truncated (first and last rows).
df

Unnamed: 0,id_style,display_name,materials,color,size,price,currency,product_url,image_links,brand_name,retailer,description,scrapped_date,high_level,low_level,gender,secondhand,shipping_from,style
0,,5-Pack Colorful Toothbrush Covers,,,,\n\t\t\t$1.50,USD,https://www.primark.com/en-us/all-products/cos...,[https://primedia.primark.com/s/primark/120905...,Primark,Primark,,2021-11-12,,"Toiletries: Soaps, Scrubs & Bath Essentials",women,No,,
1,,15-Pack Wooden Cuticle Stick,,,,\n\t\t\t$2.00,USD,https://www.primark.com/en-us/all-products/cos...,[https://primedia.primark.com/s/primark/125272...,Primark,Primark,,2021-11-12,,"Makeup Tools: Brushes, Curlers & More",women,No,,
2,,Bake It Setting Powder,,,,\n\t\t\t$4.00,USD,https://www.primark.com/en-us/all-products/cos...,[https://primedia.primark.com/s/primark/127855...,Primark,Primark,,2021-11-12,,"Foundation, Concealer, Primers, Powders & More",women,No,,
3,,Bake It Setting Powder,,,,\n\t\t\t$2.00,USD,https://www.primark.com/en-us/all-products/cos...,[https://primedia.primark.com/s/primark/127855...,Primark,Primark,,2021-11-12,,"Foundation, Concealer, Primers, Powders & More",women,No,,
4,,Bake It Setting Powder,,,,\n\t\t\t$4.00,USD,https://www.primark.com/en-us/all-products/cos...,[https://primedia.primark.com/s/primark/127855...,Primark,Primark,,2021-11-12,,"Foundation, Concealer, Primers, Powders & More",women,No,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,,Dusky Pink Premium T-Shirt Bra In Sizes D-F,,,,\n\t\t\t$10.00,USD,https://www.primark.com/en-us/all-products/wom...,[https://primedia.primark.com/s/primark/130691...,Primark,Primark,,2021-11-12,,Bras,men,No,,
187,,Black Basic Button Glasses Case,,,,\n\t\t\t$0.50,USD,https://www.primark.com/en-us/all-products/wom...,[https://primedia.primark.com/s/primark/130693...,Primark,Primark,,2021-11-12,,Women's Sunglasses,men,No,,
188,,Beige Basic Heart Button Glasses Case,,,,\n\t\t\t$3.50,USD,https://www.primark.com/en-us/all-products/wom...,[https://primedia.primark.com/s/primark/130693...,Primark,Primark,,2021-11-12,,Women's Sunglasses,men,No,,
189,,Hot Pink Seamfree Lace Trim Cropped Camisole,,,,\n\t\t\t$2.00,USD,https://www.primark.com/en-us/all-products/wom...,[https://primedia.primark.com/s/primark/130529...,Primark,Primark,,2021-11-12,,Women's Pyjamas,men,No,,


In [None]:
# Write the product table to disk. Note that the file is tab-separated despite
# its .csv extension, and that the index is written as the first column.
df.to_csv('primark_table.csv', sep='\t', encoding='utf-8')

# The browser download helper only exists on Google Colab; elsewhere the file
# simply stays in the working directory.
try:
    from google.colab import files
except ImportError:
    print("Saved primark_table.csv (google.colab is not available, skipping download)")
else:
    files.download("primark_table.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>