In [None]:
import requests
from bs4 import BeautifulSoup as Soup
import pandas as pd
import hashlib

def parse_sitemap( url,headers):
    """Crawl a sitemap (or sitemap index) and collect the requested tags in a DataFrame.

    Parameters
    ----------
    url : str
        Address of the sitemap XML document to fetch.
    headers : list of str
        Names of the child tags to read from every <url> entry (e.g. ["loc"]).
        They become the output columns; they are not HTTP headers.

    Returns
    -------
    pandas.DataFrame or bool
        One row per <url> entry. The "Source" column holds the MD5 hex digest of the
        URL of the sitemap the entry was read from, followed by one column per name
        in `headers` (the string "None" where an entry lacks that tag). Rows of this
        document come first, then the rows of nested sitemaps in document order.
        Returns False when the response status is not 200 or the document contains
        neither <url> nor <sitemap> entries.
    """
    # Fetch the document; anything but a 200 is reported as a failure.
    resp = requests.get(url)
    if resp.status_code != 200:
        return False

    # A sitemap holds <url> entries, a sitemap index holds <sitemap> entries.
    soup = Soup(resp.content, "xml")
    urls = soup.find_all('url')
    sitemaps = soup.find_all('sitemap')
    if not urls and not sitemaps:
        return False

    new_list = ["Source"] + headers

    # Hash of the parent sitemap URL for faster indexing. This is computed from the
    # `url` argument, which is why the loops below must not reuse that name.
    hash_sitemap = hashlib.md5(str(url).encode('utf-8')).hexdigest()

    # Extract the requested tags from every <url> entry of this document.
    out = []
    for u in urls:
        values = [hash_sitemap]
        for head in headers:
            loc = u.find(head)
            values.append(loc.string if loc else "None")
        out.append(values)
    panda_out = pd.DataFrame(out, columns=new_list)

    # Recurse into nested sitemaps. A child that cannot be fetched or parsed comes
    # back as False and is skipped so that one bad child does not abort the crawl.
    frames = [panda_out]
    for sitemap in sitemaps:
        sitemap_loc = sitemap.find('loc')
        if sitemap_loc is None or not sitemap_loc.string:
            continue
        panda_recursive = parse_sitemap(sitemap_loc.string, headers)
        if isinstance(panda_recursive, pd.DataFrame):
            frames.append(panda_recursive)

    # Concatenate once, leaving out empty frames.
    non_empty = [frame for frame in frames if not frame.empty]
    if not non_empty:
        return panda_out
    return pd.concat(non_empty, ignore_index=True)

In [None]:
# Crawl the store's sitemap, keeping only the <loc> value of every entry.
# parse_sitemap returns False instead of a DataFrame when the fetch or parse fails.
result = parse_sitemap("https://matethelabel.com/sitemap.xml", ["loc"])
result

Unnamed: 0,Source,loc
0,ee0f06c5515449ce9dc23dca683448b7,https://matethelabel.com/
1,ee0f06c5515449ce9dc23dca683448b7,https://matethelabel.com/products/linen-short-...
2,ee0f06c5515449ce9dc23dca683448b7,https://matethelabel.com/products/linen-crop-t...
3,ee0f06c5515449ce9dc23dca683448b7,https://matethelabel.com/products/organic-cott...
4,ee0f06c5515449ce9dc23dca683448b7,https://matethelabel.com/products/organic-cott...
...,...,...
453,352befa79371e26297c2a3c6dd49047f,https://matethelabel.com/blogs/mate-journal/q-...
454,352befa79371e26297c2a3c6dd49047f,https://matethelabel.com/blogs/mate-journal/ma...
455,352befa79371e26297c2a3c6dd49047f,https://matethelabel.com/blogs/mate-journal/kr...
456,352befa79371e26297c2a3c6dd49047f,https://matethelabel.com/blogs/mate-journal/q-...


In [None]:
import re

# The sitemap also lists pages that are not products (the output above shows the
# home page and blog posts); only URLs under /products/ are kept for scraping.
PRODUCT_URL_PREFIX = 'https://matethelabel.com/products/'

# startswith() compares against the prefix itself, so there is no hand-counted slice
# length to keep in sync with the literal. The isinstance check skips entries whose
# tag had no text (None) instead of raising on them.
result_final = [
    url
    for url in result[result.columns[1]]
    if isinstance(url, str) and url.startswith(PRODUCT_URL_PREFIX)
]


panda_new = pd.DataFrame(result_final, columns= ['products'])

Unnamed: 0,products
0,https://matethelabel.com/products/linen-short-...
1,https://matethelabel.com/products/linen-crop-t...
2,https://matethelabel.com/products/organic-cott...
3,https://matethelabel.com/products/organic-cott...
4,https://matethelabel.com/products/organic-cott...
...,...
257,https://matethelabel.com/products/mens-fleece-...
258,https://matethelabel.com/products/mens-fleece-...
259,https://matethelabel.com/products/organic-cott...
260,https://matethelabel.com/products/organic-cott...


In [None]:
# Preview the product URLs collected in the previous cell.
panda_new['products']

0                        https://matethelabel.com/products/linen-short-natural
1                     https://matethelabel.com/products/linen-crop-top-natural
2      https://matethelabel.com/products/organic-cotton-classic-tee-true-white
3            https://matethelabel.com/products/organic-cotton-classic-tee-bone
4        https://matethelabel.com/products/organic-cotton-classic-tee-charcoal
                                        ...                                   
257       https://matethelabel.com/products/mens-fleece-sweatpant-heather-grey
258          https://matethelabel.com/products/mens-fleece-sweatpant-jet-black
259          https://matethelabel.com/products/organic-cotton-beanie-jet-black
260            https://matethelabel.com/products/organic-cotton-beanie-natural
261             https://matethelabel.com/products/organic-cotton-beanie-sedona
Name: products, Length: 262, dtype: object

In [None]:
# Remove the column-width limit so long cell values (URLs, descriptions) are
# displayed in full instead of being truncated with "...".
pd.set_option('display.max_colwidth', None)

In [None]:
# Create an empty DataFrame to store the scraped information, one column per field.

import pandas as pd

df = pd.DataFrame(
    columns=[
        'display_name',
        'product_material',
        'color',
        'size',
        'price',
        'product_url',
        'image_link_color',
        'brand_name',
        'description',
        'scrapped_date',
        'low_level',
        'gender',
        'secondhand',
    ]
)

df


Unnamed: 0,display_name,product_material,color,size,price,product_url,image_link_color,brand_name,description,scrapped_date,low_level,gender,secondhand


In [None]:
# Scrape every product URL and add one row per product to `df`.


from datetime import datetime

url_lists = panda_new['products']

# Seconds to wait for a product page before treating the request as failed.
REQUEST_TIMEOUT_SECONDS = 30

# Captures the material text up to (not including) its second hyphen.
MATERIAL_REGEX = re.compile(r"(.*?-.*?)-.*")


def _node_text(node, default="N/A"):
    """Return the text of a parsed element, or `default` when the element was not found."""
    return default if node is None else str(node.text)


def _scrape_product_page(url):
    """Fetch one product page and return its fields as a dict keyed by the `df` columns.

    Returns None when the request fails or the response status is not 200.
    A field whose element is missing from the page is set to "N/A"
    (an empty list for the image links, ['N/A'] for the sizes).
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as error:
        print(f"Failure: {error}")
        return None
    if response.status_code != 200:
        # An error page has none of the product markup, so there is nothing to parse.
        print("Failure")
        return None
    print("Success")
    results_page = Soup(response.content,'html')

    # find product material; reset per page so a page without the accordion does
    # not inherit the previous product's material
    product_material = "N/A"
    for items in results_page.find_all('div', class_="product-accordions__single-content", id = "product-accordions__nutrition-information"):
        material_text = str(items.text)
        match = MATERIAL_REGEX.search(material_text)
        product_material = material_text if match is None else match.group(1)
        print(product_material)

    # find product title
    display_name = _node_text(results_page.find('h1', class_ = 'product__title'))

    # find product color
    color = _node_text(results_page.find('span', class_ = 'product__swatch-text')).replace(" ", "")

    # find price of the product
    price = _node_text(results_page.find('span', class_ = 'product__price')).replace(" ", "")

    # find image links; <img> tags without a src attribute are skipped
    image_link_color = []
    images_html = results_page.find('div', class_ = 'product-images')
    if images_html is not None:
        image_link_color = [im['src'] for im in images_html.find_all('img') if im.get('src')]
    print(image_link_color)

    # find every size offered for the product
    size_html = results_page.find('div', class_ = 'swatch size')
    if size_html is None:
        sizes = ['N/A']
    else:
        sizes = [product_size['value'] for product_size in size_html.find_all('input') if product_size.get('value')]

    # find description
    description = _node_text(results_page.find('div', class_="product-accordions__single-content", id = "product-accordions__details"))

    return {'display_name': display_name,
            'product_material': product_material,
            'color': color,
            # the full list of sizes, not just the last one seen while looping
            'size': sizes,
            'price': price,
            'product_url': url,
            'image_link_color': image_link_color,
            'brand_name': "Mate the label",
            'description': description,
            'scrapped_date': datetime.today().strftime('%Y-%m-%d'),
            # no category on the page, so default to the product name; regex it later on
            'low_level': display_name,
            # NOTE(review): hard-coded for every product, including "Men's" items
            # and items described as unisex; confirm how gender should be derived.
            'gender': "female",
            'secondhand': 'No'}


scraped_rows = []
try:
    for url in url_lists:
        if 'giftcard' in str(url):
            print("not gonna count gift card")
            continue
        print(url)
        new_row = _scrape_product_page(url)
        if new_row is not None:
            scraped_rows.append(new_row)
finally:
    # Build the frame once instead of appending row by row (DataFrame.append was
    # removed in pandas 2.0). Doing it in `finally` keeps the rows gathered so far
    # if the crawl is interrupted.
    if scraped_rows:
        scraped = pd.DataFrame(scraped_rows, columns=df.columns)
        df = scraped if df.empty else pd.concat([df, scraped], ignore_index=True)


print( " DONE DONE DONE DONE")

    










https://matethelabel.com/products/linen-short-natural
Success
               - 100% Natural Linen  
['//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2300_300x300.jpg?v=1621633340']
['//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2300_300x300.jpg?v=1621633340', '//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2304_300x300.jpg?v=1621633340']
['//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2300_300x300.jpg?v=1621633340', '//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2304_300x300.jpg?v=1621633340', '//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2326_300x300.jpg?v=1621633340']
['//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2300_300x300.jpg?v=1621633340', '//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2304_300x300.jpg?v=1621633340', '//cdn.shopify.com/s/files/1/051

In [None]:
# Replace newline characters in the scraped text with spaces (regex replacement
# applied across the whole frame). Note that this overwrites `df`.
df = df.replace(r'\n',' ', regex=True) 
df

Unnamed: 0,display_name,product_material,color,size,price,product_url,image_link_color,brand_name,description,scrapped_date,low_level,gender,secondhand
0,Linen Short,- 100% Natural Linen,Natural,XL,$78,https://matethelabel.com/products/linen-short-natural,"[//cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2300_300x300.jpg?v=1621633340, //cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2304_300x300.jpg?v=1621633340, //cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2326_300x300.jpg?v=1621633340, //cdn.shopify.com/s/files/1/0518/6281/products/LINENLONGSLEEVESHIRTLOOK_03_2332_300x300.jpg?v=1621633340]",Mate the label,"When in doubt, just breathe. These mid-rise 100% Natural Linen shorts make it easy. Super relaxed with a soft elastic waist, a flattering butterfly-shaped hem, and convenient back pockets.",2021-11-05,Linen Short,female,No
1,Linen Crop Top,- 100% Natural Linen,Natural,XL,$74,https://matethelabel.com/products/linen-crop-top-natural,"[//cdn.shopify.com/s/files/1/0518/6281/products/779NAT-1_5c6fdfc6-45d5-4afb-9829-439250a2be9f_300x300.jpg?v=1623954057, //cdn.shopify.com/s/files/1/0518/6281/products/779NAT-2_b7862d37-669c-4251-9d31-57c4a97118fd_300x300.jpg?v=1623954165, //cdn.shopify.com/s/files/1/0518/6281/products/779NAT-2_1886845a-74a0-4d64-8c06-04422dae939e_300x300.jpg?v=1623954133, //cdn.shopify.com/s/files/1/0518/6281/products/781NAT-5_300x300.jpg?v=1623954133]",Mate the label,Sweeten up your look with this 100% Natural Linen crop top featuring a heart-shaped topline. Wide straps and a relaxed shape keep things casual and cool.,2021-11-05,Linen Crop Top,female,No
2,Organic Cotton Classic Tee,- 100% Organic Cotton Jersey,True White,3X,$48,https://matethelabel.com/products/organic-cotton-classic-tee-true-white,"[//cdn.shopify.com/s/files/1/0518/6281/products/912TWHT-8_300x300.jpg?v=1606947294, //cdn.shopify.com/s/files/1/0518/6281/products/912TWHT-5_300x300.jpg?v=1606947294, //cdn.shopify.com/s/files/1/0518/6281/products/MATEEXTENDED103020205299_300x300.jpg?v=1606982727, //cdn.shopify.com/s/files/1/0518/6281/products/MATEEXTENDED103020205297_300x300.jpg?v=1606982727, //cdn.shopify.com/s/files/1/0518/6281/products/912TWHT-9_300x300.jpg?v=1606947294, //cdn.shopify.com/s/files/1/0518/6281/products/MATEEXTENDED103020205295_300x300.jpg?v=1606982727]",Mate the label,Every woman needs her crew and ours is one you can definitely count on. It's got the perfect relaxed fit and is made with 100% Organic Cotton Jersey that is so super soft. The search for the perfect crew neck tee is over.,2021-11-05,Organic Cotton Classic Tee,female,No
3,Organic Cotton Classic Tee,- 100% Organic Cotton Jersey,Bone,3X,$48,https://matethelabel.com/products/organic-cotton-classic-tee-bone,"[//cdn.shopify.com/s/files/1/0518/6281/products/764BON-1_656f28a9-d53d-4cf4-88ae-0e5ff920e555_300x300.jpg?v=1629225175, //cdn.shopify.com/s/files/1/0518/6281/products/764BON-2_300x300.jpg?v=1629225177, //cdn.shopify.com/s/files/1/0518/6281/products/764BON-3_300x300.jpg?v=1629225179, //cdn.shopify.com/s/files/1/0518/6281/products/6687fb489d714e56a85e1af84f0b79de.thumbnail.0000000_small.jpg?v=1629225865, //cdn.shopify.com/s/files/1/0518/6281/products/764BON-4_300x300.jpg?v=1629225182, //cdn.shopify.com/s/files/1/0518/6281/products/764BON-5_300x300.jpg?v=1629225184, //cdn.shopify.com/s/files/1/0518/6281/products/764BON-6_1a8df1db-2ef3-4e2a-b8c4-e79aa5f120ac_300x300.jpg?v=1629225192, //cdn.shopify.com/s/files/1/0518/6281/products/SKIN-IMAGE-05_300x300.jpg?v=1629328081]",Mate the label,Every woman needs her crew and ours is one you can definitely count on. It's got the perfect relaxed fit and is made with 100% Organic Cotton Jersey that is so super soft. The search for the perfect crew neck tee is over.,2021-11-05,Organic Cotton Classic Tee,female,No
4,Organic Cotton Classic Tee,- 100% Organic Cotton Jersey,Charcoal,3X,$48,https://matethelabel.com/products/organic-cotton-classic-tee-charcoal,"[//cdn.shopify.com/s/files/1/0518/6281/products/764CHAR-3_6545a489-08d6-47f2-bfdb-519bc0ffd515_300x300.jpg?v=1629231222, //cdn.shopify.com/s/files/1/0518/6281/products/764CHAR-5_300x300.jpg?v=1629231239, //cdn.shopify.com/s/files/1/0518/6281/products/764CHAR-1_e77a1b4d-d284-40de-ab09-0cb668057e11_300x300.jpg?v=1629231213, //cdn.shopify.com/s/files/1/0518/6281/products/764CHAR-2_300x300.jpg?v=1629231219, //cdn.shopify.com/s/files/1/0518/6281/products/0e33079ba2774a20becff05ccf5122e0.thumbnail.0000000_small.jpg?v=1629233236, //cdn.shopify.com/s/files/1/0518/6281/products/764CHAR-4_1ab48e7e-1d3a-42f1-b1cb-0acafb90cc0e_300x300.jpg?v=1629231227, //cdn.shopify.com/s/files/1/0518/6281/products/764CHAR-6_0c736fc5-57a7-45eb-93ad-0fae78d07aa7_300x300.jpg?v=1629231242, //cdn.shopify.com/s/files/1/0518/6281/products/SKIN-IMAGE-03_300x300.jpg?v=1629328084]",Mate the label,Every woman needs her crew and ours is one you can definitely count on. It's got the perfect relaxed fit and is made with 100% Organic Cotton Jersey that is so super soft. The search for the perfect crew neck tee is over.,2021-11-05,Organic Cotton Classic Tee,female,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,Men's Fleece Sweatpant,"- 50% Organic Cotton, 50% Cotton Fleece",Heather Grey,XXL,$108,https://matethelabel.com/products/mens-fleece-sweatpant-heather-grey,"[//cdn.shopify.com/s/files/1/0518/6281/products/Heather_Casey_0099_fb31d35a-4f81-4dba-8d97-66e6a4cd73ac_300x300.jpg?v=1635174422, //cdn.shopify.com/s/files/1/0518/6281/products/Heather_Casey_0096_300x300.jpg?v=1635174428, //cdn.shopify.com/s/files/1/0518/6281/products/Heather_Casey_0114_300x300.jpg?v=1635174403, //cdn.shopify.com/s/files/1/0518/6281/products/Heather_Casey_0174_300x300.jpg?v=1635174384, //cdn.shopify.com/s/files/1/0518/6281/products/Matethelabel6172a4e3cf31d76172a4e3cf4d2.288090386172a4e3cf4d2_300x300.jpg?v=1634903290, //cdn.shopify.com/s/files/1/0518/6281/products/Heather_Casey_0172_8e2f96aa-7939-4601-8873-20e22f68d5f7_300x300.jpg?v=1635182998]",Mate the label,"Every guy’s got their stay-at-home sweats—but these will take you way beyond the couch. Our premium blend of 50% Organic Cotton and 50% Cotton Fleece is soft and luxe for a comfort level that’s completely unmatched. Plus, it’s designed with an internal drawcord so you can customize your perfect relaxed fit.",2021-11-05,Men's Fleece Sweatpant,female,No
257,Men's Fleece Sweatpant,"- 50% Organic Cotton, 50% Cotton Fleece",Jet Black,XXL,$108,https://matethelabel.com/products/mens-fleece-sweatpant-jet-black,"[//cdn.shopify.com/s/files/1/0518/6281/products/Matethelabel6172a4e3ea7b476172a4e3ea9ba.857024146172a4e3ea9ba_300x300.jpg?v=1634903294, //cdn.shopify.com/s/files/1/0518/6281/products/Black_Casey_0195_300x300.jpg?v=1635126746, //cdn.shopify.com/s/files/1/0518/6281/products/Black_Casey_0064_300x300.jpg?v=1635126765, //cdn.shopify.com/s/files/1/0518/6281/products/Black_Casey_0053_dae229ba-b8ba-4668-95c4-e9af13682165_300x300.jpg?v=1635182896, //cdn.shopify.com/s/files/1/0518/6281/products/Black_Casey_0088_c31d6567-bad3-4199-975b-9010d3529166_300x300.jpg?v=1635126818, //cdn.shopify.com/s/files/1/0518/6281/products/Garfield_Solo_0120_300x300.jpg?v=1635182531]",Mate the label,"Every guy’s got their stay-at-home sweats—but these will take you way beyond the couch. Our premium blend of 50% Organic Cotton and 50% Cotton Fleece is soft and luxe for a comfort level that’s completely unmatched. Plus, it’s designed with an internal drawcord so you can customize your perfect relaxed fit.",2021-11-05,Men's Fleece Sweatpant,female,No
258,Organic Cotton Beanie,- 100% Organic Cotton,Jet Black,OS,$48,https://matethelabel.com/products/organic-cotton-beanie-jet-black,"[//cdn.shopify.com/s/files/1/0518/6281/products/Matethelabel6172adb03685186172adb036a5b.856472626172adb036a5b_300x300.jpg?v=1634905566, //cdn.shopify.com/s/files/1/0518/6281/products/Black_Garfield_0025_300x300.jpg?v=1635553082]",Mate the label,Our cozy cuffed beanie is locally made in Los Angeles with 100% Organic Cotton yarn and designed for a universally flattering fit. This unisex style has an adjustable fold-up cuff so you can customize the size and wear it any way you like—over the ears or behind. It's the perfect accessory for cool morning walks or chilly nights out. Super soft and sustainable—win-win.,2021-11-05,Organic Cotton Beanie,female,No
259,Organic Cotton Beanie,- 100% Organic Cotton,Natural,OS,$48,https://matethelabel.com/products/organic-cotton-beanie-natural,"[//cdn.shopify.com/s/files/1/0518/6281/products/Matethelabel6172adb058ef526172adb05902c.629775846172adb05902c_300x300.jpg?v=1634905569, //cdn.shopify.com/s/files/1/0518/6281/products/Bone_Casey_0183_300x300.jpg?v=1635553098]",Mate the label,Our cozy cuffed beanie is locally made in Los Angeles with 100% Organic Cotton yarn and designed for a universally flattering fit. This unisex style has an adjustable fold-up cuff so you can customize the size and wear it any way you like—over the ears or behind. It's the perfect accessory for cool morning walks or chilly nights out. Super soft and sustainable—win-win.,2021-11-05,Organic Cotton Beanie,female,No


In [None]:
# Export the scraped table. The file is tab-separated (sep='\t') despite its .csv
# name, so readers must pass the same separator when loading it.
df.to_csv('matethelabel_table.csv', sep='\t', encoding='utf-8')


In [None]:
# Download the exported file through the browser. This relies on the google.colab
# package, so the cell presumably only works when run inside Google Colab.
from google.colab import files
files.download("matethelabel_table.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>