In [1]:
import json

import pandas as pd
from pathlib import Path

### Read in sitemaps

The loading code below works for any number of sitemap files. There are currently only 2, but this leaves room to expand the scope of the project later.

In [None]:
# Locate every saved sitemap dump under DATA_DIR and parse each JSON file.
# NOTE(review): DATA_DIR is resolved from the kernel's current working directory,
# so this presumably expects the notebook to be run from its own folder — confirm.
DATA_DIR = Path.cwd().parent.parent / "data"
# Sorting the matches keeps the sitemaps[i] positions stable between runs.
filepaths = sorted(DATA_DIR.glob("dell_sitemaps_success_*.json"))
print("Number of files:", len(filepaths))
# One parsed JSON document per file, in the same order as `filepaths`.
sitemaps = [json.loads(filepath.read_text(encoding="utf-8")) for filepath in filepaths]

In [3]:
# For each loaded sitemap file, report how many entries (keys) it has and how
# many links are stored across all of those entries combined.
for idx, sitemap in enumerate(sitemaps):
    entry_count = len(sitemap)
    link_count = sum(map(len, sitemap.values()))
    print(f"length of sitemaps[{idx}]: {entry_count:,}")
    print(f"number of links in sitemaps[{idx}]: {link_count:,}")

length of sitemaps[0]: 2
number of links in sitemaps[0]: 89,862
length of sitemaps[1]: 2,587
number of links in sitemaps[1]: 532,760


### sitemaps[0]

In [4]:
# Peek at the first 100 URLs of the last entry in sitemaps[0].
last_url_list_0 = next(reversed(sitemaps[0].values()))
last_url_list_0[:100]

['https://downloads.dell.com/manuals/all-products/esuprt_desktop/esuprt_alienware_dsk/alienware-area51-alx_owners-manual_tr-tr.pdf',
 'https://downloads.dell.com/manuals/all-products/esuprt_desktop/esuprt_alienware_dsk/alienware-area51-alx_owners-manual_ru-rs.pdf',
 'https://downloads.dell.com/manuals/all-products/esuprt_desktop/esuprt_alienware_dsk/alienware-area51-alx_owners-manual_ko-kr.pdf',
 'https://downloads.dell.com/manuals/all-products/esuprt_desktop/esuprt_alienware_dsk/alienware-area51-alx_owners-manual_zh-tw.pdf',
 'https://downloads.dell.com/manuals/all-products/esuprt_desktop/esuprt_alienware_dsk/alienware-area51-alx_owners-manual_ja-jp.pdf',
 'https://downloads.dell.com/manuals/all-products/esuprt_desktop/esuprt_alienware_dsk/alienware-area51-alx_owners-manual_zh-cn.pdf',
 'https://downloads.dell.com/manuals/all-products/esuprt_laptop/esuprt_alienware_laptops/alienware-18_setup%20guide_de-de.pdf',
 'https://downloads.dell.com/manuals/all-products/esuprt_laptop/esuprt_ali

On visual inspection, sitemaps[0] appears to contain a variety of PDF documents in many different languages, with the English documents tagged en-us.

In [5]:
# Count the URLs (not sitemaps) in sitemaps[0] that contain each English locale tag.
# The tag is kept in a variable rather than written as a quoted literal inside the
# f-string: reusing the enclosing quote character inside a replacement field only
# parses on Python 3.12+.
all_urls_0 = [url for url_list in sitemaps[0].values() for url in url_list]
for locale_tag in ("en-uk", "en-us"):
    matching = sum(locale_tag in url for url in all_urls_0)
    print(f"Number of URLs containing {locale_tag}: {matching:,}")
# The number of keys in sitemaps[0] is the number of leaf sitemaps.
print(f"Total sitemaps: {len(sitemaps[0]):,}")

Number of sitemaps with en-uk: 0
Number of sitemaps with en-us: 12,773
Total sitemaps: 2


In [17]:
# Per leaf sitemap in sitemaps[0]: how many of its URLs contain "en-us".
en_us_per_leaf = {
    leaf_name: sum(1 for url in leaf_urls if "en-us" in url)
    for leaf_name, leaf_urls in sitemaps[0].items()
}
pd.Series(en_us_per_leaf)

https://www.dell.com/downloads-dell-pdfs-sitemap-0002.xml.gz    6105
https://www.dell.com/downloads-dell-pdfs-sitemap-0001.xml.gz    6668
dtype: int64

In [18]:
# Per leaf sitemap in sitemaps[0]: how many en-us URLs point at a PDF file.
# The check is on the ".pdf" extension (case-insensitive) rather than on the
# substring "pdf", which would also match URLs that merely mention "pdf"
# somewhere in their host or path.
pd.Series({
    name: sum("en-us" in url and url.lower().endswith(".pdf") for url in url_list)
    for name, url_list in sitemaps[0].items()
})

https://www.dell.com/downloads-dell-pdfs-sitemap-0002.xml.gz    6105
https://www.dell.com/downloads-dell-pdfs-sitemap-0001.xml.gz    6668
dtype: int64

All documents in sitemaps[0] that we want to scrape are PDFs

### sitemaps[1]

In [7]:
# Show every URL of the last entry in sitemaps[1] so all locale variants are visible.
last_url_list_1 = next(reversed(sitemaps[1].values()))
last_url_list_1

['https://www.dell.com/support/home/en-us/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-uk/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-ca/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-au/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/da-dk/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/de-at/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/de-ch/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/de-de/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-hk/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-ie/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-in/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-my/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-nz/drivers/supportedos/xps-430',
 'https://www.dell.com/support/home/en-sg/drivers/supportedos/xp

sitemaps[1] appears to have a different support page in each of its leaf sitemaps, with each leaf sitemap containing many versions of the same page in different languages.

There are several different English pages, with British English ones containing en-uk

In [8]:
# Count the URLs (not sitemaps) in sitemaps[1] that contain each English locale tag.
# The tag is kept in a variable rather than written as a quoted literal inside the
# f-string: reusing the enclosing quote character inside a replacement field only
# parses on Python 3.12+.
all_urls_1 = [url for url_list in sitemaps[1].values() for url in url_list]
for locale_tag in ("en-uk", "en-us"):
    matching = sum(locale_tag in url for url in all_urls_1)
    print(f"Number of URLs containing {locale_tag}: {matching:,}")
# The number of keys in sitemaps[1] is the number of leaf sitemaps.
print(f"Total sitemaps: {len(sitemaps[1]):,}")

Number of sitemaps with en-uk: 2,624
Number of sitemaps with en-us: 2,625
Total sitemaps: 2,587


In [9]:
# Distribution of "en-uk" URL counts across the leaf sitemaps of sitemaps[1]:
# the index is the number of en-uk URLs in a leaf, the value is how many leaves
# have that count.
en_uk_per_leaf = {
    leaf_name: sum(1 for url in leaf_urls if "en-uk" in url)
    for leaf_name, leaf_urls in sitemaps[1].items()
}
pd.Series(en_uk_per_leaf).value_counts()

1    2550
2      37
Name: count, dtype: int64

Every leaf sitemap has at least 1 page in en-uk

### Combining the URLs from the sitemaps

In [10]:
# Gather the en-us URLs from sitemaps[0] and the en-uk URLs from sitemaps[1],
# report the totals, then keep a single copy of each URL.
en_us_urls_0 = [
    url
    for url_list in sitemaps[0].values()
    for url in url_list
    if "en-us" in url
]
en_uk_urls_1 = [
    url
    for url_list in sitemaps[1].values()
    for url in url_list
    if "en-uk" in url
]
candidate_urls = pd.Series(en_us_urls_0 + en_uk_urls_1, name="url")

print(f"Total URLs: {len(candidate_urls):,}")
print(f"Unique URLs: {candidate_urls.nunique():,}")

# drop_duplicates keeps the first occurrence of each URL and its original index label.
urls_to_scrape = candidate_urls.drop_duplicates()
print(f"Final URLs to scrape: {len(urls_to_scrape):,}")

Total URLs: 15,397
Unique URLs: 13,817
Final URLs to scrape: 13,817


In [16]:
# Length of the longest URL, taken directly from the per-element string lengths.
# This avoids the idxmax/.loc round trip, which would return several rows (and
# so a row count rather than a string length) if index labels were duplicated.
print(f"The longest url string contains {urls_to_scrape.str.len().max()} characters")

The longest url string contains 217 characters


In [11]:
# Write the deduplicated URLs to a CSV in DATA_DIR, without the pandas index column.
output_csv_path = DATA_DIR / "urls_to_scrape.csv"
urls_to_scrape.to_csv(output_csv_path, index=False)