In [1]:
import gzip
import re
from io import BytesIO

import pandas as pd
import requests


In [6]:

# Sitemap to download; the commented-out lines are alternative targets.
url = "https://www.flipkart.com/sitemap_v_view-browse.xml.gz"
# url = "https://www.google.com/sitemap.xml"
# url = "https://www.amazon.com/sitemap.xml"

# Browser-style User-Agent header sent with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/120.0.0.0 Safari/537.36"
}

# Download the sitemap, unpack it if needed, and load it into `df`.
# Errors are reported with a printed message rather than raised; `df` is not
# (re)assigned when fetching or parsing fails.
try:
    res = requests.get(url, headers=headers, timeout=15)
    res.raise_for_status()

    content = res.content

    # Detect gzip from the payload's magic bytes instead of the URL suffix or
    # the Content-Encoding header. requests already undoes
    # "Content-Encoding: gzip" before exposing res.content, so decompressing
    # again because of that header fails on plain XML, and a ".gz" suffix
    # check misses URLs that carry a query string.
    if content[:2] == b"\x1f\x8b":
        xml_content = gzip.decompress(content).decode("utf-8", errors="ignore")
    else:
        xml_content = content.decode("utf-8", errors="ignore")

    # Parse XML into DataFrame
    df = pd.read_xml(BytesIO(xml_content.encode("utf-8")))

    if "loc" in df.columns:
        print("Total URLs found:", len(df))
        print(df["loc"].head(10))  # first 10 URLs
    else:
        print("⚠️ 'loc' column not found in sitemap XML")

except Exception as e:
    print("❌ Error:", e)


Total URLs found: 25220
0    https://www.flipkart.com/clothing-and-accessor...
1    https://www.flipkart.com/hi/clothing-and-acces...
2    https://www.flipkart.com/clothing-and-accessor...
3    https://www.flipkart.com/hi/clothing-and-acces...
4    https://www.flipkart.com/clothing-and-accessor...
5    https://www.flipkart.com/hi/clothing-and-acces...
6    https://www.flipkart.com/clothing-and-accessor...
7    https://www.flipkart.com/hi/clothing-and-acces...
8    https://www.flipkart.com/clothing-and-accessor...
9    https://www.flipkart.com/hi/clothing-and-acces...
Name: loc, dtype: object


In [4]:
# Display the sitemap's URL column.
df.loc[:, 'loc']

0        https://www.flipkart.com/clothing-and-accessor...
1        https://www.flipkart.com/hi/clothing-and-acces...
2        https://www.flipkart.com/clothing-and-accessor...
3        https://www.flipkart.com/hi/clothing-and-acces...
4        https://www.flipkart.com/clothing-and-accessor...
                               ...                        
25215    https://www.flipkart.com/hi/televisions/huidi~...
25216    https://www.flipkart.com/televisions/full-hd~r...
25217    https://www.flipkart.com/hi/televisions/full-h...
25218    https://www.flipkart.com/televisions/ultra-hd-...
25219    https://www.flipkart.com/hi/televisions/ultra-...
Name: loc, Length: 25220, dtype: object

In [None]:
# Substrings used to pick out FAQ / help-style URLs.
keywords = ["faq", "faqs", "qna", "customer-care","Question & Answer"]

# Match case-insensitively on the raw URLs: lower-casing only the URLs would
# make the mixed-case keyword "Question & Answer" impossible to match.
# Each keyword is escaped so it is matched literally, and na=False keeps the
# mask strictly boolean if a row has no URL.
pattern = '|'.join(re.escape(kw) for kw in keywords)
mask = df['loc'].str.contains(pattern, case=False, na=False)
faq_links = df[mask]

faq_links['loc'].tolist()


In [None]:
# Rows whose lower-cased URL contains the substring "faqs".
df.loc[lambda frame: frame['loc'].str.lower().str.contains('faqs')]