## Scrapping SnowFlake English Site

In [None]:
!wget --recursive --html-extension --page-requisites --convert-links "https://docs.snowflake.com/en"

## Body+URL+Metadata Extraction

In [None]:
from bs4 import BeautifulSoup
import os
import csv
import re

def extract_body_text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    footer = soup.find('footer')
    if footer:
        footer.decompose()
    nav = soup.find('nav')
    if nav:
        nav.decompose()
    language_picker = soup.find(id="language-picker")
    if language_picker:
        language_picker.decompose()
    body_text = soup.body.get_text(separator='\n', strip=True)
    return body_text

def extract_links(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Check if the first link under the head starts with "https://docs.snowflake.com"
    first_link_head = soup.head.find('link', href=re.compile('^https://docs.snowflake.com'))
    if first_link_head:
        first_href = first_link_head.get('href')
        return first_href

    # If no suitable link is found in the head, we will search the entire document
    # for the first link starting with "https://docs.snowflake.com"
    first_a_tag = soup.find('a', href=re.compile('^https://docs.snowflake.com'))
    first_href = first_a_tag.get('href') if first_a_tag else None
    return first_href


def extract_metadata(html_content, filename):
    soup = BeautifulSoup(html_content, 'html.parser')
    title = soup.title.string if soup.title else "No title found"

    # Regular expression pattern to remove "| Snowflake Documentation" or "- Snowflake Documentation" from the title
    pattern = r'\s*[|\-]\s*Snowflake\s+Documentation$'
    title = re.sub(pattern, '', title)

    link = extract_links(html_content)
    text = extract_body_text(html_content)

    metadata = {
        "URL": link,
        "Title": title.strip(),
        "Text": text,
    }
    return metadata

def parse_html_files(input_dir, output_csv):
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ["URL","Title","Text"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Recursively walk through the directory structure
        for root, dirs, files in os.walk(input_dir):
            for filename in files:
                if filename.endswith(".html"):
                    with open(os.path.join(root, filename), 'r', encoding='utf-8') as file:
                        html_content = file.read()
                        metadata = extract_metadata(html_content, filename)
                        writer.writerow(metadata)

# Example usage:
input_directory = "."  # Specify the top parent directory
output_csv_file = "preprocessed_data.csv"
parse_html_files(input_directory, output_csv_file)


# Confirmation

In [1]:
import pandas as pd

data = pd.read_csv("preprocessed_data.csv")

data.head()

Unnamed: 0,URL,Title,Text
0,https://docs.snowflake.com,Getting Started,Overview\nBefore you Begin\nLogging In\nSnowsi...
1,https://docs.snowflake.com,Guides,Overview\nConnecting to Snowflake\nVirtual war...
2,https://docs.snowflake.com,Developer,Overview\nSnowpark Library\nSnowpark API\nSnow...
3,https://docs.snowflake.com/en/reference,Reference,Overview\nSQL Data Types Reference\nSQL Comman...
4,https://docs.snowflake.com,Tutorials,Choose Category\nFeatured\nGetting Started\nDa...
