In [2]:
### General imports and parameters ###
import requests
import json
import os

# Root directory for all crawled data; every module gets its own sub-folder beneath it.
base_path = "/lakehouse/default/Files/module_data"

StatementMeta(, 63548f85-f4af-4778-b512-3d192ea6a97f, 6, Finished, Available)

In [3]:
### Load mslearn modules metadata ###

# Fetch the module catalog from the Microsoft Learn catalog API.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON-decoding or KeyError failure on the lines below.
response = requests.get(
    "https://learn.microsoft.com/api/catalog/?locale=en-en&type=modules",
    timeout=60,
)
response.raise_for_status()
content = response.json()
# One metadata record per module; later cells read each record's "url" field.
modules = content["modules"]

StatementMeta(, 63548f85-f4af-4778-b512-3d192ea6a97f, 7, Finished, Available)

In [51]:
# Number of module records returned by the catalog API.
len(modules)

StatementMeta(, 63548f85-f4af-4778-b512-3d192ea6a97f, 55, Finished, Available)

3801

In [53]:
### Save metadata as json documents in OneLake ###

from glob import glob

# Persist each module's metadata record to <base_path>/<folder>/metadata.json and
# collect the URLs of the modules whose page text still has to be crawled.
module_urls = []
for module in modules:
    module_url = module["url"]
    if "/modules/" not in module_url:
        continue

    # The folder name is the first path segment following "/modules/".
    folder = module_url.split("/modules/")[1].split('/')[0]
    path = os.path.join(base_path, folder)
    os.makedirs(path, exist_ok=True)

    # Metadata that was saved on an earlier run is left untouched.
    metadata_file = os.path.join(path, "metadata.json")
    if not os.path.isfile(metadata_file):
        with open(metadata_file, 'w') as outfile:
            json.dump(module, outfile)

    # A module counts as already crawled once it has at least two .txt pages.
    saved_pages = glob(path+"/*.txt")
    if len(saved_pages) < 2:
        module_urls.append(module_url)


StatementMeta(, 63548f85-f4af-4778-b512-3d192ea6a97f, 57, Finished, Available)

In [54]:
# Number of module URLs queued for crawling (folders with fewer than two saved .txt pages).
len(module_urls)

StatementMeta(, 63548f85-f4af-4778-b512-3d192ea6a97f, 58, Finished, Available)

3325

In [66]:
### Set up webcrawler for getting module content ###

import scrapy

class ModuleSpider(scrapy.Spider):
    """Crawl Microsoft Learn modules and save the text of every unit page.

    For each module URL the spider requests the module landing page, follows
    the links found in its unit list, and writes the text of each unit page to
    ``<base_path>/<module-folder>/<unit-name>.txt``.
    """

    name = "mslearn_module"

    def __init__(self, urls, *args, **kwargs):
        """Remember the module URLs to crawl.

        Args:
            urls: Iterable of module landing-page URLs.
            *args, **kwargs: Forwarded to ``scrapy.Spider`` so that standard
                spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        self.module_urls = urls

    def start_requests(self):
        """Yield one request per module landing page."""
        for url in self.module_urls:
            yield scrapy.Request(url=url, callback=self.get_module_pages)

    def get_module_pages(self, response):
        """Yield one request per unit page linked from the module's unit list.

        Pages without a ``<ul id="unit-list">`` element are logged and skipped
        instead of aborting the callback with an IndexError.
        """
        url_base = response.url.split("?")[0]

        unit_lists = response.xpath("//ul[@id='unit-list']")
        if not unit_lists:
            self.logger.warning("No unit list found on %s", response.url)
            return

        # The leading "." keeps the query inside the unit list. A bare "//a"
        # on a sub-selector is still evaluated against the whole document and
        # would pick up every matching link on the page.
        # Selecting "@href" directly also skips anchors that have no href.
        hrefs = unit_lists[0].xpath(".//a[@data-linktype='relative-path']/@href").getall()
        for href in hrefs:
            yield scrapy.Request(url=url_base + href, callback=self.parse_page)

    def parse_page(self, response):
        """Extract the text of one unit page and save it as a .txt file.

        The folder and file name are taken from the first and second path
        segments after ``/modules/`` in the response URL (query string ignored).
        Responses whose URL does not have that shape are logged and skipped.
        """
        page_text = response.xpath("//div[@id='unit-inner-section']//text()").getall()
        # Drop the fragments that start with a newline followed by a tab.
        page_text = [line for line in page_text if not line.startswith("\n\t")]

        url_parts = response.url.split("?")[0].split("/modules/", 1)
        rel_path = url_parts[1].split('/') if len(url_parts) == 2 else []
        if len(rel_path) < 2 or not rel_path[0] or not rel_path[1]:
            self.logger.warning("Unexpected page URL, nothing saved: %s", response.url)
            return

        folder = rel_path[0]
        filename = f"{rel_path[1]}.txt"

        # The folder normally exists already, but a redirect can lead to a
        # module folder that was never created.
        target_dir = os.path.join(base_path, folder)
        os.makedirs(target_dir, exist_ok=True)

        # Explicit UTF-8 so non-ASCII page text never depends on the platform default.
        # The fragments are stored as the repr of a Python list; the cell that
        # later builds the JSON documents reads these files back verbatim.
        with open(os.path.join(target_dir, filename), 'w', encoding="utf-8") as f:
            f.write(str(page_text))
        self.log(f"Saved file {filename}")

StatementMeta(, 63548f85-f4af-4778-b512-3d192ea6a97f, 70, Finished, Available)

In [None]:
### Get text content from mslearn modules ###

from scrapy.crawler import CrawlerProcess

# Build a crawler process with a browser-like user agent, register the spider
# with the module URLs that still need crawling, and start the crawl.
process = CrawlerProcess(
    settings={'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)
process.crawl(ModuleSpider, urls=module_urls)
process.start()

In [68]:
# Stop the crawler process; the call returns a Deferred (see the cell output).
process.stop()

StatementMeta(, 63548f85-f4af-4778-b512-3d192ea6a97f, 72, Finished, Available)

<DeferredList at 0x79e8dfd9ab30 current result: []>

In [None]:
### Set up connection to Azure OpenAI ###

from notebookutils.mssparkutils.credentials import getSecret
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI

# Key Vault that holds the Azure OpenAI credentials.
KEYVAULT_ENDPOINT = "https://mslearn-bot.vault.azure.net/"

# Read the API key and the service endpoint from the vault so that neither is
# hard-coded in the notebook.
key = getSecret(KEYVAULT_ENDPOINT, "openai-api-key")
base = getSecret(KEYVAULT_ENDPOINT, "openai-api-base")

# Client shared by the embedding cells below.
client = AzureOpenAI(
    azure_endpoint=base,
    api_key=key,
    api_version="2023-07-01-preview",
)

In [None]:
### Define embedding functionality with AzureOpenAI ###

# Name of the embedding deployment passed as ``model`` to the client.
embedding_model = "text-embedding-model"


def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the deployment named by ``embedding_model`` through the
    module-level ``client`` and returns the ``embedding`` attribute of the
    first item in the response's ``data`` list.
    """
    response = client.embeddings.create(input=text, model=embedding_model)
    return response.data[0].embedding

# Smoke test: embed a short string and show the first five components.
test = get_embedding("Testing embedding function")
print(test[:5])

In [23]:
### Optional method: Define embedding functionality with SynapseML ###

# import pandas as pd
# from synapse.ml.services import OpenAIEmbedding

# embedding = (
#     OpenAIEmbedding()
#     .setDeploymentName(embedding_model)
#     .setSubscriptionKey(key)
#     .setTextCol("content")
#     .setErrorCol("error")
#     .setOutputCol("content_vector")
# )

StatementMeta(, 17a8132d-cbcb-4f6f-8632-6d31a1b3b079, 27, Finished, Available)

In [None]:
### Append module contents to json files ###

def _lesson_sort_key(filename):
    """Sort key that orders lesson files by numeric prefix, then by name.

    Assumes lesson files are named "<number>-<slug>.txt" (TODO confirm against
    the crawled data); names without a numeric prefix sort after all numbered
    ones. Comparing the whole number keeps "2-..." ahead of "10-...", which a
    first-character comparison cannot do.
    """
    prefix = filename.split("-", 1)[0]
    order = int(prefix) if prefix.isdecimal() else float("inf")
    return (order, filename)


# Module folders are the dot-free directory entries under base_path; the
# finished "<folder>.json" documents live next to them and are filtered out.
folders = [
    entry for entry in os.listdir(base_path)
    if '.' not in entry and os.path.isdir(os.path.join(base_path, entry))
]
for folder in folders:

    path = os.path.join(base_path, folder)
    if os.path.isfile(path+".json"):
        print(path, "Already processed, skipping")
        continue

    metadata_path = os.path.join(path, "metadata.json")
    if not os.path.isfile(metadata_path):
        # Without metadata there is nothing to attach the content to.
        print(path, "has no metadata.json, skipping")
        continue

    # Pick the lesson files explicitly instead of assuming that metadata.json
    # sorts last, and order them by lesson number.
    lesson_files = sorted(
        (name for name in os.listdir(path) if name.endswith(".txt")),
        key=_lesson_sort_key,
    )

    lessons = []
    for filename in lesson_files:
        with open(os.path.join(path, filename), 'r', encoding="utf-8") as doc:
            lessons.append(doc.read())

    with open(metadata_path, 'r', encoding="utf-8") as metadata:
        data = json.load(metadata)
    data["content"] = str(lessons)

    # Embedding with AzureOpenAI. This is best effort: on failure the document
    # is still written, just without "content_vector". Such a document counts
    # as processed on the next run, so delete it to retry the embedding.
    try:
        data["content_vector"] = get_embedding(data["content"])
    except Exception as exc:
        print(path, "failed:", repr(exc))

    # Embedding with SynapseML (optional alternative, kept for reference)
    # df = pd.DataFrame({"content": [str(lessons)]})
    # df = spark.createDataFrame(df)
    # df = embedding.transform(df)
    # df = df.select("content").toPandas()
    # data["content_vector"] = df["content"]

    with open(path+".json", 'w', encoding="utf-8") as outfile:
        json.dump(data, outfile)
