In [9]:
import requests
from bs4 import BeautifulSoup
import bs4
import json
from urllib.parse import urljoin
import logging
import traceback

### Crawl a class

In [10]:
# Smoke-test switch: when True, every crawl loop in this notebook breaks after
# the first item it processes (one member row, one class, one module), so a
# run touches only a handful of pages.
DEBUG = False

class ClassDocCrawler:
    """Scrape the member tables of one class page of the API documentation.

    ``CrawlClass`` downloads a class page and turns every member row into a
    dict. The other methods are helpers that clean cell text and follow a
    cell's link when the text shown in the table was cut off with "...".
    """

    # Seconds to wait for the documentation server before aborting a request
    # (requests waits indefinitely when no timeout is given).
    RequestTimeout = 30

    def __init__(self) -> None:
        # URL of the page most recently given to CrawlClass; relative links
        # found in that page's rows are resolved against it.
        self.PageUrl = None

    def CrawlClass(self, InClassDocUrl: str, data=None):
        """Append one record per member row of a class page to ``data``.

        Args:
            InClassDocUrl: Absolute URL of the class documentation page.
            data: List to append to. A fresh list is created when omitted, so
                separate calls never share results through the default value.

        Returns:
            ``data`` (the supplied list, or the newly created one). Every
            record has the keys "Module", "HeaderPath", "ClassName", "Name",
            "Type" and "Description".
        """
        if data is None:
            data = []
        if not InClassDocUrl:
            logging.warning(f"In class doc url invalid {InClassDocUrl}")
            return data

        self.PageUrl = InClassDocUrl
        response = requests.get(InClassDocUrl, timeout=self.RequestTimeout)
        # response.text is already a decoded str; BeautifulSoup ignores
        # from_encoding for str input (and warns), so it is not passed.
        soup = BeautifulSoup(response.text, 'html.parser')

        ClassName = None
        Title = soup.find("div", attrs={"id": "pageTitle"})
        if Title:
            ClassName = Title.text

        # A <p> whose text is exactly "Module" / "Include" is treated as a
        # label; the <p> that follows it holds the value. If a label occurs
        # more than once the last occurrence wins.
        ModuleName, IncludePath = None, None
        paragraphs = soup.find_all('p')
        for Label, Value in zip(paragraphs, paragraphs[1:]):
            if Label.text == "Module":
                ModuleName = Value.text
            elif Label.text == "Include":
                IncludePath = Value.text

        Rows = soup.find_all('tr', attrs={"class": "normal-row"})

        # NOTE(review): the first three "normal-row" rows are skipped;
        # presumably they are not member rows -- confirm against the page layout.
        for Row in Rows[3:]:
            Description, Type, Name = self.ExtractFromRow(Row)
            if Name is None:
                # Row did not have the expected description/type/name cells.
                continue
            data.append({
                "Module": ModuleName,
                "HeaderPath": IncludePath,
                "ClassName": ClassName,
                "Name": Name,
                "Type": Type,
                "Description": Description,
            })
            if DEBUG:
                break

        return data

    def FilterText(self, InText: str) -> str:
        """Return ``InText`` with 'Â', non-breaking spaces and blank lines removed.

        'Â' is the character a UTF-8 non-breaking space starts with when the
        bytes are decoded as Latin-1, so it is dropped together with U+00A0.
        Leading and trailing newlines, tabs, carriage returns and
        non-breaking spaces are stripped as well.
        """
        return InText.replace('Â', '').replace('\u00a0', '').replace('\n\n', '').strip('\n\u00a0\t\n\r')

    def IsShortened(self, InText: str) -> bool:
        """Return True when ``InText`` ends with "...", i.e. was truncated."""
        return InText.endswith("...")

    def FindFullNameFromUrl(self, InTag: "bs4.element.Tag") -> str:
        """Return the full text for a truncated cell by following its link.

        The first ``<a>`` inside ``InTag`` is resolved against ``self.PageUrl``
        and fetched; the text of the ``<h1 id="H1TitleId">`` element on that
        page is returned. When the cell has no usable link, or the target
        page has no such heading, the cell's own text is returned instead.
        Every return value is passed through ``FilterText``.
        """
        UrlTag = InTag.find('a')
        RelUrl = UrlTag.get('href') if UrlTag else None
        if not RelUrl:
            # Without an href, urljoin would hand back the current page URL
            # and the page would be fetched again for nothing.
            return self.FilterText(InTag.text)

        ResultUrl = urljoin(self.PageUrl, RelUrl)
        DetailResponse = requests.get(ResultUrl, timeout=self.RequestTimeout)
        DetailSoup = BeautifulSoup(DetailResponse.text, 'html.parser')
        Name = DetailSoup.find('h1', attrs={'id': 'H1TitleId'})
        if not Name:
            return self.FilterText(InTag.text)
        return self.FilterText(Name.text)

    def ExtractFromRow(self, Row: "bs4.element.Tag") -> tuple:
        """Extract ``(Description, Type, Name)`` from one table row.

        A usable row has exactly one ``td.desc-cell`` and exactly two
        ``td.name-cell`` cells (type first, name second). Any other layout
        yields ``(None, None, None)``. Values that end with "..." are
        replaced by the full text found behind the cell's link.
        """
        Descriptions = Row.find_all("td", attrs={"class": "desc-cell"})
        Other = Row.find_all("td", attrs={"class": "name-cell"})

        if len(Descriptions) != 1 or len(Other) != 2:
            return None, None, None

        DescriptionCell = Descriptions[0]
        TypeCell, NameCell = Other

        Description = self.FilterText(DescriptionCell.text)
        Type = self.FilterText(TypeCell.text)
        Name = self.FilterText(NameCell.text)

        if self.IsShortened(Description):
            Description = self.FindFullNameFromUrl(DescriptionCell)
        if self.IsShortened(Type):
            Type = self.FindFullNameFromUrl(TypeCell)
        if self.IsShortened(Name):
            Name = self.FindFullNameFromUrl(NameCell)

        return Description, Type, Name


### Crawl every class in a module

In [11]:
def CrawlModule(ModuleDocUrl : str, data = None):
    """Crawl every class linked from a module page and collect its members.

    The module page is downloaded, the ``<div id="classes">`` section is
    located, and each ``td.name-cell`` link inside it is handed to
    ``ClassDocCrawler.CrawlClass``.

    Args:
        ModuleDocUrl: Absolute URL of the module documentation page.
        data: List the class records are appended to. A fresh list is
            created when omitted, so calls never share a default list.

    Returns:
        ``data`` (the supplied list, or the newly created one). It is
        returned unchanged when the page has no class section.
    """
    if data is None:
        data = []
    ClassCrawler = ClassDocCrawler()

    # Bound the wait so one stalled page cannot hang the whole crawl.
    response = requests.get(ModuleDocUrl, timeout=30)
    Soup = BeautifulSoup(response.text, 'html.parser')

    ClassSection = Soup.find("div", attrs={"id" : "classes"})
    if not ClassSection:
        print(f"no class section in {ModuleDocUrl}")
        return data

    for ClassTag in ClassSection.find_all('td', attrs={"class" : "name-cell"}):
        ClassUrlTag = ClassTag.find('a')
        if not ClassUrlTag:
            print(f"not found class urltag{ClassTag}")
            continue
        RelClassDocUrl = ClassUrlTag.get("href")
        if not RelClassDocUrl:
            # urljoin would return the module URL itself for an empty href,
            # and the module page would then be parsed as a class page.
            print(f"not found class urltag{ClassTag}")
            continue
        FullClassUrl = urljoin(ModuleDocUrl, RelClassDocUrl)
        ClassCrawler.CrawlClass(FullClassUrl, data)
        if DEBUG:
            break

    return data
        
        
        

### Crawl every module


In [12]:
# True once the module named by ``continueAt`` has been reached during the
# current CrawlEveryModule call; reset at the start of every call.
StartCrawlFlag = False

def CrawlEveryModule(DocUrl="https://docs.unrealengine.com/4.27/en-US/API/", data = None, continueAt = None):
    """Crawl every module listed on the API index page.

    Args:
        DocUrl: URL of the API index page that contains the
            ``div.modules-list`` tables.
        data: List the records are appended to. A fresh list is created when
            omitted, so successive calls do not accumulate into one list.
        continueAt: Optional module name. Modules listed before it are
            skipped (a "skip ..." line is printed for each); crawling starts
            with the module whose cell text, stripped of newlines, equals
            this value.

    Returns:
        ``data``. If an exception or a keyboard interrupt occurs while
        crawling, the traceback is printed and whatever was collected so far
        is returned, so that partial results can still be saved.
    """
    global StartCrawlFlag
    # Reset per call: a flag left True by an earlier call would otherwise
    # make a later call with a different continueAt skip nothing.
    StartCrawlFlag = False
    if data is None:
        data = []

    # Bound the wait so a stalled index page cannot hang the notebook.
    Response = requests.get(DocUrl, timeout=30)
    Soup = BeautifulSoup(Response.text, "html.parser")
    ModulelistTags = Soup.find_all("div", attrs={"class" : "modules-list"})
    try:
        for Modulelist in ModulelistTags:
            UrlTags = Modulelist.find_all("td", attrs={"class" : "name-cell"})
            for UrlTag in UrlTags:
                if continueAt:
                    ModuleName = (UrlTag.text).strip("\n")
                    if continueAt == ModuleName:
                        StartCrawlFlag = True
                    if not StartCrawlFlag:
                        print(f"skip {ModuleName}")
                        continue
                RelUrlTag = UrlTag.find('a')
                if not RelUrlTag:
                    continue
                RelUrl = RelUrlTag.get("href")
                if not RelUrl:
                    print(UrlTag)
                    continue
                ModuleUrl = urljoin(DocUrl, RelUrl)
                print(ModuleUrl,RelUrl)
                CrawlModule(ModuleUrl, data)
                if DEBUG:
                    # Only leaves the inner loop; the next modules-list table
                    # still gets its first module crawled.
                    break
        return data
    except (Exception, KeyboardInterrupt):
        # Deliberately broad: a failed page or a manual interrupt must not
        # discard what has been collected. SystemExit still propagates.
        traceback.print_exc()
        return data
            
            
            
    

In [13]:
# NewData = CrawlEveryModule(continueAt="Engine")


# with open('output2.json', 'w') as f:
#     json.dump(NewData, f, indent=4)

In [14]:
# NewData = CrawlEveryModule(continueAt="Sequencer")
# with open('output3.json', 'w') as f:
#     json.dump(NewData, f, indent=4)

In [15]:
# NewData = CrawlEveryModule(continueAt="DisplayCluster")

# with open('output4.json', 'w') as f:
#     json.dump(NewData, f, indent=4)

In [16]:
# Resume the crawl at the "DynamicMesh" module, then persist everything that
# was collected as pretty-printed JSON.
NewData = CrawlEveryModule(
    continueAt="DynamicMesh",
)

with open('output5.json', 'w') as f:
    f.write(json.dumps(NewData, indent=4))

skip Advertising
skip AIModule
skip Analytics
skip AnalyticsET
skip AnalyticsVisualEditing
skip AnimationCore
skip AnimGraphRuntime
skip AppFramework
skip ApplicationCore
skip AssetRegistry
skip AudioAnalyzer
skip AudioCaptureCore
skip AudioCaptureRtAudio
skip AudioExtensions
skip AudioMixer
skip AudioMixerCore
skip AudioPlatformConfiguration
skip AugmentedReality
skip AutomationMessages
skip AutomationWorker
skip AVEncoder
skip AVIWriter
skip BlueprintRuntime
skip BuildPatchServices
skip BuildSettings
skip Cbor
skip CEF3Utils
skip Chaos
skip ChaosCore
skip ChaosSolverEngine
skip ChaosVehiclesCore
skip ChaosVehiclesEngine
skip CinematicCamera
skip ClothingSystemRuntimeCommon
skip ClothingSystemRuntimeInterface
skip ClothingSystemRuntimeNv
skip CookedIterativeFile
skip Core
skip CoreUObject
skip CrunchCompression
skip D3D12RHI
skip DatasmithCore
skip DeveloperSettings
skip DirectLink
skip Engine
skip EngineMessages
skip EngineSettings
skip EyeTracker
skip FieldSystemEngine
skip Foliage


## Let's bring them together


In [17]:
# Data = CrawlEveryModule()


# with open('output.json', 'w') as f:
#     json.dump(Data, f, indent=4)