In [17]:
import requests
from bs4 import BeautifulSoup
import bs4
import json
from urllib.parse import urljoin

### Crawl a class

In [18]:
# Debug switch for quick smoke runs: when True, ClassDocCrawler.CrawlClass stops
# after the first row it records and CrawlModule stops after the first class
# link it crawls. Leave False for a full crawl.
DEBUG = False

class ClassDocCrawler:
    """Scrape the member tables of a single class documentation page.

    Each usable table row becomes a dict with the keys "Module", "HeaderPath",
    "ClassName", "Name", "Type" and "Description". Cell texts that the page
    abbreviates with a trailing "..." are resolved by following the cell's link
    and reading the linked page's title.

    Type annotations that mention bs4 are quoted so they are not evaluated when
    the class is defined.
    """

    # Seconds to wait on the documentation server before a request is abandoned;
    # without a timeout, requests.get can block forever on a stalled connection.
    RequestTimeout = 30

    def __init__(self) -> None:
        # URL of the page most recently passed to CrawlClass; relative links
        # found in table cells are resolved against it.
        self.PageUrl = None

    def CrawlClass(self, InClassDocUrl: str, data=None):
        """Download one class page and append a record per member row to `data`.

        Args:
            InClassDocUrl: Absolute URL of the class documentation page.
            data: Optional list to append to. When omitted, a fresh list is
                created for this call (the previous `data=[]` default was shared
                between calls and silently accumulated rows).

        Returns:
            The list that was appended to (`data` itself when one was passed).
        """
        if data is None:
            data = []

        self.PageUrl = InClassDocUrl
        response = requests.get(InClassDocUrl, timeout=self.RequestTimeout)
        # response.text is already decoded, so no from_encoding is passed:
        # BeautifulSoup ignores that argument for str input and only warns.
        soup = BeautifulSoup(response.text, 'html.parser')

        ClassName = None
        Title = soup.find("div", attrs={"id": "pageTitle"})
        if Title:
            # The title div wraps its <h1> in newlines, so clean the text the
            # same way the table cells are cleaned.
            ClassName = self.FilterText(Title.text)

        # The page lists "Module" / "Include" as a label paragraph followed by a
        # value paragraph. The last paragraph can never be a label because
        # nothing follows it.
        paragraphs = soup.find_all('p')
        Labelled = {"Module": None, "Include": None}
        for idx, node in enumerate(paragraphs[:-1]):
            if node.text in Labelled:
                Labelled[node.text] = paragraphs[idx + 1].text
        ModuleName = Labelled["Module"]
        IncludePath = Labelled["Include"]

        Rows = soup.find_all('tr', attrs={"class": "normal-row"})

        # The first three "normal-row" rows are skipped.
        # NOTE(review): presumably they belong to the page's reference table
        # rather than to member tables - confirm against the page layout.
        for Row in Rows[3:]:
            Description, Type, Name = self.ExtractFromRow(Row)
            if Name is None:
                continue
            data.append({
                "Module": ModuleName,
                "HeaderPath": IncludePath,
                "ClassName": ClassName,
                "Name": Name,
                "Type": Type,
                "Description": Description,
            })
            if DEBUG:
                # Debug runs keep only the first usable row.
                break

        return data

    def FilterText(self, InText: str) -> str:
        """Remove 'Â', non-breaking spaces and doubled newlines, then trim edge whitespace."""
        return InText.replace('Â', '').replace('\u00a0','').replace('\n\n','').strip('\n\u00a0\t\n\r')

    def IsShortened(self, InText: str) -> bool:
        """Return True when the text ends with "...", i.e. the page abbreviated it."""
        return InText.endswith("...")

    def FindFullNameFromUrl(self, InTag: "bs4.element.Tag") -> str:
        """Resolve an abbreviated cell by reading the title of the page it links to.

        Falls back to the cell's own filtered text when the cell has no link,
        the link has no href, or the linked page has no `h1#H1TitleId` element
        (the last case used to raise AttributeError).

        NOTE(review): ExtractFromRow also calls this for description cells, where
        the linked page's title may not be the full description - confirm intent.
        """
        FallbackText = self.FilterText(InTag.text)

        UrlTag = InTag.find('a')
        if not UrlTag:
            return FallbackText

        RelUrl = UrlTag.get('href')
        if not RelUrl:
            return FallbackText

        ResultUrl = urljoin(self.PageUrl, RelUrl)
        DetailResponse = requests.get(ResultUrl, timeout=self.RequestTimeout)
        DetailSoup = BeautifulSoup(DetailResponse.text, 'html.parser')
        Name = DetailSoup.find('h1', attrs={'id': 'H1TitleId'})
        if Name is None:
            return FallbackText
        return Name.text

    def ExtractFromRow(self, Row: "bs4.element.Tag") -> tuple:
        """Pull (Description, Type, Name) out of one table row.

        A usable row has exactly one `td.desc-cell` and exactly two
        `td.name-cell` cells (type first, name second). Any other shape yields
        `(None, None, None)` so the caller can skip the row.
        """
        Descriptions = Row.find_all("td", attrs={"class": "desc-cell"})
        Other = Row.find_all("td", attrs={"class": "name-cell"})

        if len(Descriptions) != 1 or len(Other) != 2:
            return None, None, None

        # Clean each cell; if the page abbreviated it, follow its link instead.
        Resolved = []
        for Cell in (Descriptions[0], Other[0], Other[1]):
            Text = self.FilterText(Cell.text)
            if self.IsShortened(Text):
                Text = self.FindFullNameFromUrl(Cell)
            Resolved.append(Text)

        Description, Type, Name = Resolved
        return Description, Type, Name


### Crawl the content of every class in a module

In [19]:
def CrawlModule(ModuleDocUrl : str, data = None):
    """Crawl every class linked from a module index page.

    Args:
        ModuleDocUrl: Absolute URL of the module's documentation index page.
        data: Optional list to append records to. When omitted, a fresh list is
            created for this call (the previous `data = []` default was shared
            between calls and silently accumulated rows).

    Returns:
        The list of records collected by ClassDocCrawler.CrawlClass. The list is
        also returned, unchanged, when the page has no `div#classes` section,
        so callers always receive a list.
    """
    if data is None:
        data = []

    ClassCrawler = ClassDocCrawler()

    # A timeout keeps a stalled connection from hanging the whole crawl.
    response = requests.get(ModuleDocUrl, timeout=30)
    Soup = BeautifulSoup(response.text, 'html.parser')

    ClassSection = Soup.find("div", attrs={"id" : "classes"})
    if not ClassSection:
        print(f"no class section in {ModuleDocUrl}")
        return data

    for ClassTag in ClassSection.find_all('td', attrs={"class" : "name-cell"}):
        ClassUrlTag = ClassTag.find('a')
        if not ClassUrlTag:
            print(f"not found class urltag{ClassTag}")
            continue
        # Links on the index page are relative to the module URL.
        FullClassUrl = urljoin(ModuleDocUrl, ClassUrlTag.get("href"))
        ClassCrawler.CrawlClass(FullClassUrl, data)
        if DEBUG:
            # Debug runs crawl only the first class.
            break

    return data
        
        
        

## Let's bring them together


In [20]:
# Crawl every class listed on the AIModule index page and store the rows as JSON.
# To crawl one class only, use instead:
#   Data = ClassDocCrawler().CrawlClass("https://docs.unrealengine.com/4.27/en-US/API/Runtime/AIModule/AAIController/")
Data = CrawlModule(
    "https://docs.unrealengine.com/4.27/en-US/API/Runtime/AIModule/"
)

with open('output.json', 'w') as OutputFile:
    OutputFile.write(json.dumps(Data, indent=4))



<div id="pageTitle">
<h1 id="H1TitleId">AAIController</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">ADetourCrowdAIController</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">AGridPathAIController</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIBasicCounter</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIGenericID</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIMessage</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIMessageObserver</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIMoveCompletedSignature</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIMoveRequest</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAINamedID</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIRequestID</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIResCounter</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIResourceLock</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">FAIResourcesSet</h1>
</div>
<div id="pageTitle">
<h1 id="H1TitleId">F