In [144]:
import os
import time
import pickle
import shutil
import helium as hel
import regex
import chromedriver_autoinstaller
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver import ChromeOptions
from secrets import * # user-made secrets.py which contains email and password variables

In [145]:
def pklSave(contentToBeSaved, fullPath):
    """
    Pickles `contentToBeSaved` into the file at `fullPath`.

    The file is opened in binary write mode, so an existing file at that path is overwritten.
    """
    with open(fullPath, 'wb') as pickleFile:
        pickle.Pickler(pickleFile).dump(contentToBeSaved)

def pklLoad(fullPath):
    """
    Returns the object unpickled from the file at `fullPath`.

    Any error raised while opening or unpickling the file is propagated to the caller.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files you trust
    with open(fullPath, 'rb') as pickleFile:
        return pickle.Unpickler(pickleFile).load()

def pklForceLoad(path, dtype = 'dict'):
    """
    Loads the pickled content at `path`; if loading fails for any reason, (re)creates the file with an
    empty container and returns that container instead.

    dtype: 'list' makes the fallback an empty list, anything else makes it an empty dict.
    """
    try:
        return pklLoad(path)
    except Exception:
        # loading failed (e.g. missing or unreadable file): overwrite it with an empty container
        emptyContainer = [] if dtype == 'list' else {}
        pklSave(emptyContainer, path)
        return emptyContainer

# more about naming standards for path components here: https://stackoverflow.com/questions/2235173/what-is-the-naming-standard-for-path-components
def joinPaths(baseDirectory, relativePath):
    """Joins `relativePath` onto `baseDirectory` and returns the normalised result (redundant separators and '.'/'..' collapsed)."""
    combinedPath = os.path.join(baseDirectory, relativePath)
    return os.path.normpath(combinedPath)

def changeSep(path, newSep, oldSep=os.path.sep):
    """
    Returns `path` normalised and with its separators replaced by `newSep`.

    path:   the path to convert (converted to text with str()).
    newSep: separator used in the returned string.
    oldSep: separator currently used inside `path`; defaults to the operating system separator.
            It used to be ignored; it is now translated to the OS separator first so that
            normpath() recognises the path components before they are rewritten with `newSep`.
    """
    # source: https://stackoverflow.com/questions/18707338/print-raw-string-from-variable-not-getting-the-answers#:~:text=To%20turn%20a%20variable%20to%20raw%20str%2C%20just%20use
    pathText = str(path)
    if oldSep and oldSep != os.path.sep:  # an empty oldSep would make replace() insert a separator between every character
        pathText = pathText.replace(oldSep, os.path.sep)
    return os.path.normpath(pathText).replace(os.path.sep, newSep) # source: https://mail.python.org/pipermail/tutor/2011-July/084788.html

# Credits (source): https://medium.com/cubemail88/automatically-download-chromedriver-for-selenium-aaf2e3fd9d81
def updateChromeDriver():
    """
    Downloads a chromedriver through webdriver_manager and moves it into Helium's Windows webdriver folder.

    The destination is built from the download path: everything before '.wdm' is kept as the root and the
    '.conda' site-packages folder of helium is appended, so this only works for a Windows-style path
    (backslash separated) that contains '.wdm'. The relative '.wdm/' folder is deleted afterwards.
    Your chrome browser should be updated to the latest version, since the newest driver is installed.
    """
    # webdriver_manager settings; presumably silence its logging and keep its cache in a project-local .wdm folder (TODO confirm)
    os.environ.update({'WDM_LOG_LEVEL': '0', 'WDM_LOCAL': '1'})

    downloadedDriverPath = ChromeDriverManager().install()
    rootPrefix = downloadedDriverPath[:downloadedDriverPath.find('.wdm')]
    driverFileName = downloadedDriverPath.split('\\')[-1]
    heliumDriverDir = r'.conda\Lib\site-packages\helium\_impl\webdrivers\windows'
    heliumDriverPath = rootPrefix + heliumDriverDir + '\\' + driverFileName

    # moves the installed chromedriver from the webdriver_manager cache into helium's folder
    os.replace(downloadedDriverPath, heliumDriverPath)
    # removes the now unneeded cache folder
    shutil.rmtree('.wdm/')

In [146]:
# automatically updates chromedriver to latest version that matches browser version
# NOTE(review): the return value is kept as chromeDriverFilePath; presumably it is the path of the installed
# driver binary - confirm against the chromedriver_autoinstaller documentation.
chromeDriverFilePath = chromedriver_autoinstaller.install()

In [147]:
# absolute path of the "downloads" folder (resolved against the current working directory)
downloadLocFullPath = os.path.abspath("downloads")

# Chrome profile preferences that control how downloads behave
chromePrefs = {
    "download.default_directory": downloadLocFullPath,  # Change default directory for downloads
    "download.prompt_for_download": False,  # To auto download the file
    "download.directory_upgrade": True,
    "plugins.always_open_pdf_externally": True,  # It will not show PDF directly in chrome
}

options = ChromeOptions()
options.add_argument('--disable-notifications')
options.add_experimental_option('prefs', chromePrefs)

# starts a visible (non-headless) Chrome window through helium; `browser` is used by the cells below
browser = hel.start_chrome(options=options, headless=False)

In [148]:
# method to get the downloaded file name, source: https://stackoverflow.com/questions/34548041/selenium-give-file-name-when-downloading#:~:text=Here%20is%20another%20simple%20solution%2C%20where%20you%20can%20wait%20until%20the%20download%20completed%20and%20then%20get%20the%20downloaded%20file%20name%20from%20chrome%20downloads.
def getDownLoadedFileName(waitTime, pollInterval=3):
    """
    Waits for the first download listed in chrome://downloads to finish and returns its file name.

    Switches the global `browser` to its last tab, opens chrome://downloads there and polls the page.
    A download counts as finished as soon as the element with id 'pauseOrResume' can no longer be found.

    Parameters:
        waitTime: maximum number of seconds to keep polling.
        pollInterval: seconds slept before every poll (defaults to the previously hard-coded 3).

    Returns:
        the text of the first download item's file link, or '' if `waitTime` elapsed while the
        'pauseOrResume' element was still present.

    Raises:
        whatever `browser.execute_script` raises when the downloads page has no download item to read.
    """
    fileNameScript = "return document.querySelector('downloads-manager').shadowRoot.querySelector('#downloadsList downloads-item').shadowRoot.querySelector('div#content  #file-link').text"

    # switch to new tab
    browser.switch_to.window(browser.window_handles[-1])
    # navigate to chrome downloads
    browser.get('chrome://downloads')
    # define the endTime
    endTime = time.time() + waitTime
    while True:
        time.sleep(pollInterval)
        try:
            # NOTE(review): the script above has to go through .shadowRoot to reach the download item, so this
            # plain id lookup may never find the button and the name may be returned after the first poll - confirm.
            browser.find_element_by_id('pauseOrResume')
        except Exception:
            # `except Exception` (not a bare except) so that interrupting the kernel is not mistaken for a finished download
            # return the file name once the download is completed
            return browser.execute_script(fileNameScript)
        if time.time() > endTime:
            return ''

def moveFile(fileName, source, dest):
    """
    Moves `fileName` from the `source` folder into the `dest` folder.

    Nothing happens (and None is returned) when the file is missing from `source` or a file with the
    same name already exists in `dest`, so an existing file is never overwritten.
    """
    # source: https://www.geeksforgeeks.org/python-move-and-overwrite-files-and-folders/
    sourceFilePath = joinPaths(source, fileName)
    destFilePath = joinPaths(dest, fileName)
    if os.path.isfile(sourceFilePath) and not os.path.isfile(destFilePath):
        shutil.move(sourceFilePath, dest)

def get_modules_content(faculty = 'Informatics and Computer Science', moduleName='all', signedIn=False, baseRelativePath='downloads', minsTillDownloadIsCancelled = 1, secsTillCardIsSkipped = 30):
    """
    Downloads the material of one or all e-learning modules into
    '<baseRelativePath>/<module name>/<card name>' folders.

    Flow: opens learn1 or learn2.bue.edu.eg (picked from `faculty`), logs in unless `signedIn`, collects the
    dashboard module cards whose name matches `moduleName`, then for every tile of every matching module
    collects the tile's 'resource/view' links, opens each one in the global `browser` and moves the
    downloaded file into the card folder.

    Parameters:
        faculty: faculty name (or part of it, case-insensitive); a match against the learn1 faculty list
            selects learn1, anything else selects learn2.
        moduleName: 'all', or a case-insensitive fragment of the module name(s) to download.
        signedIn: when False the login form is filled with the `email` (part before '@') and `password`
            variables that come from the user-made secrets.py.
        baseRelativePath: folder that receives the module folders and from which downloaded files are moved.
        minsTillDownloadIsCancelled: minutes to wait for each download before giving up on it.
        secsTillCardIsSkipped: seconds to wait for a card's 'section-N' element before skipping that card
            (the wait used to be unbounded).

    Returns:
        None.
    """
    def _toFolderName(rawName):
        # '&' and ':' are replaced before the text is used as a folder name; title() --> makes first letter of each word uppercase
        return rawName.replace('&', 'and').replace(':', ',').title()

    # chooses learn1 or learn2.bue.edu.eg based on your chosen faculty (learn2 unless the faculty matches the list below)
    firstDomainFaculties = [facultyName.lower() for facultyName in ('Informatics and Computer Science (ICS)', 'Arts & Design', 'Energy & Environmental Engineering')]
    # alternatively, you can copy code in debugging cell 1 and paste it here to dynamically get domain faculties in  learn1.bue.edu.eg
    requestedFaculty = faculty.lower()
    elearningDomain = 1 if any(requestedFaculty in facName for facName in firstDomainFaculties) else 2

    browser.get(f'https://learn{elearningDomain}.bue.edu.eg/')

    # ... signs you in if you're not signed in :D
    if not signedIn:
        # split() keeps the whole value when there is no '@' (find() would return -1 and chop the last character)
        hel.write(email.split('@')[0], into='Username')
        hel.write(password, into='Password')
        hel.click('Log in')
    time.sleep(5)

    # retrieving all or 1 module link(s) and name(s) based on moduleName parameter
    moduleLinks = []
    moduleNames = []
    moduleCards = browser.find_elements_by_xpath("//div[@class='card dashboard-card']")
    for divElem in moduleCards: # go through the modules (needs the modules to be in cards layout)
        aElem = divElem.find_element_by_xpath(".//descendant::a[2]") # the leading '.' keeps the search inside this card
        curModuleName = aElem.find_element_by_xpath('(.//child::span)[last()]').text # last <span> child of the link
        if moduleName == 'all' or moduleName.lower() in curModuleName.lower():
            moduleLinks.append(aElem.get_attribute('href'))
            moduleNames.append(_toFolderName(curModuleName))

    # doing the comments below for 1 or all modules (depending on moduleName parameter)
    for moduleLink, moduleFolderName in zip(moduleLinks, moduleNames):
        # creating module folder
        modulePath = joinPaths(baseRelativePath, moduleFolderName)
        os.makedirs(modulePath, exist_ok=True)

        # downloading module content
        browser.get(moduleLink)
        time.sleep(3) # waiting for the page to render
        tileUrls = []
        while True: # breaks when id of 'tile-N' is not found in the page (i.e., all module cards are checked)
            try:
                tileElem = browser.find_element_by_id(f'tile-{len(tileUrls) + 1}') # tile == card, but is closed
                tileUrls.append(tileElem.find_element_by_xpath('.//child::a[1]').get_attribute('href'))
            except Exception:
                break

        for sectionNumber, tileUrl in enumerate(tileUrls, start=1):
            # creating module card folder with folder name same as module card name
            browser.get(tileUrl)
            time.sleep(4)
            cardName = _toFolderName(browser.find_element_by_id(f'sectiontitle{sectionNumber}').text)
            moduleCardPath = joinPaths(modulePath, cardName)
            os.makedirs(moduleCardPath, exist_ok=True)

            # getting links (i.e., URLs) for module card contents
            cardElem = None
            giveUpAt = time.time() + secsTillCardIsSkipped
            while cardElem is None: # stops when the tile is rendered as an open card, or when the deadline passes
                try:
                    cardElem = browser.find_element_by_id(f'section-{sectionNumber}')
                except Exception:
                    if time.time() > giveUpAt:
                        break
                    time.sleep(0.5) # short pause so the retry loop does not hammer the driver
            if cardElem is None:
                print(f"Skipping card '{cardName}': 'section-{sectionNumber}' did not appear within {secsTillCardIsSkipped} seconds")
                continue

            # a card can list the same resource twice (with and without '&redirect=1'); keep one link per
            # resource, preferring the redirect variant, so the same file is not downloaded twice
            linksByResource = {}
            for aElem in cardElem.find_elements_by_tag_name('a'):
                materialLink = aElem.get_attribute('href')
                if materialLink is None or 'resource/view' not in materialLink:
                    continue
                resourceKey = materialLink.replace('&redirect=1', '')
                if resourceKey not in linksByResource or 'redirect=1' in materialLink:
                    linksByResource[resourceKey] = materialLink
            cardMaterialLinks = list(linksByResource.values())

            # downloading module card content
            openDownloadTabOnce = True
            for materialLink in cardMaterialLinks: # downloads to baseRelativePath, then moves file to moduleName/moduleCardName
                browser.get(materialLink) # downloads, due to setting certain prefs in add_experimental_option()
                if openDownloadTabOnce:
                    # extra tab that getDownLoadedFileName() switches to for chrome://downloads (opened once per card)
                    browser.execute_script("window.open()")
                    openDownloadTabOnce = False
                # gets name of downloaded material file (waits for a maximum of minsTillDownloadIsCancelled minutes, so you should have a fast internet connection)
                materialName = getDownLoadedFileName(minsTillDownloadIsCancelled*60)
                if materialName != '':
                    moveFile(materialName, baseRelativePath, moduleCardPath)

            if browser.current_url != tileUrl:
                browser.get(tileUrl) # going back to this tile's page

        
    #    # draft code to download after link renders from .php to .pdf, etc
    #     prevLink = browser.current_url
    #     browser.get(link)
    #     timeout = time.time() + 60*3   # 3 minutes from now
    #     while (prevLink == browser.current_url): # wait until link converts to a file (.pdf, .ppt, etc)
    #         if time.time() > timeout:
    #             break
    #     ... # code to download file
        


In [149]:
# Example run: logs in, then downloads every module whose name contains 'natural';
# 'informatics' is the faculty fragment used to pick between learn1 and learn2.
# The output recorded below shows this particular run ending in a NoSuchWindowException.
get_modules_content(faculty='informatics', moduleName='natural', signedIn=False)

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=109.0.5414.120)


In [8]:
# leftover debugging check: looks up the element with id 'section-1' on the page that is currently open
# in `browser` and displays it (the global `browser` must already be on a module tile page for this to succeed)
cardElem = browser.find_element_by_id(f'section-{1}')
cardElem

<selenium.webdriver.remote.webelement.WebElement (session="c12b280bd0d8c7fe70fac0f294592ea1", element="80b13954-6aa0-421c-89ef-ebb2a5ff0316")>

In [119]:
# debugging cell 1:
# sources:
# for pElem and parent lines: https://stackoverflow.com/questions/69922696/get-the-class-name-of-the-immediate-parent-of-element-with-inner-text-using-sele#:~:text=element%20%3D%20driver.find_element_by_xpath,tag_class%7D%22)
# for children line: https://www.educba.com/xpath-descendant/#:~:text=Descendant%20is%20declared%20as%20%E2%80%98%20//%20%E2%80%98

# browser.get('https://learn1.bue.edu.eg/')
# keyword = 'Dear Student'
# pElem = browser.find_element_by_xpath(f"//*[contains(text(),'{keyword}')]") # gets <p> tag that contains {keyword}
# parent = pElem.find_element_by_xpath("./parent::*") # gets that <p>'s parent
# children = parent.find_elements_by_xpath('//descendant::p') # gets that parent's children that are of tag <p> (which includes the initially found <p> tag)

# faculties = [name.strip() for name in parent.get_attribute('textContent').split('\n')] # 'textContent' is used instead of 'innerHTML' as it gets the actual text inside the html tags & strip() is used to remove leading and trailing whitespace
# faculties

# Alternative:
# for pElem in children:
#     print(pElem.get_attribute('textContent').strip()) # 


['Dear Student,',
 'Kindly Login If you belong to one of the following faculties:',
 'Engineering',
 'Energy & Environmental Engineering',
 'Informatics and Computer Science',
 'Arts & Design']

In [None]:
# debugging cell 2:
# links = []
# for divElem in browser.find_elements_by_xpath("//div[@class='card dashboard-card']"): # go through the modules (needs the modules to be in cards layout)
#     module = 'all'
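#     # print(divElem.find_element_by_xpath("//child::a").get_attribute('innerHTML')) # this returns the first <a> tag of the whole document, not the child <a> tag of divElem, because an XPath that starts with '//' is evaluated from the document root (use './/' to search relative to divElem)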
#     aElem = divElem.find_element_by_tag_name('a') # so we'll use find_element_by_tag_name() instead: 
#     if module == 'all' or module in aElem.find_element_by_xpath('//child::span[3]').text: # and now xpath is working again...
#         links.append(aElem.get_attribute('href'))

In [140]:
# debugging cell 3
# cardElem = browser.find_element_by_id(f'section-1')
# cardMaterialLinks = []
# for aElem in cardElem.find_elements_by_tag_name('a'):
#     materialLink = aElem.get_attribute('href')
#     if materialLink is not None and 'resource/view' in materialLink:
#         cardMaterialLinks.append(materialLink)
# cardMaterialLinks

['https://learn1.bue.edu.eg/mod/resource/view.php?id=143605&redirect=1',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143664',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143664&redirect=1',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143607',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143607&redirect=1',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143609&redirect=1',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143778&redirect=1',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143779&redirect=1',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143610&redirect=1',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143613',
 'https://learn1.bue.edu.eg/mod/resource/view.php?id=143613&redirect=1']

In [141]:
# debugging cell 4
# for materialLink in cardMaterialLinks:
#     browser.get(materialLink)

In [None]:
    browser.get(courseUrl)
    links = browser.find_elements_by_tag_name("a")
    courseUrls = []
    for link in links:
        url = link.get_attribute('href')
        if (url is not None) and ('learn/' in url) and (url not in courseUrls):
            courseUrls.append(url)
    return courseUrls