# In [None]:
"""
MUST READ
---------
1. In 'constants.py', set DOWNLOAD_DIRECTORY to your own download directory and adjust any other configuration settings to your needs.
2. Create a '.secret' file and fill it with the required information (see the note below).

Note
----
.secret should contain

    0 -> username,
    1 -> password,
    2 -> Chrome WebDriver (chromedriver) location (only needed when running on Windows; on Linux just install the chromedriver package)

Acknowledgement
---------------
Thanks -> pythonjar, MariyaSha (YouTuber) and some other stackoverflow members...

"""

import time
import json
from json.decoder import JSONDecodeError
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC

from constants import (
    ENCODING,
    FILE_URL,
    NAME_XPATH,
    TYPE_DATE_XPATH,
    TARGET_FILE_TYPE,
    DOWNLOAD_DIRECTORY,
    OPTION_BUTTON_XPATH,
    POST_PERMALINK_XPATH,
    TRACKER_FILE_LOCATION,
    DOWNLOAD_BUTTON_XPATH,
)
from helperFunctions import (
    login,
    updateLog,
    waitNSeconds,
    scrollNTimes,
    downloadFile,
    normalizeData,
    appendFilesInfo,
    insertFoundFiles,
    initializeWebpage,
    checkDownloadStatus,
    initializeWebDriver,
    waitToFinishDownload,
    getExistingFilesInfo,
)


# Create the Selenium driver together with the wait object that the rest of
# the notebook passes around.
driver, web_driver_wait = initializeWebDriver()

# Log in using the freshly created driver/wait pair.
login(driver, web_driver_wait)

# You should be logged in by now!
#
# Note
# ----
# If 2FA is enabled for this account, you have to authorize the login first
# and then run the remaining cells manually.

# Hand FILE_URL to the page-initialisation helper before any scraping starts.
initializeWebpage(driver, FILE_URL)


# In [None]:
# First load 35 * 15 = 525 files
scroll_count = scrollNTimes(driver)

# Offsets into 'file_types_and_dates', which holds a file-type entry followed
# by a date entry for every file (type at j, date at k).
j, k = 0, 1

# Id of the last file recorded in the tracker file; stays 0 when the tracker
# is empty or cannot be interpreted.
registered_files_count = 0

# Refresh existing files information every time this cell loads
downloaded_file_list, registered_file_list = getExistingFilesInfo()

# Total files registered in the tracker file (registered_files.json)
if registered_file_list:
    with open(TRACKER_FILE_LOCATION, "r", encoding=ENCODING) as f:
        try:
            # Last registered file id
            registered_files_count = int(json.load(f)["files"][-1]["id"])
        except (JSONDecodeError, KeyError, IndexError, TypeError, ValueError) as e:
            # The tracker may be syntactically broken (JSONDecodeError) or be
            # valid JSON with an unexpected shape: no "files" key, an empty
            # list, a missing or non-numeric "id". Fall back to 0 in every
            # such case instead of crashing the cell, but leave a trace.
            updateLog(
                "\nWarning: could not read the last registered id from the tracker file ({!r}), starting from 0".format(
                    e
                )
            )


# Grabbing the initially loaded targets
#
# All four lists default to empty so that, if the lookup times out, the main
# loop below simply has nothing to iterate over instead of failing with a
# NameError on an unbound list.
file_option_buttons = []
file_names = []
file_types_and_dates = []
post_permalink = []

try:
    start_time = time.time()

    file_option_buttons = web_driver_wait.until(
        EC.presence_of_all_elements_located((By.XPATH, OPTION_BUTTON_XPATH))
    )
    file_names = web_driver_wait.until(
        EC.presence_of_all_elements_located((By.XPATH, NAME_XPATH))
    )
    file_types_and_dates = web_driver_wait.until(
        EC.presence_of_all_elements_located((By.XPATH, TYPE_DATE_XPATH))
    )

    # This is the unique identifier of the post which contains files as attachment(s)
    post_permalink = web_driver_wait.until(
        EC.presence_of_all_elements_located((By.XPATH, POST_PERMALINK_XPATH))
    )

    end_time = time.time()
    updateLog(
        "\nFound {} files in {}s".format(
            len(file_option_buttons), round(end_time - start_time, 2)
        )
    )
except TimeoutException:
    # A timeout can hit after some of the lists were already filled; reset all
    # of them so the main loop never works with lists of mismatched length.
    file_option_buttons = []
    file_names = []
    file_types_and_dates = []
    post_permalink = []
    updateLog("Error: XPATH's identifier (class names) has changed again! 🤯")


# ---------------------------------------------------------------------------
# MAIN LOOP
#
# Walks over every file's option button, skips files that are not of the
# target type or are already downloaded, downloads the rest one at a time and
# records each finished download in the tracker file.
# ---------------------------------------------------------------------------
for idx, button in enumerate(file_option_buttons):
    # Reset per iteration so the error handler below never touches an unbound
    # name (failure on the first file) or reports the previous file's values.
    _name = _date = "<unknown>"

    try:
        # CAUTION
        # -------
        # We need to make SURE to NORMALIZE all data before passing it to the
        # 'searchFile()', otherwise comparisons with '>, <, ==' give
        # unexpected results.
        #
        # 'downloaded_file_list' and 'registered_file_list' should have all of
        # their data normalized when created.

        # 'file_types_and_dates' contains both file-type and date, i.e. two
        # consecutive entries per file. Deriving the offsets from 'idx'
        # (instead of counters that are advanced by hand) keeps them aligned
        # even when an earlier iteration raised before it could advance them.
        type_idx = 2 * idx
        date_idx = type_idx + 1

        _name = normalizeData(file_names[idx].text)
        _type = normalizeData(file_types_and_dates[type_idx].text)
        _date = normalizeData(file_types_and_dates[date_idx].text)
        _permalink = normalizeData(post_permalink[idx].get_attribute("href"))
        # The post id is taken from the second-to-last '/'-separated segment
        # of the permalink.
        _post_id = int(_permalink.split("/")[-2])

        # Scrolling after it reaches the end of the list to load more files
        if button is file_option_buttons[-1]:
            scroll_count = scrollNTimes(driver, scroll_count)
            start_time = time.time()

            # NOTE(review): the loop only continues onto the newly loaded
            # files if insertFoundFiles() extends the passed-in lists in
            # place; confirm against helperFunctions.
            [
                file_option_buttons,
                file_names,
                file_types_and_dates,
                post_permalink,
            ] = insertFoundFiles(
                driver,
                [file_option_buttons, file_names, file_types_and_dates, post_permalink],
                [
                    OPTION_BUTTON_XPATH,
                    NAME_XPATH,
                    TYPE_DATE_XPATH,
                    POST_PERMALINK_XPATH,
                ],
            )

            end_time = time.time()
            updateLog(
                "\nTotal files loaded: {} in {}s\n".format(
                    len(file_option_buttons), round(end_time - start_time, 2)
                )
            )

        # Download files of the target type only.
        #
        # Note: this cannot be checked before the scroll above, because there
        # may be more than one page worth of non-target files.
        if _type != TARGET_FILE_TYPE.upper():
            updateLog(
                '\n😪 Skipping ({}): "{} --- {}", 🤔 Reason: FILE_TYPE: "{}"'.format(
                    idx + 1, _name, _date, _type
                )
            )
            continue

        is_downloaded, update_tracker_file = checkDownloadStatus(
            _post_id, _name, _date, downloaded_file_list, registered_file_list
        )

        if is_downloaded:
            updateLog(
                '😪 Skipping ({}): "{} --- {}", 🤔 Reason: ALREADY_DOWNLOADED'.format(
                    idx + 1, _name, _date
                )
            )
            continue

        # If everything is OK then initializing file download 😁
        #
        # web_driver_wait.until(EC.element_to_be_clickable(button)).click()
        # -> this was giving -> ElementClickInterceptedException,
        # so the more ('...' 3 dot) button is clicked through JavaScript.
        driver.execute_script("arguments[0].click();", button)

        waitNSeconds()

        # Try to download the file
        registered_files_count = downloadFile(
            driver, web_driver_wait, DOWNLOAD_BUTTON_XPATH, registered_files_count
        )

        # Waiting until the file gets downloaded completely.
        #
        # Note: we have to wait for the file to be downloaded; if we continue
        # without it there may be files that could not be downloaded in time
        # (before the link expires) and those files cannot be resumed later.
        download_time = waitToFinishDownload(DOWNLOAD_DIRECTORY)

        # Stop the Bot on slow/no internet 😟 (the wait gave up, reportedly
        # after 30 minutes, and signalled it with -1). This is checked BEFORE
        # touching the tracker so an unfinished download is not registered as
        # done, which would prevent it from ever being downloaded again.
        if download_time == -1:
            break

        # Updating tracker file after a successful file download 😎
        #
        # Choices (if you lose internet connection while running this Bot)
        # -----------------------------------------------------------------
        # 1. Registering before 'waitToFinishDownload()': the file is
        #    registered although it was not actually downloaded
        #    -> this file will never be downloaded again.
        #
        # (default) 2. Registering after 'waitToFinishDownload()': the file
        #    may have been downloaded successfully without being registered
        #    -> there will be a duplicate download of the same file.
        if update_tracker_file:
            appendFilesInfo(
                {
                    "id": registered_files_count,
                    "type": _type,
                    "post_id": _post_id,
                    "name": _name,
                    "uploaded_date": _date,
                }
            )

        # Updating info after a successful file download 😎
        #
        # Note: waiting at least 2s before every download request for safety 😅
        updateLog('✔ DONE ({}): "{}", "{}"'.format(idx + 1, _name, _date))

        # For testing.............
        # break

    except Exception as e:
        # Best effort: log the failure for this file and move on to the next.
        updateLog("\n*** ERROR at {}, date: {} ***\n--> {}\n".format(_name, _date, e))


# Closing summary: report the final value of 'registered_files_count' and
# point the user to the project repository.
completion_template = "\n\n\n👏👏👏 Completed 👏👏👏\n\nFiles downloaded: {} 😉\n"
repo_note = "Go ahead and give this repo (https://github.com/ShahriarDhruvo/Bots) a star, thanks\t--- SED"

updateLog(completion_template.format(registered_files_count))
updateLog(repo_note)


# driver.close() # not closing because I had to run this cell multiple times during test
