In [1]:
"""
MUST READ
---------
1. Set your browser's download location to the location you want because all files will be downloaded to the location you specified in your browser.

2. Before leaving it alone for the night, make sure to enable 'Allow multiple file downloads' in your browser's website setting.

3. Create these files when proceeding for the first time
    -> registered_files.json, bot.log, .secret

Note
----
.secret should contain

    0 -> username,
    1 -> password,
    2 -> download directory (for checking)
    3 -> chromewebdriver location (only if you run it in windows)

Acknowledgement
---------------
Thanks -> pythonjar, MariyaSha (YouTuber) and some other stackoverflow members...

"""

import json
from json.decoder import JSONDecodeError
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC

from constants import (
    secrets,
    encoding,
    target_file_type,
    tracker_file_location,
)
from helperFunctions import (
    login,
    updateLog,
    waitNSeconds,
    downloadFile,
    normalizeData,
    loadMoreFiles,
    appendFilesInfo,
    initializeWebpage,
    checkDownloadStatus,
    initializeWebDriver,
    waitToFinishDownload,
    getExistingFilesInfo,
)


# Create the browser driver and its companion wait object via the project helper.
# `web_driver_wait` is used later in this notebook as `web_driver_wait.until(EC...)`.
driver, web_driver_wait = initializeWebDriver()

# Log in through the project helper; per the notes at the top of this cell the
# credentials presumably come from the `.secret` file — confirm in helperFunctions.
login(driver, web_driver_wait)

"""
You should be logged in by now!

Note
----
if you have 2fa on for this account then you have to authorize it and then run the remaining cells manually

"""

# URL of the target group's "files" page; handed to initializeWebpage() in the next cell.
# NOTE: the same group id (201623576939858) is also hard-coded in `post_permalink_xpath`,
# so both must be updated together when targeting another group.
facebook_group_url = "https://www.facebook.com/groups/201623576939858/files/"


In [None]:
"""
Targets identifiers
-------------------
option_button_xpath = ('...' 3 dot) button -> finds 15 per-scroll
download_button_xpath = Download that appears after clicking the option button -> finds 1 per-click
post_permalink_xpath = Permanent link of the post that posted this file -> finds 15 per-scroll

name_xpath = Name of the file -> finds 15 per-scroll
type_date_xpath = Type(pdf/docx) and uploaded date of the file -> finds 30 per-scroll

WARNING
-------
name_xpath, type_date_xpath -> these identifiers change frequently, so update them according to your needs

"""

# Per-file '...' (3 dot) options button
option_button_xpath = "//div" "[@aria-label='File options']"

# 'Download' link that shows up once an options button has been clicked
download_button_xpath = (
    "//a[contains(@href, "
    "'https://www.facebook.com/download/'"
    ")]"
)

# Permanent link of the post that carries the file as an attachment
post_permalink_xpath = (
    "//a[contains(@href, "
    "'https://www.facebook.com/groups/201623576939858/permalink/'"
    ")]"
)

# File name <span>; matched on its full class list, which changes often
name_xpath = (
    "//span[@class='"
    "d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j "
    "fe6kdd0r mau55g9w c8b282yb keod5gw0 nxhoafnm aigsh9s9 "
    "d3f4x2em iv3no6db jq4qci2q a3bd9o3v lrazzd5p oo9gr5id hzawbc8m"
    "']"
)

# File type / uploaded-date <span>; matched on its full class list, which changes often
type_date_xpath = (
    "//span[@class='"
    "d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j "
    "fe6kdd0r mau55g9w c8b282yb keod5gw0 nxhoafnm aigsh9s9 "
    "d9wwppkn iv3no6db e9vueds3 j5wam9gi b1v8xokw oo9gr5id hzawbc8m"
    "']"
)

# Hand the driver and the group's files URL to the project helper
# (presumably opens that page in the browser — confirm in helperFunctions).
initializeWebpage(driver, facebook_group_url)


In [None]:
# Starting offsets of the type (j) and date (k) entries, which alternate
# inside `file_types_and_dates`
j = 0
k = 1

# Id of the last file recorded in the tracker; stays 0 when nothing is registered yet
registered_files_count = 0

# Re-read the existing files information on every run of this cell
downloaded_file_list, registered_file_list = getExistingFilesInfo()

# Pick up the last registered id from the tracker file (registered_files.json)
if registered_file_list:
    with open(tracker_file_location, "r", encoding=encoding) as tracker_file:
        try:
            tracked_entries = json.load(tracker_file)["files"]
            last_entry = tracked_entries[-1]
            registered_files_count = int(last_entry["id"])
        except JSONDecodeError:
            # Tracker content is not valid JSON (e.g. an empty file): keep the count at 0
            pass


# Grabbing the initially loaded targets.
# Each wait returns the list of elements currently matching its XPath; the four
# lists are consumed row by row further down in this cell.
try:
    file_option_buttons = web_driver_wait.until(
        EC.presence_of_all_elements_located((By.XPATH, option_button_xpath))
    )
    file_names = web_driver_wait.until(
        EC.presence_of_all_elements_located((By.XPATH, name_xpath))
    )
    file_types_and_dates = web_driver_wait.until(
        EC.presence_of_all_elements_located((By.XPATH, type_date_xpath))
    )

    # This is the unique identifier of the post which contains files as attachment(s)
    post_permalink = web_driver_wait.until(
        EC.presence_of_all_elements_located((By.XPATH, post_permalink_xpath))
    )
except TimeoutException:
    print("XPATH's identifier (class names) has changed again! 🤯")
    # Stop the cell here. Carrying on would either hit a NameError on the
    # first use of these lists (fresh kernel) or silently work with stale
    # elements left over from an earlier run of this cell.
    raise


""" MAIN LOOP """
# Walk over every loaded file row: read its metadata, skip unwanted or already
# downloaded files, download the rest and record them in the tracker file.
for idx, button in enumerate(file_option_buttons):
    # Placeholders so the error handler at the bottom can always build its
    # message, even when the failure happens before this row's name/date are
    # read (and so it never reports the previous row's values by mistake).
    _name = _date = "<unknown>"

    try:
        # CAUTION
        # -------
        # We need to make SURE to NORMALIZE all data before passing it to the
        # 'searchFile()', otherwise when comparing '>, <, ==' it will give
        # unexpected results.
        #
        # 'downloaded_file_list' and 'registered_file_list' should have all of
        # their data normalized when created.
        _name = normalizeData(file_names[idx].text)

        # 'file_types_and_dates' holds two entries per file: the type followed
        # by the date. The offsets are derived from idx (instead of running
        # counters) so that an exception in one row cannot shift the type/date
        # of every following row.
        type_idx = 2 * idx
        _type = normalizeData(file_types_and_dates[type_idx].text)
        _date = normalizeData(file_types_and_dates[type_idx + 1].text)

        _permalink = normalizeData(post_permalink[idx].get_attribute("href"))
        # The post id is the second-to-last '/'-separated piece of the permalink
        _post_id = int(_permalink.split("/")[-2])

        # Scrolling after it reaches the end of the list to load more files
        if button is file_option_buttons[-1]:
            [
                file_option_buttons,
                file_names,
                file_types_and_dates,
                post_permalink,
            ] = loadMoreFiles(
                driver,
                [file_option_buttons, file_names, file_types_and_dates, post_permalink],
                [
                    option_button_xpath,
                    name_xpath,
                    type_date_xpath,
                    post_permalink_xpath,
                ],
            )

            updateLog("\nTotal files loaded: {}\n".format(len(file_option_buttons)))

        # Download only files of the target type.
        # This cannot be checked before the scroll because there may be more
        # than one page worth of non-matching files.
        if not _type == target_file_type.upper():
            updateLog(
                '\n😪 Skipping ({}): "{} --- {}", 🤔 Reason: FILE_TYPE: "{}"'.format(
                    idx + 1, _name, _date, _type
                )
            )
            continue

        is_downloaded, update_tracker_file = checkDownloadStatus(
            _post_id, _name, _date, downloaded_file_list, registered_file_list
        )

        if is_downloaded:
            updateLog(
                '😪 Skipping ({}): "{} --- {}", 🤔 Reason: ALREADY_DOWNLOADED'.format(
                    idx + 1, _name, _date
                )
            )
            continue

        # Everything is OK: start the download 😁
        # Clicking the more ('...' 3 dot) button through JavaScript, because
        # web_driver_wait.until(EC.element_to_be_clickable(button)).click()
        # was raising ElementClickInterceptedException.
        driver.execute_script("arguments[0].click();", button)

        waitNSeconds()

        # Try to download the file
        registered_files_count = downloadFile(
            driver, web_driver_wait, download_button_xpath, registered_files_count
        )

        # Wait until the file is downloaded completely. If we moved on without
        # waiting, some files might not finish before their link expires, and
        # those downloads cannot be resumed later.
        download_time = waitToFinishDownload(secrets[2])

        # -1 signals that the wait gave up (slow/no internet, per the original
        # author's note: it waits 30 minutes first). Stop the bot BEFORE touching
        # the tracker so an unfinished download is not recorded as done.
        # NOTE(review): confirm the -1 contract in helperFunctions.waitToFinishDownload.
        if download_time == -1:
            break

        # Update the tracker file only after a successful download 😎
        #
        # Trade-off if the internet connection is lost while the bot runs:
        # 1. Registering before 'waitToFinishDownload()' could record a file
        #    that was never downloaded -> it would never be downloaded again.
        # 2. (chosen) Registering after it can leave a downloaded file
        #    unregistered -> the same file may be downloaded twice.
        if update_tracker_file:
            appendFilesInfo(
                {
                    "id": registered_files_count,
                    "post_id": _post_id,
                    "type": _type,
                    "name": _name,
                    "uploaded_date": _date,
                }
            )

        updateLog('✔ DONE ({}): "{}", "{}"'.format(idx + 1, _name, _date))

        # For testing.............
        # NOTE(review): this stops the bot after the first successful download;
        # remove it for a real unattended run.
        break

    except Exception as e:
        updateLog("\n*** ERROR at {}, date: {} ***\n--> {}\n".format(_name, _date, e))


# updateLog(
#     "\n\n\n👏👏👏 Completed 👏👏👏\n\nFiles downloaded: {} 😉\n".format(registered_files_count)
# )
# updateLog(
#     "Go ahead and give this https://github.com/ShahriarDhruvo/Bots a star, thanks\t--- SED"
# )


# driver.close() # not closing because I had to run this cell multiple times during test
