In [1]:
"""
Acknowledgement
---------------
1. Set your browser's download location to the folder you want, because every file is saved to the download location configured in your browser.
2. Create these files -> files_info.json, bot.log, .secret

Thanks -> pythonjar, MariyaSha and some other stackoverflow members...

"""
# Packages & Libraries
import json
from json.decoder import JSONDecodeError
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC

# Custom imports
from constants import (
    secrets,
    encoding,
    target_file_type,
    tracker_file_location,
)
from helperFunctions import (
    login,
    updateLog,
    waitNSeconds,
    downloadFile,
    loadMoreFiles,
    appendFilesInfo,
    initializeWebpage,
    checkDownloadStatus,
    initializeWebDriver,
    waitToFinishDownload,
    getExistingFilesInfo,
)


In [2]:
# Start the browser session. `driver` is the browser handle passed to every helper,
# and `wait` is the object whose `wait.until(...)` performs the explicit waits in later cells.
driver, wait = initializeWebDriver()
# Presumably signs in to Facebook before the group page is opened -- see `login` in helperFunctions
login(driver, wait)


In [3]:
# Base URL of the Facebook group. Defined once so that switching to another group
# only requires changing this line (it is used for the files page and the permalink XPath).
group_url = "https://www.facebook.com/groups/201623576939858"

initializeWebpage(driver, "{}/files/".format(group_url))

# Target identifiers
# Facebook changes these often, so they are kept here where they are easy to update.
download_button_cssSelector = "a[href*='https://www.facebook.com/download/']"
permalink_xpath = "//a[contains(@href, '{}/permalink/')]".format(group_url)  # by default finds 15 per-scroll
fileOption_xpath = "//div[@aria-label='File options']"  # by default finds 15 per-scroll

# The class-name based identifiers are the most fragile ones: update them whenever
# the page markup changes.
fileName_xpath = "//span[@class='d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j fe6kdd0r mau55g9w c8b282yb keod5gw0 nxhoafnm aigsh9s9 d3f4x2em iv3no6db jq4qci2q a3bd9o3v lrazzd5p oo9gr5id hzawbc8m']"  # find 15 per-scroll
fileTypeDate_xpath = "//span[@class='d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j fe6kdd0r mau55g9w c8b282yb keod5gw0 nxhoafnm aigsh9s9 d9wwppkn iv3no6db e9vueds3 j5wam9gi b1v8xokw oo9gr5id hzawbc8m']"  # find 30 per-scroll


In [23]:
registered_files_count = 0

# Refresh existing files info every time this cell loads
downloaded_files, tracked_files = getExistingFilesInfo()

# Id of the last file registered in the tracker file (files_info.json).
# Stays 0 when the tracker file cannot be decoded.
if tracked_files:
    with open(tracker_file_location, "r", encoding=encoding) as f:
        try:
            registered_files_count = int(json.load(f)["files"][-1]["id"])
        except JSONDecodeError:
            pass

# Grabbing the initially loaded targets
try:
    file_option_buttons = wait.until(
        EC.presence_of_all_elements_located((By.XPATH, fileOption_xpath))
    )
    file_names = wait.until(
        EC.presence_of_all_elements_located((By.XPATH, fileName_xpath))
    )
    file_types_and_dates = wait.until(
        EC.presence_of_all_elements_located((By.XPATH, fileTypeDate_xpath))
    )

    # This is the unique identifier of a post/file
    post_permalink = wait.until(
        EC.presence_of_all_elements_located((By.XPATH, permalink_xpath))
    )
except TimeoutException:
    print("XPATH's identifier (class names) has changed again! 🤯")
    # Reset every list so the loop below is skipped instead of failing with a
    # NameError (fresh kernel) or running on stale elements left by an earlier run.
    file_option_buttons, file_names, file_types_and_dates, post_permalink = [], [], [], []


# MAIN LOOP
# NOTE(review): the for-loop keeps iterating the list object it started with, so files
# loaded by scrolling are only visited if loadMoreFiles() extends the passed lists
# in place -- confirm against helperFunctions.
for idx, button in enumerate(file_option_buttons):
    # 'file_types_and_dates' holds two entries per file: [type, date, type, date, ...].
    # Deriving the positions from idx keeps them aligned with file_names/post_permalink
    # even when an earlier iteration failed before it could advance a running counter.
    j, k = 2 * idx, 2 * idx + 1

    # Placeholders so the error log below never reports the previous file's data
    # (or hits an unbound name) when reading the elements itself fails.
    _name = _date = "<unknown>"

    try:
        _name = file_names[idx].text
        _type = file_types_and_dates[j].text
        _date = file_types_and_dates[k].text
        _permalink = post_permalink[idx].get_attribute("href")

        # Scrolling after it reaches the end of the list to load more files
        if button is file_option_buttons[-1]:
            [
                file_option_buttons,
                file_names,
                file_types_and_dates,
                post_permalink,
            ] = loadMoreFiles(
                driver,
                [file_option_buttons, file_names, file_types_and_dates, post_permalink],
                [fileOption_xpath, fileName_xpath, fileTypeDate_xpath, permalink_xpath],
            )

            updateLog("\nTotal Loaded Files: {}\n".format(len(file_option_buttons)))

        # Download only files of the target type.
        # This cannot be checked before the scroll, because there may be more than
        # one page worth of files of other types.
        if _type != target_file_type.upper():
            updateLog(
                '\n😪 Skipping ({}): "{} --- {}", 🤔 Reason: FILE_TYPE: "{}"'.format(
                    idx + 1, _name, _date, _type
                )
            )
            continue

        is_downloaded, update_tracker_file = checkDownloadStatus(
            _date, _permalink, _name, downloaded_files, tracked_files
        )
        if is_downloaded:
            updateLog(
                '😪 Skipping ({}): "{} --- {}", 🤔 Reason: ALREADY_DOWNLOADED'.format(
                    idx + 1, _name, _date
                )
            )
            continue

        # Everything is OK, so start the download.
        # Clicking the more ('...' 3 dot) button through JavaScript, because
        # wait.until(EC.element_to_be_clickable(button)).click() was raising
        # ElementClickInterceptedException.
        driver.execute_script("arguments[0].click();", button)

        # Pause before the download request (duration is decided inside waitNSeconds)
        waitNSeconds()

        # Try to download the file
        registered_files_count = downloadFile(
            driver, wait, download_button_cssSelector, registered_files_count
        )

        # Wait until the file is downloaded completely. Continuing without waiting could
        # leave files that did not finish before the link expired and cannot be resumed.
        download_time = waitToFinishDownload(secrets[2], registered_files_count)

        # -1 signals that the wait gave up (slow/no internet): stop the bot.
        # This is checked BEFORE the tracker update so that a file which did not finish
        # downloading is not registered and therefore gets retried on the next run.
        if download_time == -1:
            break

        # Update the tracker file only after a successful download.
        #
        # Trade-off if the connection is lost while the bot is running:
        # 1. Registering before waitToFinishDownload() could record a file that was
        #    never downloaded -> it would never be downloaded again.
        # 2. (chosen) Registering afterwards can miss a file that did download
        #    -> the same file may be downloaded twice.
        if update_tracker_file:
            appendFilesInfo(
                {
                    "id": registered_files_count,
                    "type": _type,
                    "name": _name,
                    "uploaded_date": _date,
                    "post_permalink": _permalink,
                }
            )

        # Updating info after a successful file download 😎
        updateLog('✔ DONE ({}): "{}", "{}"'.format(idx + 1, _name, _date))

        # NOTE(review): dumping tracked_files and stopping after the first successful
        # download looks like leftover test scaffolding -- remove the next three lines
        # to let the bot download every file.
        for line in tracked_files:
            print(line)
        break

    except Exception as e:
        updateLog("\n*** ERROR at {}, date: {} ***\n--> {}\n".format(_name, _date, e))


# updateLog(
#     "\n\n\n👏👏👏 Completed 👏👏👏\n\nFiles downloaded: {} 😉\n".format(registered_files_count)
# )
# updateLog(
#     "Go ahead and give this https://github.com/ShahriarDhruvo/Bots a star, thanks\t--- SED"
# )


# driver.close() # not closing because I had to run this cell multiple times during test


😪 Skipping (1): "তিন তারে এক সুরে বেঁধেছে.pdf --- March 16, 2022 at 8:43 AM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (2): "তারা.pdf --- March 16, 2022 at 8:43 AM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (3): "দিদি.pdf --- March 16, 2022 at 8:43 AM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (4): "হৃদয়ের এঁড়ে বাছুর.pdf --- March 16, 2022 at 8:43 AM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (5): "যুগবাণী.pdf --- March 16, 2022 at 8:43 AM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (6): "গল্প.pdf --- March 16, 2022 at 8:43 AM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (7): "সন্ধ্যাবেলায়, প্রতিদিন রোববার.pdf --- March 15, 2022 at 8:20 PM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (8): "হনিমুন, বিভূতিভূষণ মুখোপাধ্যায়.pdf --- March 15, 2022 at 8:20 PM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (9): "প্রথমা - প্রতিভা বসু.pdf --- March 15, 2022 at 7:23 PM", 🤔 Reason: ALREADY_DOWNLOADED
😪 Skipping (10): "রহস্য সমগ্র - রবীন্দ্রনাথ ঠাকুর.pdf --- March 16, 2022 at 12:33 AM", 🤔 Reason: ALREADY_DOWNLOADE

In [22]:
import os
import time
import glob
import json
import string
import platform
import unicodedata
from selenium import webdriver
from json.decoder import JSONDecodeError
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


from constants import (
    secrets,
    encoding,
    target_file_type,
    log_file_location,
    normalization_form,
    explicit_wait_time,
    tracker_file_location,
    network_failure_timeout,
)


def compareString(s1, s2):
    """
    Compare two strings while ignoring every character in `string.whitespace`.

    Whitespace is ignored because a file name saved on the machine seems to gain
    extra whitespace after '-', so the on-disk name and the scraped name can differ
    only by spacing.

    Both arguments are expected to be unicode-normalized by the caller already;
    no normalization is done here.

    Return Value
    ------------
    True when both strings are identical after whitespace removal, otherwise False
    """
    # Translation table that deletes the whitespace characters (punctuation is kept)
    drop_whitespace = str.maketrans("", "", string.whitespace)

    left_side = s1.translate(drop_whitespace)
    right_side = s2.translate(drop_whitespace)
    return left_side == right_side


def _isSameTrackedFile(entry, item):
    """Return True when the tracker `entry` describes the same file as `item`."""
    if not compareString(entry[0], item[0]):
        return False

    # There can be tracker entries without a permalink (old files);
    # for those only the uploaded date and the file name are compared
    if entry[1] and entry[1] != item[1]:
        return False

    return compareString(entry[2], item[2])


def binarySearch(item, itemList, multipleCheck=False):
    """
    Binary search for `item` in the sorted `itemList`

    Equality is decided by 'compareString()' (whitespace is ignored), while the
    search direction is decided by the plain '>' comparison, so every value has to
    be normalized the same way before it gets here.

    Parameters
    ----------
    item -> a string, or when 'multipleCheck' is True a sequence of
        0 uploaded date
        1 permalink of the post
        2 file name
    itemList -> sorted list of strings, or when 'multipleCheck' is True a list of
        such sequences sorted by uploaded date
    multipleCheck -> compare date + permalink + name instead of a single string.
        If an entry has no permalink then only date + name are compared.

    Return Value
    ------------
    Index of a matching entry, or -1 if there is none

    """
    left = 0
    right = len(itemList) - 1

    while left <= right:
        mid = left + (right - left) // 2

        if multipleCheck:
            if _isSameTrackedFile(itemList[mid], item):
                return mid

            if compareString(itemList[mid][0], item[0]):
                # Several files can share one uploaded date. The list is ordered by
                # date only, so the wanted entry may sit on either side of 'mid'
                # inside the run of equal dates: check that whole run instead of
                # always moving right (which skipped the entries on the left).
                for step in (-1, 1):
                    neighbour = mid + step
                    while 0 <= neighbour < len(itemList) and compareString(
                        itemList[neighbour][0], item[0]
                    ):
                        if _isSameTrackedFile(itemList[neighbour], item):
                            return neighbour
                        neighbour += step

                # Entries with the same date are contiguous in a date-sorted list,
                # so nothing outside the run can match
                return -1
            elif itemList[mid][0] > item[0]:
                right = mid - 1
            else:
                left = mid + 1

        else:
            if compareString(itemList[mid], item):
                return mid
            elif itemList[mid] > item:
                right = mid - 1
            else:
                left = mid + 1

    return -1


def checkDownloadStatus(
    upload_date, post_permalink, file_name, downloaded_files, tracked_files
):
    """
    Decide whether the requested file has already been downloaded

    How the decision is made
    ------------------------
    1. Either the tracked list or the downloaded list is empty: nothing has been
    downloaded before, so every file still has to be downloaded
    -> (False, True)

    2. The file (uploaded date + permalink + name) is not in the tracked list:
    it was never downloaded
    -> (False, True)

    3. The file is in the tracked list: it counts as downloaded. Its entry is removed
    from `tracked_files` (the caller's list is modified) so that another file with
    the same name on the website is not treated as downloaded as well
    -> (True, True)

    The extra cross-check against the download directory (tracked but missing on
    disk) is turned off for duplication, so `downloaded_files` is only used for the
    emptiness test in 1.

    Return Value
    ------------
    First -> True if the file is already downloaded, False if it still has to be
    downloaded
    Second -> Update tracker file or not (always True at the moment)

    """
    nothing_recorded = not (tracked_files and downloaded_files)
    if nothing_recorded:
        updateLog("\nTracker File empty")
        return False, True

    # Everything that gets compared is normalized first: comparing different unicode
    # representations with '>', '<', '==' inside binarySearch() gives unexpected
    # results. The entries of 'tracked_files' must already be normalized when the
    # list is created.
    normalized_name = unicodedata.normalize(normalization_form, file_name)
    normalized_date = unicodedata.normalize(normalization_form, upload_date)
    normalized_permalink = unicodedata.normalize(normalization_form, post_permalink)

    # Search key layout: (uploaded date, permalink of the post, file name)
    search_key = (normalized_date, normalized_permalink, normalized_name)
    match_index = binarySearch(search_key, tracked_files, True)

    # Compare with -1 explicitly, because index 0 is a valid hit
    if match_index == -1:
        updateLog("\nNot Found in the tracker-file")
        return False, True

    # Consume the entry so it cannot match a second file
    tracked_files.pop(match_index)
    return True, True
