In [None]:
"""
############### Acknowledgement ###############
1. Set your browser's download location to the directory you want, because all files are downloaded to the location configured in your browser.

# Thanks -> pythonjar, MariyaSha and some other stackoverflow members...

"""


In [68]:
import os
import time
import glob
import json
import platform
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [None]:
encoding = "utf-8"
target_file_type = "pdf"
log_file_location = "bot_tmp.log"
tracker_file_location = "files_info.json"  # this file keeps track of every downloaded file

# Credentials and machine-specific paths are kept out of the notebook in
# ".secret", one value per line:
#   line 0 -> username
#   line 1 -> password
#   line 2 -> download directory (used to check which files already exist)
#   line 3 -> chromedriver location (only read on Windows)
# NOTE: the name `secrets` shadows the stdlib module of the same name; it is
# kept because the other cells index into it.
with open(".secret", encoding=encoding) as f:
    secrets = [secret.strip() for secret in f]

# A missing tracker file is treated as "nothing tracked yet" (first run)
# instead of aborting the whole cell.
try:
    with open(tracker_file_location, "r", encoding=encoding) as f:
        files_info = json.load(f)["files"]
except FileNotFoundError:
    files_info = []

# names of the target-type files already present in the download directory
downloaded_files = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(secrets[2], "*." + target_file_type))
)

# (name, uploaded_date) pairs registered in the tracker file
tracked_files = sorted((info["name"], info["uploaded_date"]) for info in files_info)

# Block notification prompts from the webpage (2 = "block" for this Chrome pref)
options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications": 2}
options.add_experimental_option("prefs", prefs)

# Always go through a Service object: passing the driver path positionally to
# webdriver.Chrome() is not accepted by current Selenium 4 releases.
if platform.system() == "Windows":
    service = Service(secrets[3])
else:
    # Linux and any other platform: no explicit path, so Selenium locates
    # chromedriver itself (e.g. from PATH).
    service = Service()

driver = webdriver.Chrome(service=service, options=options)


In [None]:
# Open the login page over HTTPS so the credentials typed below are never
# associated with a plain-HTTP page load.
driver.get("https://www.facebook.com")

# Locate the credential inputs, waiting up to 10s for each to become clickable
username = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='email']"))
)
password = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "input[name='pass']"))
)

# Enter username and password (clear first in case the browser pre-filled them)
username.clear()
username.send_keys(secrets[0])
password.clear()
password.send_keys(secrets[1])

# Locate the login button and click it. The result is intentionally not stored:
# click() returns None, so binding it to a name would only be misleading.
WebDriverWait(driver, 2).until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "button[type='submit']"))
).click()

"""
###############################################################################################################################
# It should be logged in! -> if you have 2fa then you have to authorize it manually and then run the remaining cells manually #
###############################################################################################################################

"""


In [None]:
#########################################################
# sleep is important because if we scrape too fast then #
# facebook will detect the bot and block this account   #
#########################################################
def waitNSeconds(sleep_time=1):
    """Block the script for `sleep_time` seconds (one second by default)."""
    return time.sleep(sleep_time)


def writeToFile(text):
    """Append `text` plus a trailing newline to the file at `log_file_location`.

    The file is opened in append mode with the module-level `encoding`, so it
    is created on first use and earlier entries are preserved.
    """
    with open(log_file_location, mode="a", encoding=encoding) as log_handle:
        log_handle.write(text + "\n")


def binarySearch(item, itemList, tupled=False):
    """Locate `item` in the ascending-sorted `itemList` by binary search.

    Args
    ----
    item : object or sequence
        Value to look for. With `tupled=True` it must be indexable and its
        first two entries (file name, upload date) form the search key.
    itemList : list
        Sorted list to search. With `tupled=True` it must be sorted by the
        (name, date) pair, which is what `sorted()` on a list of 2-tuples gives.
    tupled : bool, defaults to False
        Compare on the (name, date) pair instead of the whole element.

    Returns
    -------
    int or False
        Index of a matching element, or `False` when there is no match.
        Index 0 is a valid result and is falsy, so callers must test the
        result with `is False` rather than plain truthiness.
    """
    if not itemList:
        return False

    # Ordering on the full (name, date) pair matters: several entries may share
    # a name, and ordering by name alone would always walk right and miss the
    # matches that sit to the left of the midpoint.
    target = (item[0], item[1]) if tupled else item

    left = 0
    right = len(itemList) - 1

    while left <= right:
        mid = left + (right - left) // 2
        candidate = (itemList[mid][0], itemList[mid][1]) if tupled else itemList[mid]

        if candidate == target:
            return mid
        if candidate > target:
            right = mid - 1
        else:
            left = mid + 1

    return False


def isDownloaded(fileName, uploadDate):
    """Report whether a file was already downloaded in a previous run.

    A file counts as downloaded only when its name is present in
    `downloaded_files` (the download directory listing) AND the pair
    (fileName, uploadDate) is present in `tracked_files` (the tracker file).

    On a hit, the matching entries are removed from both module-level lists:
    the website may hold several files with the same name, and each of them
    must be matched against its own entry so the others still get downloaded.

    Args
    ----
    fileName : str
        Name of the file as shown on the page.
    uploadDate : str
        Upload date as shown on the page; compared with the tracker entry.

    Returns
    -------
    bool
        True when the file is already downloaded, otherwise False.
    """
    # if either the tracker file or the download directory is empty then there
    # was no previous attempt to download these files, so download everything
    if not downloaded_files or not tracked_files:
        return False

    downloadedFileIndex = binarySearch(fileName, downloaded_files)
    trackedFileIndex = binarySearch((fileName, uploadDate), tracked_files, True)

    # binarySearch() signals "not found" with False. Index 0 is a valid hit and
    # is also falsy, so the checks must use identity instead of truthiness.
    if downloadedFileIndex is False:
        # not present in the download directory -> it was never downloaded
        return False

    if trackedFileIndex is False:
        # not present in the tracker file -> newly added file
        return False

    # consume the matched entries so a later file with the same name is not
    # matched against them again
    tracked_files.pop(trackedFileIndex)
    downloaded_files.pop(downloadedFileIndex)

    return True


###################################################################
# scroll down to load more files                                  #
# wait 60s before determining that there is no more files to load #
###################################################################
def loadMoreFiles(files_to_load, identifier, timeout=60, n_scroll=1):
    """Scroll to the bottom of the page and collect newly loaded elements.

    Args
    ----
    files_to_load : list of list
        Element lists to grow; each list is extended in place.
    identifier : list of str
        XPath for each list in `files_to_load`, matched by position.
    timeout : int, defaults to 60
        Number of one-second polls per scroll before giving up on new elements.
    n_scroll : int, defaults to 1
        How many scroll-and-wait rounds to perform.

    Returns
    -------
    list of list
        The same `files_to_load` object that was passed in.
    """
    for _ in range(n_scroll):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Only the first list is watched for growth; the original author's
        # assumption is that the other lists load together with it.
        baseline = len(files_to_load[0])
        polls = 0

        while polls < timeout and len(files_to_load[0]) <= baseline:
            time.sleep(1)

            for position, known in enumerate(files_to_load):
                # collect the unseen elements first, then append them in one go
                fresh = []
                for found in driver.find_elements(By.XPATH, identifier[position]):
                    if found not in known:
                        fresh.append(found)
                known.extend(fresh)

            polls += 1

    return files_to_load


def waitToFinishDownload(directory, timeout, nfiles=None, poll_interval=0.5):
    """
    Wait for downloads to finish with a specified timeout.

    A download is considered in progress while `directory` contains a file
    ending in ".crdownload" (the partial-download extension used by
    Chromium-based browsers).

    Args
    ----
    directory : str
        The path to the folder where the files will be downloaded.
    timeout : int
        How many seconds until it stops waiting.
    nfiles : int, defaults to None
        Only used in the progress message; it does not affect the waiting.
    poll_interval : float, defaults to 0.5
        Seconds to sleep between two directory checks. Must be positive.

    Returns
    -------
    float or int
        The accumulated waiting time in seconds, or -1 when partial downloads
        were still present once the timeout was reached.

    Raises
    ------
    ValueError
        If `poll_interval` is not positive (the loop would never advance).

    """
    if poll_interval <= 0:
        raise ValueError("poll_interval must be positive")

    print("\nWaiting for {} files to be downloaded\n".format(nfiles))

    seconds = 0
    dl_wait = True

    while dl_wait and seconds < timeout:
        time.sleep(poll_interval)
        dl_wait = any(
            fname.endswith(".crdownload") for fname in os.listdir(directory)
        )
        seconds += poll_interval

    # Decide on the download state, not on the elapsed time: downloads that
    # complete on the very last poll are a success, not a timeout.
    if dl_wait:
        print(
            "Your connection is too slow or you are not connected! Try again later. Closing the connection..."
        )
        return -1

    print("Continuing after {}s...\n".format(seconds))
    return seconds


In [None]:
# Optional pause before navigating; enable it if the script runs fully
# automated in your case (e.g. no manual 2FA step after the login cell).
# waitNSeconds(5)

# Open the "Files" tab of the target group.
driver.get("https://www.facebook.com/groups/201623576939858/files/")

download_thread_count = (
    3  # the main loop waits for pending downloads whenever (idx + 1) is a multiple of this value
)

wait = WebDriverWait(driver, 10) # explicit wait: poll up to 10s until an element is ready

"""
# targets identifiers                                                            #
# this identifiers will change continuously so update it according to your needs #

"""
# Selectors for the page elements the bot works with. The class-name based
# XPaths mirror Facebook's generated CSS classes and have to be refreshed
# whenever the markup changes. The "per-scroll" counts are the original
# author's observations.
download_button_cssSelector = "a[href*='https://www.facebook.com/download/']"
fileOption_xpath = "//div[@aria-label='File options']"  # author observed 15 matches per scroll
fileName_xpath = "//span[@class='d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j fe6kdd0r mau55g9w c8b282yb keod5gw0 nxhoafnm aigsh9s9 d3f4x2em iv3no6db jq4qci2q a3bd9o3v lrazzd5p oo9gr5id hzawbc8m']"  # author observed 15 matches per scroll
fileTypeDate_xpath = "//span[@class='d2edcug0 hpfvmrgz qv66sw1b c1et5uql lr9zc1uh a8c37x1j fe6kdd0r mau55g9w c8b282yb keod5gw0 nxhoafnm aigsh9s9 d9wwppkn iv3no6db e9vueds3 j5wam9gi b1v8xokw oo9gr5id hzawbc8m']"  # author observed 30 matches per scroll (type + date per file)

###########################################################
# NOT NEEDED -> the page does not sort by file type reliably
###########################################################
# sortButtons_xpath = "//div[@class='l9j0dhe7 du4w35lb j83agx80 pfnyh3mw taijpn5t bp9cbjyn owycx6da btwxx1t3 kt9q3ron ak7q8e6j isp2s0ed ri5dt5u2 rt8b4zig n8ej3o3l agehan2d sk4xxmp2 rq0escxv d1544ag0 tw6a2znq tdjehn4e tv7at329']" # find 3 per-scroll

# sortButtons = wait.until(
#     EC.presence_of_all_elements_located((By.XPATH, sortButtons_xpath))
# )

# # sort by file_type (PDF at the top in my case)
# driver.execute_script("arguments[0].click();", sortButtons[1])


In [None]:
# Number of lines already present in the log file; used as the running total
# of registered files. The handle is closed deterministically, and a missing
# log (first run) counts as zero because writeToFile() creates it on demand.
try:
    with open(log_file_location, encoding=encoding) as log_file:
        registered_files_count = sum(1 for _ in log_file)
except FileNotFoundError:
    registered_files_count = 0

##################################
# grabbing initial loaded target #
##################################
file_option_buttons = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, fileOption_xpath))
)
file_names = wait.until(EC.presence_of_all_elements_located((By.XPATH, fileName_xpath)))
file_types_and_dates = wait.until(
    EC.presence_of_all_elements_located((By.XPATH, fileTypeDate_xpath))
)

# NOTE: loadMoreFiles() extends these three lists in place, so this loop keeps
# running over the newly loaded buttons as they are appended.
for idx, button in enumerate(file_option_buttons):
    _date = None  # keeps the error handler from using an undefined or stale date

    try:
        _name = file_names[idx].text

        # 'file_types_and_dates' holds two entries per file (type, then date).
        # Deriving the positions from idx keeps them aligned even after an
        # earlier iteration failed part-way through.
        _type = file_types_and_dates[2 * idx].text
        _date = file_types_and_dates[2 * idx + 1].text

        # Scroll after every 15th button to load more files (as by default 15 files load per scroll)
        if (idx + 1) % 15 == 0:
            [file_option_buttons, file_names, file_types_and_dates] = loadMoreFiles(
                [file_option_buttons, file_names, file_types_and_dates],
                [fileOption_xpath, fileName_xpath, fileTypeDate_xpath],
            )

        ############################################################################
        # Download pdf files only                                                  #
        # Cannot check this before the scroll because                              #
        # there is a possibility to have more than one page worth of non-pdf files #
        ############################################################################
        if _type != target_file_type.upper():
            continue

        ####################################
        # To skip already downloaded files #
        ####################################
        # isDownloaded() needs both the name and the upload date; files sharing
        # a name are told apart by their date.
        if isDownloaded(_name, _date):
            print(
                "Skipping... {}/{} ---> {}".format(
                    idx + 1, len(file_option_buttons), _name
                )
            )
            continue

        ##########################################################################################################
        # Waiting after every 'download_thread_count' downloads request                                          #
        # we have to wait for the files to be downloaded, if we continue without it there maybe some files       #
        # that couldn't be downloaded on time (before the link expire) and later on those files can't be resumed #
        ##########################################################################################################
        if (idx + 1) % download_thread_count == 0:
            download_time = waitToFinishDownload(
                secrets[2], 600, registered_files_count
            )  # waiting for 600s before closing the connection for slow/no internet

            if download_time == -1:
                break  # Stopping the bot because of a network failure

        #####################################################
        # If everything is OK then initiating file download #
        #####################################################
        # Click the file's options button through JavaScript; a native click
        # raised ElementClickInterceptedException for the original author.
        driver.execute_script("arguments[0].click();", button)

        waitNSeconds(0.8)

        try:
            download_link = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, download_button_cssSelector)
                )
            )

            driver.execute_script(
                "arguments[0].target='_self';", download_link
            )  # to prevent it from opening into a new tab
            driver.execute_script("arguments[0].click();", download_link)

            registered_files_count += 1  # Keeping track of the downloaded files

        except Exception:
            # Best effort: a missing link is logged and the bot moves on.
            # `Exception` (not a bare except) lets Ctrl+C still stop the bot.
            writeToFile(
                "***** Warning: No download link found at {} *****".format(idx + 1)
            )

        # Updating log
        log = '"{}" --- "{}" --- "{}"'.format(_name, _type, _date)
        writeToFile(log)

        print("{} --- {} - {}".format(_name, _date, len(file_option_buttons)))

        #################################################################
        # waiting at least 2s for safety sake                           #
        # waiting (1.2+0.8+calculation_time) -> 2s+ after each download #
        #################################################################
        waitNSeconds(1.2)

    except Exception as e:
        log = "***** ERROR at {}, date: {}: {} *****".format(idx + 1, _date, e)

        print(log)
        writeToFile(log)

# driver.close() # not closing because I had to run this cell multiple times during test
print("\nCompleted. Total scrapped file:", registered_files_count)
