In [None]:
import re
import os
import shutil
import sys
import hashlib
from datetime import datetime as dt

In [None]:
# Root folder holding the Wayback Machine downloads; every entry directly
# inside it is passed to copy_files_monthly() as one site folder.
path = '/path/to/waybackmachine/downloads'

In [None]:
# Root folder for the reorganised copy; copy_files_monthly() writes to
# <outputpath>/<site folder>/<first six characters of the snapshot name>.
outputpath='/path/to/store/newly/organised/folders'

# Organise Wayback Machine snapshots into folders corresponding to the month they are from

In [None]:
def copy_files_monthly(folder, not_copied = None, not_kept = None, src_root = None, dst_root = None):
    """Copy the page files of one downloaded site into per-month folders.

    Every entry of ``<src_root>/<folder>`` whose name starts with a digit is
    treated as a snapshot. The first six characters of its name select the
    destination ``<dst_root>/<basename of folder>/<six characters>``
    (presumably ``YYYYMM`` of a Wayback Machine timestamp -- TODO confirm
    against the downloader's naming). Each snapshot tree is walked and every
    file whose name contains ``.htm`` or ``.php`` is copied flat into that
    destination, except ``robots.txt`` and ``error.html``. Existing files are
    never overwritten: a clashing name is written as ``name_1.ext``,
    ``name_2.ext`` and so on.

    Parameters
    ----------
    folder : str
        Site folder, relative to ``src_root``.
    not_copied : list, optional
        Accumulator. For each walked directory, the text after the last dot
        of every skipped file name is appended (de-duplicated per directory).
    not_kept : list, optional
        Accumulator. Names of skipped files containing ``.htm`` or ``.txt``
        are appended.
    src_root : str, optional
        Root of the downloads. Defaults to the notebook-level ``path``.
    dst_root : str, optional
        Root of the organised copy. Defaults to the notebook-level
        ``outputpath``.

    Returns
    -------
    None
        A summary of what was skipped is printed.
    """
    if not_copied is None:
        not_copied = []
    if not_kept is None:
        not_kept = []
    # Fall back to the notebook configuration only when no root was supplied,
    # so the function can also be used (and checked) on arbitrary folders.
    if src_root is None:
        src_root = path
    if dst_root is None:
        dst_root = outputpath

    excluded = ('robots.txt', 'error.html')
    site_dir = os.path.join(src_root, folder)
    site_name = os.path.basename(folder)

    # List the site folder once. Sorting makes the order in which clashing
    # names receive their "_<n>" suffix reproducible between runs.
    entries = sorted(os.listdir(site_dir))
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    months = sorted({name[:6] for name in entries if re.match(r'\d', name)})

    for month in months:
        month_dir = os.path.join(dst_root, site_name, month)
        for entry in entries:
            if not entry.startswith(month):
                continue
            for root, dirs, files in os.walk(os.path.join(site_dir, entry)):
                dirs.sort()  # deterministic traversal order
                files = sorted(files)

                # keep only the files that carry page text
                kept = [name for name in files
                        if ('.htm' in name or '.php' in name) and name not in excluded]
                skipped = [name for name in files if name not in kept]

                # make reference of what was not kept
                not_copied.extend(dict.fromkeys(name.split('.')[-1] for name in skipped))
                not_kept.extend(name for name in skipped if '.htm' in name or '.txt' in name)

                if not kept:
                    continue

                # the month folder is only created once there is something to put in it
                os.makedirs(month_dir, exist_ok=True)
                for name in kept:
                    source = os.path.join(os.path.abspath(root), name)
                    stem, extension = os.path.splitext(name)
                    target = os.path.join(month_dir, name)
                    counter = 0
                    # never overwrite: look for the first free "<stem>_<n><ext>"
                    while os.path.exists(target):
                        counter += 1
                        target = os.path.join(month_dir, stem + "_" + str(counter) + extension)
                    shutil.copy(source, target)

    print('File extensions not copied:',list(dict.fromkeys(not_copied)),
          '\n.Txt files not copied:',list(dict.fromkeys(not_kept)),'*',len(not_kept))


In [None]:
# Copy every site folder found under `path` into its monthly output folders.
# NOTE(review): this loop does not look at earlier output. copy_files_monthly()
# never overwrites, so running this cell a second time stores every file again
# under a "_<n>" suffix. The following cell does the same work but skips
# folders that already exist under `outputpath`.
for folder in os.listdir(path):
    copy_files_monthly(folder)

In [None]:
# Resumable copy: walk the site folders under `path`, print a numbered banner
# for each, and copy only those with no folder of the same name under
# `outputpath` yet.
c = 0  # remains 0 when `path` has no entries
for c, folder in enumerate(os.listdir(path), start=1):
    print('_'*20, c, folder, '_'*20, sep='\n')
    fldr = os.path.join(outputpath, folder)
    if not os.path.exists(fldr):
        tm = dt.now().strftime('%m/%d %H:%M:%S')
        print('Copying files from %s ....' % folder)
        print('Copying started at %s \n' % tm)
        copy_files_monthly(folder)
    else:
        # existence of the output folder is the only "done" marker
        print('Folder has already been copied \n')

# Find and remove duplicate files in monthly folders

In [None]:
## The following functions were found at:
## https://stackoverflow.com/questions/748675/finding-duplicate-files-and-removing-them

def chunk_reader(fobj, chunk_size=1024):
    """Yield successive ``fobj.read(chunk_size)`` results until one is empty."""
    piece = fobj.read(chunk_size)
    while piece:
        yield piece
        piece = fobj.read(chunk_size)


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    """Return the raw digest of a file, or of its first 1024 bytes only.

    Parameters
    ----------
    filename : str
        Path of the file; it is opened in binary mode.
    first_chunk_only : bool, optional
        When true, only the first 1024 bytes are hashed (a cheap pre-filter);
        otherwise the whole file is hashed chunk by chunk.
    hash : callable, optional
        Zero-argument factory returning an object with ``update()`` and
        ``digest()``; defaults to ``hashlib.sha1``.

    Returns
    -------
    bytes
        Whatever ``digest()`` of the hash object returns.

    Raises
    ------
    OSError
        Propagated from ``open``/``read`` when the file cannot be accessed.
    """
    hashobj = hash()
    # The context manager closes the handle even when reading or hashing
    # raises, so a failing file does not leak a descriptor.
    with open(filename, 'rb') as file_object:
        if first_chunk_only:
            hashobj.update(file_object.read(1024))
        else:
            for chunk in chunk_reader(file_object):
                hashobj.update(chunk)
    return hashobj.digest()


def check_for_duplicates(paths, hash=hashlib.sha1, filenames = None, dups = None):
    """Find files with identical content below the given directories.

    Candidates are narrowed in three stages: files sharing a size, then files
    sharing a hash of their first 1024 bytes, then files sharing a hash of
    their full content.

    Parameters
    ----------
    paths : iterable of str
        Directories to walk recursively.
    hash : callable, optional
        Hash factory handed to ``get_hash``; defaults to ``hashlib.sha1``.
    filenames : list, optional
        Accumulator for the later-found member of each duplicate pair.
    dups : list, optional
        Accumulator for the first-seen file that the entry at the same index
        of ``filenames`` duplicates.

    Returns
    -------
    tuple of (list, list)
        ``(filenames, dups)`` -- parallel lists of resolved (``realpath``)
        paths. A file with N identical copies contributes N-1 pairs, all
        naming the same first-seen file in ``dups``.

    Notes
    -----
    Files that cannot be accessed (``OSError``) are silently skipped.
    """
    if filenames is None:
        filenames = []
    if dups is None:
        dups = []

    hashes_by_size = {}
    hashes_on_1k = {}
    hashes_full = {}

    for top in paths:
        # The walk's file list has its own name; reusing `filenames` here
        # would replace the result accumulator with a directory listing.
        for dirpath, dirnames, names_in_dir in os.walk(top):
            for name in names_in_dir:
                full_path = os.path.join(dirpath, name)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                except OSError:
                    # not accessible (permissions, etc) - pass on
                    continue
                hashes_by_size.setdefault(file_size, []).append(full_path)

    # For all files with the same file size, get their hash on the 1st 1024 bytes
    for same_size in hashes_by_size.values():
        if len(same_size) < 2:
            continue    # this file size is unique, nothing to compare
        for candidate in same_size:
            try:
                small_hash = get_hash(candidate, first_chunk_only=True, hash=hash)
            except OSError:
                # the file access might have changed since the walk
                continue
            hashes_on_1k.setdefault(small_hash, []).append(candidate)

    # For all files sharing the hash of the 1st 1024 bytes, hash the full
    # file - equal full hashes are reported as duplicates
    for same_head in hashes_on_1k.values():
        if len(same_head) < 2:
            continue    # this first-1k hash is unique, nothing to compare
        for candidate in same_head:
            try:
                full_hash = get_hash(candidate, first_chunk_only=False, hash=hash)
            except OSError:
                # the file access might have changed since the walk
                continue

            first_seen = hashes_full.get(full_hash)
            if first_seen:
                filenames.append(candidate)
                dups.append(first_seen)
            else:
                hashes_full[full_hash] = candidate

    return filenames,dups



In [None]:
# find and remove duplicate files
for item in os.listdir(outputpath):
    duppath=os.path.join(outputpath,item)
    print(duppath)

    cnt=0
    nof=0
    # check_for_duplicates() reports, for every duplicate it finds, the
    # first-seen file with the same content; only those first-seen files are
    # deleted here. With three or more identical files a single pass therefore
    # removes just one of them, so the scan is repeated until nothing is left
    # to remove.
    while True:
        cnt+=1
        lists=check_for_duplicates([duppath])
        orig_filenames=list(dict.fromkeys(lists[0]))
        duplicates=list(dict.fromkeys(lists[1]))
        # never delete a path that is also listed as the copy to keep
        to_remove=[dup for dup in duplicates if dup not in orig_filenames]
        nof=nof+len(to_remove)
        print('Iteration:',cnt)
        print('Total number of files removed:',nof)
        if not to_remove:
            break
        for dup in to_remove:
            os.remove(dup)

    # delete empty folders
    # Bottom-up walk: children are visited (and removed) before their parent
    # is checked, so folders that only contained empty folders go as well.
    for root, direc, file in os.walk(duppath, topdown=False):
        for sub in direc:
            dirpath=os.path.join(root, sub)
            if len(os.listdir(dirpath))==0:
                print(sub)
                os.rmdir(dirpath)