# File Counting

In [36]:
import os
from collections import Counter

def getFileExtensions(cwd, ignoreExts = []):
    '''Collect the extension of every file found under cwd.

    The directory tree rooted at cwd is walked with os.walk, so files in
    sub-directories are included. One entry is produced per file (the
    result is NOT de-duplicated), which lets callers count occurrences.

    The "extension" is the text after the last "." in the file name. A
    name containing no "." is therefore returned whole (for example
    "Makefile" yields "Makefile"), and a name ending in "." yields "".

    cwd        -- directory to search.
    ignoreExts -- extensions (without the leading dot) to leave out of
                  the result.

    Returns a list of extension strings.
    '''
    found = []
    for _root, _subDirs, names in os.walk(cwd):
        # rsplit with maxsplit=1 keeps only the text after the final dot.
        suffixes = [name.rsplit(".", 1)[-1] for name in names]
        found.extend(suffix for suffix in suffixes if suffix not in ignoreExts)
    return found

def getExtensionCounts(cwd, ignoreExts = []):
    '''Count how many files of each extension exist under cwd.

    The extensions come from getFileExtensions(cwd, ignoreExts), so the
    search covers cwd and all of its sub-directories, and anything listed
    in ignoreExts has already been filtered out before counting.

    cwd        -- directory to search.
    ignoreExts -- extensions (without the leading dot) to leave out.

    Returns a plain dict mapping extension -> number of files, with keys
    in the order the extensions were first encountered.
    '''
    # Counter tallies in a single pass; the previous list.count() call per
    # distinct extension re-scanned the whole list each time.
    return dict(Counter(getFileExtensions(cwd, ignoreExts)))

def getFolderName(folderLoc):
    '''Return the last component of a "/"-separated folder location.

    Trailing slashes are ignored, so "runs/exp1" and "runs/exp1/" both
    give "exp1". Only "/" is treated as a separator. An empty string (or a
    location made up solely of slashes) gives "".

    folderLoc -- folder path as a string.
    '''
    # Without the rstrip a trailing "/" would make the last split piece
    # empty and the folder name would be lost.
    return folderLoc.rstrip("/").split("/")[-1]

def getDiscrepancies(folderLoc, refDict, ignoreExts = []):
    '''Describe how the file-extension counts in a folder differ from a reference.

    The folder at folderLoc (including all of its sub-directories) is
    scanned with getFileExtensions, and the number of files found for each
    extension in refDict is compared with the count recorded in refDict
    (a dictionary such as the one returned by getExtensionCounts).

    NOTE that the check is one-directional: only the extensions listed in
    refDict are examined. Files in folderLoc whose extension is not a key
    of refDict are never reported. This allows analysis files to be added
    to a folder after files have been generated without causing errors at
    this level.

    folderLoc  -- folder to check.
    refDict    -- dict mapping extension -> expected number of files.
    ignoreExts -- extensions to exclude from the check entirely. They are
                  neither counted in the folder nor compared against
                  refDict, so an ignored extension that appears in refDict
                  is not reported as a spurious "0 [FOLDER]" mismatch.

    Returns a string with one newline-terminated line per mismatching
    extension, or '' when every checked count matches.
    '''
    # Tally the folder once; a Counter returns 0 for extensions it never saw.
    folderCounts = Counter(getFileExtensions(folderLoc, ignoreExts))

    mismatches = []
    for ext, refCount in refDict.items():
        if ext in ignoreExts:
            # The folder listing was filtered by ignoreExts, so its count
            # for this extension is meaningless; skip the comparison.
            continue
        folderCount = folderCounts[ext]
        if refCount != folderCount:
            mismatches.append((ext, folderCount, refCount))

    if not mismatches:
        return ''

    # The folder name is only needed (and only computed) when reporting.
    problemFolder = getFolderName(folderLoc)
    return ''.join(
        '%s: Discrepancy in *.%s files: %i [FOLDER] vs %i [REF]\n' % (problemFolder, ext, folderCount, refCount)
        for ext, folderCount, refCount in mismatches
    )