In [5]:
import sys

In [6]:
import os

In [7]:
import glob

In [8]:
from collections import Counter

In [9]:
# Returns file names, directory names, & directory locations
def getFileAndDirectoryNames(location=None):
    """List, print, and return the immediate contents of a directory.

    Only the top level of ``location`` is inspected; sub-directories are
    reported but not descended into.

    Args:
        location: Directory to inspect. When omitted (or ``None``), the
            working directory at call time is used.

    Returns:
        A tuple ``(fileArray, directoryArray, directoryLocations)`` holding
        the file names, the sub-directory names, and each sub-directory name
        joined onto ``location``.

    Raises:
        StopIteration: If ``os.walk`` yields nothing for ``location`` (for
            example, when the directory cannot be listed).
    """
    # Resolve the default here: a `location=os.getcwd()` default is evaluated
    # once, when the function is defined, and goes stale after os.chdir().
    path = os.getcwd() if location is None else location

    # One walk step yields both listings, so the directory is scanned once.
    _, directoryNames, fileNames = next(os.walk(path))
    fileArray = list(fileNames)
    directoryArray = list(directoryNames)
    # os.path.join uses the platform separator instead of a literal "//".
    directoryLocations = [os.path.join(path, name) for name in directoryArray]

    print("FILES----------------------")
    for name in fileArray:
        print(name)

    print()
    print("DIRECTORIES----------------")
    for name in directoryArray:
        print(name)
    print("---------------------------------------------------------------------")

    return (fileArray, directoryArray, directoryLocations)

In [10]:
# Retrieves Sub Directory Information(Files and Directories)
def getSubDirectoryFilesAndDirectories(directoryLocations):
    """Print each directory path, followed by a listing of its contents.

    Every entry of ``directoryLocations`` is echoed and then handed to
    ``getFileAndDirectoryNames``, whose return value is discarded. Nothing
    is returned.
    """
    for subDirectory in directoryLocations:
        print(subDirectory)
        getFileAndDirectoryNames(subDirectory)

In [11]:
# Returns a List of File Types
def getFileTypes(fileArray):
    """Return the extension (without the dot) of every file name.

    Args:
        fileArray: Iterable of file names or paths.

    Returns:
        A list with one entry per input name, in the same order. Names with
        no extension contribute an empty string.
    """
    # os.path.splitext takes only the final suffix, so "a.tar.gz" gives "gz"
    # and a name without a dot gives "" instead of raising IndexError as
    # `name.split(".")[1]` did. The slice drops the leading dot.
    return [os.path.splitext(name)[1][1:] for name in fileArray]

In [27]:
# Returns a Count of Each File Type as a Counter Object
def getFileCounts(fileTypes):
    """Tally how many times each entry of ``fileTypes`` occurs.

    Returns:
        A ``collections.Counter`` keyed by the entries of ``fileTypes``.
    """
    tally = Counter()
    tally.update(fileTypes)
    return tally

In [12]:
# Use this Method to read a File Line-By-Line
def readFile_LineByLine(fileName):
    """Print the contents of a text file, one line at a time.

    Args:
        fileName: Path of the file to open in text read mode.
    """
    with open(fileName, 'r') as file:
        for line in file:
            # Each line already ends with its own newline; suppress print's
            # terminator so the output is not double-spaced.
            print(line, end="")

In [13]:
# Call readFile_LineByLine on every file in a list
def readFiles(fileList):
    """Print the contents of each file in ``fileList``, in order.

    Args:
        fileList: Iterable of file paths; each is passed to
            ``readFile_LineByLine``.
    """
    for file in fileList:
        # The helper is named readFile_LineByLine (with an underscore); the
        # previous spelling, readFileLineByLine, raised NameError.
        readFile_LineByLine(file)

In [14]:
# Open File, Write 'text' and Close
def openWriteClose(fileName, text):
    """Write ``text`` to ``fileName``, replacing any existing contents.

    The file is opened in text write mode and closed again before the
    function returns. Nothing is returned.
    """
    with open(fileName, mode='w') as outputFile:
        outputFile.write(text)

In [15]:
# Open File, Append, & Close
def openAppendClose(fileName, text):
    """Append ``text`` to the end of ``fileName``.

    The file is opened in text append mode and closed again before the
    function returns. Nothing is returned.
    """
    with open(fileName, mode='a') as outputFile:
        outputFile.write(text)

In [16]:
# Open and Read all Lines from a File. NOTE: Returns a list.
def openReadAll(fileName):
    """Read a text file and return its lines as a list.

    Each element keeps whatever line terminator it had in the file.
    """
    with open(fileName, mode='r') as inputFile:
        allLines = list(inputFile)
    return allLines

In [17]:
# Construct a Single String from all Lines
def appendAllLinesInFile(Lines):
    """Join all lines into one string separated by single spaces.

    Trailing whitespace is removed from every line before joining, and from
    the final result. Leading whitespace inside a line is kept.
    """
    trimmedLines = (line.rstrip() for line in Lines)
    return " ".join(trimmedLines).rstrip()

In [18]:
# Get Word Frequency
def fileWordFrequency(text):
    """Count how often each whitespace-separated word occurs in ``text``.

    Args:
        text: The string to analyse.

    Returns:
        A ``collections.Counter`` mapping each word to its count. Empty or
        whitespace-only text gives an empty Counter.
    """
    # split() with no argument splits on any run of whitespace, so repeated
    # spaces, tabs and newlines never produce empty-string "words" the way
    # split(" ") did.
    return Counter(text.split())

In [19]:
# Get the Number of Lines in a File 
def fileCountLines(fileName):
    """Count the lines in a text file, print the count, and return it.

    Args:
        fileName: Path of the file to open in text read mode.

    Returns:
        The number of lines as an ``int``. The value used to be printed
        only; it is now returned as well so callers can use it.
    """
    with open(fileName, 'r') as file:
        lines = sum(1 for _ in file)
    print(lines)
    return lines

In [20]:
# Get File Information: Creation Time, Modification Time, File Owner Information, File Size, Author ID and Group ID
def getCreationModificationSizeOwnershipInfo(file):
    """Print ownership, size and timestamp details taken from a stat result.

    Args:
        file: An object exposing ``st_ctime``, ``st_mtime``, ``st_size``,
            ``st_uid`` and ``st_gid`` (presumably the value returned by
            ``os.stat``; verify against callers).

    Nothing is returned; all information is written to standard output.
    """
    import time  # local import, as this file's other self-contained helpers do

    creationTime = time.ctime(file.st_ctime)
    modificationTime = time.ctime(file.st_mtime)
    fileSize = file.st_size
    authorID = file.st_uid
    groupID = file.st_gid

    print("User ID of file creator: "+str(authorID))
    print("Group ID of user: "+str(groupID))
    print("File size is: "+str(fileSize) + " bytes")
    print("File was created at:       "+creationTime)
    print("File was last modified at: "+modificationTime)

    # Work on the epoch values directly. Re-parsing the ctime() strings used
    # undefined names, and timedelta.seconds silently dropped whole days.
    # Both stamps are truncated to whole seconds, matching the resolution of
    # the ctime() text shown above.
    secondsAfterCreation = int(file.st_mtime) - int(file.st_ctime)
    print("The file was modified "+str(secondsAfterCreation)+" seconds after creation.")

    hoursSinceModification = round((time.time() - file.st_mtime) / 3600)
    print("The file was modified "+str(hoursSinceModification)+" hours ago from now.")

In [37]:
# time1 & time2 are time.ctime()-style strings, e.g. time.ctime(st_ctime)
def getDuration(time1, time2):
    """Express the time from ``time1`` to ``time2`` in several units.

    Args:
        time1: Start time as a string in ``"%a %b %d %H:%M:%S %Y"`` format.
        time2: End time in the same format.

    Returns:
        A dict with the keys ``'seconds'``, ``'minutes'``, ``'hours'``,
        ``'days'``, ``'months'`` and ``'years'``. Each value is the whole
        duration expressed in that unit. Months are approximated as 30 days
        and years as 365 days.

    Raises:
        ValueError: If either string does not match the expected format.
    """
    import datetime  # local import, as this file's other self-contained helpers do

    ctimeFormat = "%a %b %d %H:%M:%S %Y"
    start = datetime.datetime.strptime(time1, ctimeFormat)
    end = datetime.datetime.strptime(time2, ctimeFormat)
    duration = end - start

    # timedelta.seconds is only the sub-day remainder; add the whole days
    # back in so durations longer than 24 hours are not truncated.
    seconds = duration.days * 86400 + duration.seconds
    minutes = seconds / 60
    hours = minutes / 60
    days = hours / 24
    months = days / 30
    # Derived from days: the previous `months / 365` was off by a factor of 30.
    years = days / 365

    return {
        'seconds': seconds,
        'minutes': minutes,
        'hours': hours,
        'days': days,
        'months': months,
        'years': years,
    }

In [21]:
# Split Line on Delimiters
def splitSentence(delimiters, line):
    """Split ``line`` wherever the regular expression ``delimiters`` matches.

    Args:
        delimiters: Regular-expression pattern describing the separators.
        line: The string to split.

    Returns:
        The list produced by ``re.split(delimiters, line)``.
    """
    # No visible cell imports `re`; import it locally so the helper is
    # self-contained, as the other helpers in this file do.
    import re

    return re.split(delimiters, line)

In [22]:
# Find Words in a Line
def findWords(line):
    """Return every run of word characters or apostrophes found in ``line``.

    Returns:
        A list of the non-overlapping matches of ``[\\w']+``, in order of
        appearance.
    """
    # No visible cell imports `re`; import it locally so the helper is
    # self-contained, as the other helpers in this file do.
    import re

    return re.findall(r"[\w']+", line)

In [23]:
# Get Punctuation Marks in a Line
def getPunctuations(line):
    """Count the punctuation marks that occur in ``line``.

    Recognised marks are ``, ; . ! ? - :``.

    Returns:
        A list of ``((mark, firstIndex), count)`` tuples, ordered from most
        to least frequent. ``firstIndex`` is the position of the mark's
        first occurrence in ``line``.
    """
    from collections import Counter
    marks = [',', ';', '.', '!', '?', '-', ':']
    firstIndexOf = {}
    tally = Counter()
    for position, ch in enumerate(line):
        if ch not in marks:
            continue
        # Every occurrence is keyed by the index of the mark's first sighting.
        firstIndex = firstIndexOf.setdefault(ch, position)
        tally[(ch, firstIndex)] += 1
    return tally.most_common()

In [1]:
# Returns a str
def getTextFromFile(file):
    """Return the full contents of a text file as one string.

    Args:
        file: Path of the file to open in text read mode.

    Returns:
        The file's text, with line breaks preserved.
    """
    with open(file, 'r') as document:
        # read() gives the actual text. str(readlines()) returned the repr of
        # a list instead: brackets, quotes and escaped "\\n" sequences.
        return document.read()

In [2]:
# Returns a list of sentences
def getSentencesFromText(text):
    """Split ``text`` into sentences with NLTK's sentence tokenizer.

    Args:
        text: The string to split.

    Returns:
        A new list containing the items yielded by ``sent_tokenize(text)``.
    """
    # No visible cell imports sent_tokenize; import it locally so the helper
    # is self-contained, as getPartOfSpeech does for nltk.
    from nltk.tokenize import sent_tokenize

    return list(sent_tokenize(text))

In [3]:
# Returns words and words without stopwords
def getWordsFromSentences(allSentences):
    """Tokenize sentences into words and also filter out stopwords.

    Both word lists are printed together with their lengths.

    Args:
        allSentences: Iterable of sentence strings.

    Returns:
        A tuple ``(words, wordsFiltered)``: every token in order, and the
        same tokens with those found in ``stopwords.words()`` removed. The
        comparison is case-sensitive.
    """
    # No visible cell imports these names; import them locally so the helper
    # is self-contained, as getPartOfSpeech does for nltk.
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    words = []
    for sentence in allSentences:
        words.extend(word_tokenize(sentence))
    print(words, len(words))

    # Load the stopword list once and test membership against a set. Calling
    # stopwords.words() inside the loop re-evaluated it for every word.
    # NOTE(review): called without a language, as before; confirm whether a
    # single language such as 'english' was intended.
    stopwordSet = set(stopwords.words())
    wordsFiltered = [word for word in words if word not in stopwordSet]
    print(wordsFiltered, len(wordsFiltered))
    return (words, wordsFiltered)

In [1]:
# Get Part-of-Speech for each Word and a Frequency of Each Part-of-Speech in the Sentence.
def getPartOfSpeech(line):
    """Tag every word of ``line`` and count how often each tag occurs.

    Args:
        line: The sentence to analyse.

    Returns:
        A tuple ``(posList, tagCounts)``. ``posList`` is the list of
        ``(word, tag)`` pairs from ``nltk.pos_tag``; ``tagCounts`` is a
        ``collections.Counter`` keyed by tag.
    """
    import nltk
    from collections import Counter
    posList = list(nltk.pos_tag(nltk.word_tokenize(line)))
    # Count the tags alone. Counting the (word, tag) pairs, as before, gave a
    # word/tag pair frequency rather than the part-of-speech frequency this
    # helper is meant to return.
    tagCounts = Counter(tag for _word, tag in posList)
    return (posList, tagCounts)

In [2]:
# Get Part-of-Speech Information for a file as a dictionary. Key is sentence index, and value is a list of POS information.
def getPartOfSpeechInfoForFile(file):
    """Part-of-speech tag every sentence of a text file.

    Args:
        file: Path of the file to open in text read mode.

    Returns:
        A dict mapping each sentence's index (0-based, in the order produced
        by ``nltk.sent_tokenize``) to that sentence's list of ``(word, tag)``
        pairs from ``nltk.pos_tag``.
    """
    # Imported locally, matching getPartOfSpeech; no visible cell imports
    # nltk at module level.
    import nltk

    with open(file, 'r') as document:
        text = document.read()

    filePOSInfo = {}
    for index, sentence in enumerate(nltk.sent_tokenize(text)):
        filePOSInfo[index] = list(nltk.pos_tag(nltk.word_tokenize(sentence)))
    return filePOSInfo

In [3]:
# Aggregate File Part-of-Speech Information i.e. frequency of each POS in each line in each sentence of the file.
def aggregateFilePOSInfo(filePOSDict):
    """Replace each sentence's tagged words with a per-tag frequency count.

    Args:
        filePOSDict: Dict mapping a sentence index to a list of
            ``(word, tag)`` pairs. It is modified in place.

    Returns:
        The same dict object, with every value replaced by a
        ``collections.Counter`` keyed by tag.
    """
    from collections import Counter
    # Only values of existing keys are reassigned, so the dict's size never
    # changes while it is being iterated.
    for sentenceIndex in filePOSDict:
        taggedWords = filePOSDict[sentenceIndex]
        # Count the tags alone; counting whole (word, tag) pairs did not give
        # the part-of-speech frequency this helper is meant to produce.
        filePOSDict[sentenceIndex] = Counter(tag for _word, tag in taggedWords)
    return filePOSDict

In [None]:
# Get POS Information 