# Text Analysis

<b>by Nwachukwu Anthony</b><br>
<b>Email: nwachukwu.anthony2017@gmail.com</b>

## Import Libraries needed

In [1]:
import math
import string

## Import the Documents
Name the documents Document0.txt, Document1.txt, Document2.txt, and so on, and set `n` below to the number of documents.

In [2]:
n = 3 # No of documents
pages = []
# File names follow the pattern Document<index>.txt, with the index starting at 0
for i in range(n):
    pages.append('Document{}.txt'.format(i))
#print(pages)   #Uncomment this to see the list of your documents

## Preprocess Document
Cleans a document when called: line breaks are replaced with spaces, and every punctuation character is replaced with a space.

In [3]:
def documentPreClean(document):
    """Read a text file and return its content as one punctuation-free line.

    Parameters
    ----------
    document : path of the text file to read.

    Returns
    -------
    str : the file content with every line break and every character of
        string.punctuation replaced by a single space.
    """
    with open(document, 'r') as handle:
        rawText = handle.read()
    #Flatten the document into a single line
    singleLine = rawText.replace('\n', ' ')
    #Translation table: code point of each punctuation mark -> space
    punctuationToSpace = {ord(mark): ' ' for mark in string.punctuation}
    return(singleLine.translate(punctuationToSpace))

## Preprocess Document (part 2)
Characters or strings that persist after the first stage are handled and removed here. <br>
<b><u>Example</u></b> <br><b>removeForbiddenCharacters('Document.txt',['\t','@'])</b>

In [4]:
def removeForbiddenCharacters(document,forbidden):
    """Turn a document into a list of lowercase words.

    The file is first cleaned by documentPreClean, then every string in
    `forbidden` is replaced by a space and the text is split on single
    spaces.

    Parameters
    ----------
    document : path of the text file to read.
    forbidden : iterable of strings to blank out before splitting (for
        example the tab character). Empty strings are ignored.

    Returns
    -------
    list of str : the words in document order, stripped of surrounding
        whitespace and lowercased. The list never contains ''.
    """
    #Remove unwanted strings not captured above
    page = documentPreClean(document)
    for item in forbidden:
        #Skip '': replacing the empty string would insert a space between every character
        if item:
            page = page.replace(item,' ')
    #Strip BEFORE filtering: a token made only of whitespace (e.g. a lone tab)
    #turns into '' once stripped and must not be reported as a word
    words = []
    for token in page.split(' '):
        token = token.strip()
        if token:
            words.append(token.lower())
    return(words)

## Create a dictionary from the words in the document

In [5]:
def createDictionary(page,forbidden):
    """Count how many times each word occurs in a document.

    Parameters
    ----------
    page : path of the text file to read.
    forbidden : strings handed to removeForbiddenCharacters to be blanked
        out before the text is split into words.

    Returns
    -------
    dict : word -> number of occurrences. Keys are inserted in order of
        first appearance, so iterating over the result gives the same order
        on every run (seeding the keys from a set made the order depend on
        string hash randomisation).
    """
    dic = {}
    for word in removeForbiddenCharacters(page,forbidden):
        dic[word] = dic.get(word, 0) + 1
    return(dic)

## Determine the term frequency of word i in document j

### $TF_{ij} = \frac{f_{ij}}{\max_k f_{kj}}$

In [6]:
def findWordsFrequency(page,forbidden):
    """Compute the count and the term frequency of every word in a document.

    The term frequency of a word is its count divided by the count of the
    most frequent word in the same document.

    Parameters
    ----------
    page : path of the text file to read.
    forbidden : strings passed through to createDictionary.

    Returns
    -------
    dict : word -> [count, term frequency]. An empty dict is returned when
        the document contains no words.
    """
    wordCounts = createDictionary(page,forbidden)
    #max() raises ValueError on an empty sequence, so handle a wordless document here
    if not wordCounts:
        return({})
    wordListMax = max(wordCounts.values())
    return({word: [count, count/wordListMax] for word, count in wordCounts.items()})

## Compute TF, IDF and TF.IDF for each word in each document and do the formatting

In [7]:
def computeTF_IDE(documents,forbiddenStrings):
    """Print a Word_Count / TF / IDF / TF.IDF table for every document.

    For each word, IDF is log2(number of documents / number of documents
    containing the word), and TF.IDF is the word's term frequency in the
    document multiplied by that IDF.

    Parameters
    ----------
    documents : iterable of paths of the text files to analyse.
    forbiddenStrings : strings passed through to findWordsFrequency.

    Returns
    -------
    None : the tables are written to standard output.
    """
    #Materialise once: the collection is both measured and iterated
    documents = list(documents)

    #Compute the TF of each word in EACH document.
    #Every file is read and tokenised exactly once and the result is reused below.
    frequencyTables = [findWordsFrequency(document,forbiddenStrings) for document in documents]

    #Count in how many documents each word appears
    documentCount = dict()
    for freqWords in frequencyTables:
        for word in freqWords:
            documentCount[word] = documentCount.get(word, 0) + 1

    #Compute the IDF of each word in the WHOLE documents
    idf = dict()
    for word in documentCount:
        idf[word] = math.log(len(documents)/documentCount[word],2)

    header = "{:<40} {:<12} {:<10} {:<8} {:<8}".format('Word','Word_Count','TF','IDF','TF.IDF')
    rowFormat = "{:<40} {:<12} {:<10.3f} {:<8.3f} {:<8.3f}"

    print()
    for number, freqWords in enumerate(frequencyTables, start=1):
        print('Document No: '+str(number))
        print(header)
        print('-'*80)
        for word in freqWords:
            wordCount, tf = freqWords[word]
            print(rowFormat.format(word,wordCount,tf,idf[word],tf*idf[word]))
        print()
        print()
    return

## Run the function

In [8]:
# Print the TF / IDF / TF.IDF table for every file listed in `pages`,
# passing the tab character as the only extra string to blank out.
computeTF_IDE(pages,['\t'])


Document No: 1
Word                                     Word_Count   TF         IDF      TF.IDF  
--------------------------------------------------------------------------------
great                                    1            0.006      1.585    0.010   
predictions                              1            0.006      0.000    0.000   
input                                    23           0.147      0.000    0.000   
inputs                                   7            0.045      1.585    0.071   
been                                     3            0.019      0.585    0.011   
determined                               2            0.013      1.585    0.020   
google                                   2            0.013      0.585    0.007   
terms                                    1            0.006      1.585    0.010   
prediction                               6            0.038      0.000    0.000   
differentiates                           1            0.006      1.585   

artificial                               4            0.026      0.000    0.000   
calculated                               2            0.013      1.585    0.020   
empirical                                3            0.019      0.585    0.011   
paper                                    3            0.019      0.585    0.011   
motion                                   14           0.090      0.000    0.000   
work                                     3            0.019      0.000    0.000   
does                                     2            0.013      1.585    0.020   
applied                                  1            0.006      0.585    0.004   


Document No: 2
Word                                     Word_Count   TF         IDF      TF.IDF  
--------------------------------------------------------------------------------
toward                                   1            0.077      1.585    0.122   
making                                   2            0.154      0.585  