In [69]:
import numpy as np
import pandas as pd

## Load the documents

In [70]:
# Two toy documents that make up the whole collection.
# docA deliberately ends with a space: splitting on " " then yields a trailing
# empty token, which the preprocessing step below has to deal with.
docA = "apple apple apple apple a banana an banana "
docB = "apple car a car"

## Preprocessing

### Represent each document as 'Bag of words' (bow)

### Tokenize

In [71]:
# Naive tokenizer: every run of characters between single spaces is a token.
# Splitting on an explicit " " keeps empty strings (e.g. after a trailing
# space); a real pipeline would rather use nltk.word_tokenize().
bowA, bowB = (doc.split(" ") for doc in (docA, docB))
bowA

['apple', 'apple', 'apple', 'apple', 'a', 'banana', 'an', 'banana', '']

In [72]:
# TASK: remove words which contain <= 2 symbols.
# Keeping only tokens longer than two characters also drops the empty strings
# produced by split(" "), so no separate "remove empty tokens" pass is needed.
bowA = [token for token in bowA if len(token) > 2]
bowB = [token for token in bowB if len(token) > 2]


In [73]:
# Show both bags of words as they look after the length filter above.
print(f'Bag of words for docA:\n{bowA}\n')
print(f'Bag of words for docB:\n{bowB}')

Bag of words for docA:
['apple', 'apple', 'apple', 'apple', 'banana', 'banana']

Bag of words for docB:
['apple', 'car', 'car']


### stemming

'apple' and 'apples' should be counted as the same term. Stemming reduces words to a common root so that such variants are merged.
Resource: https://pythonspot.com/nltk-stemming/


### Remove the 'stop words'

Prepositions, articles and other common words are considered useless (low lexical content) for document representation, so you can filter the "Bag of Words" through a common stop-word list or create a custom, domain-specific one.
The NLTK module has lists of stop words for many languages.

## Create the collection vocabulary

Make one list (set) of the unique words in the document collection

That set will represent the vocabulary of our document collection



In [74]:
# Aside on set(): building a set from a list keeps each distinct value once.
l = [1, 2, 1, 2, 1, 3, 4]
s = set(l)
s

{1, 2, 3, 4}

In [75]:
# Vocabulary of the collection: every distinct token seen in either document.
vocabulary = set(bowA) | set(bowB)

In [76]:
# Print the vocabulary next to both bags of words it was built from.
print(f'vocabulary: {vocabulary}')
print(f'bowA: {bowA}')
print(f'bowB: {bowB}')

vocabulary: {'banana', 'apple', 'car'}
bowA: ['apple', 'apple', 'apple', 'apple', 'banana', 'banana']
bowB: ['apple', 'car', 'car']


## Count unique words in each document, i.e. Term Frequency

Term frequency indicates the significance of a particular term in the document

Now we have to represent each document by the number of times each vocabulary term occurs in it

In [77]:
# One counter dict per document, with every vocabulary term starting at 0 so
# that terms absent from a document still get an explicit zero entry.
TFa = {term: 0 for term in vocabulary}
TFb = {term: 0 for term in vocabulary}
print(f'TFa:{TFa}')

TFa:{'banana': 0, 'apple': 0, 'car': 0}


In [78]:
# Option 1 (kept for reference): walk each bag once and increment the counter
# of every word met; this works because every word is already a key.
# for word in bowA:
#     TFa[word] += 1
# for word in bowB:
#     TFb[word] += 1

# Option 2 (used here): ask each bag how many times every vocabulary term
# occurs. list.count() rescans the bag for each term, which is fine for
# documents of this toy size.
for bag, term_counts in ((bowA, TFa), (bowB, TFb)):
    term_counts.update((term, bag.count(term)) for term in vocabulary)

print(f'countsA: {TFa}')
print(f'countsB: {TFb}')

countsA: {'banana': 2, 'apple': 4, 'car': 0}
countsB: {'banana': 0, 'apple': 1, 'car': 2}


## Create a DataFrame to store the document collection

In [79]:
# Document-term count matrix: one row per document (row 0 comes from TFa,
# row 1 from TFb) and one column per vocabulary term.
counts_df = pd.DataFrame(data=[TFa, TFb])
counts_df

Unnamed: 0,banana,apple,car
0,2,4,0
1,0,1,2


## Calculate adjusted TF

Adjusted term frequency of term t in document d = (number of occurrences of t in d) / (total number of terms in d)

In [80]:
# Adjusted term frequency: each document's raw counts divided by the total
# number of terms in that document. The result is a new float frame, so
# counts_df keeps the raw integer counts.
doc_totals = counts_df.sum(axis=1)
for total in doc_totals:
    print(f'total: {total}')

# axis=0 aligns doc_totals with the row labels, i.e. every row is divided by
# its own total. This avoids writing float rows into an integer frame one
# iloc position at a time. A document with no terms would give 0/0 = NaN;
# report its term frequencies as 0 instead.
TF_df = counts_df.div(doc_totals, axis=0).fillna(0.0)

TF_df

total: 6
total: 3


Unnamed: 0,banana,apple,car
0,0.333333,0.666667,0.0
1,0.0,0.333333,0.666667


## Compute IDF

The inverse document frequency is a measure of how much information the word provides.

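IDF(t) = log(Total number of documents / Number of documents that contain term t).

\begin{equation*}
\mathrm{IDF}(t, D) = \log \frac{N}{N_t}
\end{equation*}

where N is the number of documents in the collection D and N_t is the number of documents in which term t occurs.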




In [81]:
# Start the IDF table as an independent copy of the raw counts, so it has the
# same rows (documents) and columns (terms) as counts_df.
IDF_df = counts_df.copy(deep=True)
IDF_df

Unnamed: 0,banana,apple,car
0,2,4,0
1,0,1,2


In [82]:
# N: number of documents in the collection (one row per document).
# Read from counts_df rather than IDF_df: a later cell overwrites IDF_df with
# IDF values, so re-running this cell afterwards would otherwise count
# "documents with a non-zero IDF" instead of "documents containing the term".
N = len(counts_df)
print(f'Number of documents N = {(N)}')

# Document frequency: a count > 0 becomes True, and summing the booleans down
# each column gives the number of documents that contain the term.
N_per_term = counts_df.astype(bool).sum(axis=0)
print(N_per_term)

Number of documents N = 2
banana    1
apple     2
car       1
dtype: int64


In [83]:
# Display the adjusted term frequencies again for reference.
TF_df

Unnamed: 0,banana,apple,car
0,0.333333,0.666667,0.0
1,0.0,0.333333,0.666667


In [84]:

# IDF depends only on the term, not on the document, so compute the per-term
# vector once instead of recomputing it for every row.
idf_per_term = np.log(N / N_per_term)

# Repeat that vector for every document so IDF_df has the same shape and
# labels as the count/TF frames and can be multiplied with TF_df element-wise.
# Building a fresh float frame avoids writing floats into the integer copy of
# counts_df row by row.
IDF_df = pd.DataFrame(
    np.tile(idf_per_term.to_numpy(), (len(counts_df), 1)),
    index=counts_df.index,
    columns=idf_per_term.index,
)

In [85]:
# Display the IDF table: every row holds the same per-term IDF values.
IDF_df

Unnamed: 0,banana,apple,car
0,0.693147,0.0,0.693147
1,0.693147,0.0,0.693147


In [86]:
# Display IDF_df once more (same content as the previous cell's output).
IDF_df

Unnamed: 0,banana,apple,car
0,0.693147,0.0,0.693147
1,0.693147,0.0,0.693147


## TF-IDF

Term frequency–Inverse document frequency.

A high weight in tf–idf is reached by a high term frequency (in the given document) and a low document frequency of the term in the whole collection of documents; the weights hence tend to filter out common terms. 

TF_IDF = TF*IDF


In [87]:
# TF-IDF weight of each term in each document: element-wise product of the
# two frames, which share the same row (document) and column (term) labels.
TF_IDF = TF_df.mul(IDF_df)

TF_IDF

Unnamed: 0,banana,apple,car
0,0.231049,0.0,0.0
1,0.0,0.0,0.462098
