With reference to : https://www.askpython.com/python/examples/tf-idf-model-from-scratch


## Preprocess the Data


In [10]:
import numpy as np
from nltk.tokenize import word_tokenize

In [11]:
#Example text corpus for our tutorial
text = ['Topic sentences are similar to mini thesis statements.\
        Like a thesis statement, a topic sentence has a specific \
        main point. Whereas the thesis is the main point of the essay',\
        'the topic sentence is the main point of the paragraph.\
        Like the thesis statement, a topic sentence has a unifying function. \
        But a thesis statement or topic sentence alone doesn’t guarantee unity.', \
        'An essay is unified if all the paragraphs relate to the thesis,\
        whereas a paragraph is unified if all the sentences relate to the topic sentence.']
 

Preprocessing the Text data

In [12]:
sentences = []
word_set = []

for sent in text:
    x = [i.lower() for i in word_tokenize(sent) if i.isalpha()]
    sentences.append(x)
    for word in x:
        if word not in word_set:
            word_set.append(word)
# set of vocab
word_set = set(word_set)
# Total documents in out corpu
total_documents = len(sentences)

# Creating an index for each word in our vocan
index_dict = {} # Dictionary to store index for each word
i = 0
for word in word_set:
    index_dict[word] = i
    i += 1

## Create a dictionary for keeping count

In [13]:
# We can create a dictionary to keep the ocunt of the number of documents containing the given word

def count_dict(sentences):
    word_count = {}
    for word in word_set:
        word_count[word] = 0
        for sent in sentences:
            if word in sent:
                word_count[word] += 1
    return word_count

## Define a function to calculate the Term Frequency (TF)

In [14]:
def termFrequency(document, word):
    N = len(document)
    occurence = len([token for token in document if token == word])
    return occurence / N

## Define a function to calculate Inverse Document Frequency (IDF)

In [15]:
def inverse_doc_freq(word):
    try: 
        word_occurance = word_count[word] + 1
    except:
        word_occurance = 1
    return np.log(total_documents / word_occurance)

## Combining TF-IDF functions

In [16]:
def tf_idf(sentence):
    tf_idf_vec = np.zeros((len(word_set), ))
    for word in sentence:
        tf = termFrequency(sentence, word)
        idf = inverse_doc_freq(word)

        value = tf * idf
        tf_idf_vec[index_dict[word]] = value
    return tf_idf_vec

## Apply the TF-IDF model to text

In [17]:
vectors = []
for sent in sentences:
    vec = tf_idf(sent)
    vectors.append(vec)
print(vectors[0])

[0.         0.03662041 0.03662041 0.         0.         0.03662041
 0.         0.10986123 0.         0.         0.         0.07324082
 0.10986123 0.07324082 0.03662041 0.03662041 0.         0.03662041
 0.03662041 0.         0.         0.03662041 0.07324082 0.
 0.         0.03662041 0.03662041 0.03662041 0.         0.03662041
 0.03662041 0.10986123 0.         0.         0.03662041 0.03662041
 0.        ]
