In [None]:
#default_exp tfidf

In [None]:
#export
from collections import Counter
import numpy as np
import math
from fastcore.test import test_eq
from nbdev.showdoc import *

In [None]:
#export
def get_freq(preprocessed_documents):
    """Return per-document term counts and the corpus vocabulary.

    Parameters
    ----------
    preprocessed_documents : iterable of iterables of hashable tokens
        One token collection per document.

    Returns
    -------
    document_frequency : list of collections.Counter
        ``document_frequency[i]`` maps every token of document ``i`` to the
        number of times it occurs in that document.
    vocab : list
        Every distinct token exactly once, in order of first appearance, so
        the column layout derived from it is reproducible between runs.
    """
    document_frequency = []
    # A dict is used as an insertion-ordered set: membership is O(1) and the
    # first-seen order of the tokens is kept, unlike a plain ``set``.
    first_seen = {}

    for doc in preprocessed_documents:
        counts = Counter(doc)
        document_frequency.append(counts)
        # ``update`` leaves already-known tokens at their original position.
        first_seen.update(dict.fromkeys(counts))

    return document_frequency, list(first_seen)

# Render the documentation entry for `get_freq` (signature and docstring).
show_doc(get_freq)

<h4 id="get_freq" class="doc_header"><code>get_freq</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_freq</code>(**`preprocessed_documents`**)

Returns a list with the vocabulary frequencies per document and a vocabulary list

In [None]:
#export
def form_matrix(doc_freq, vocabulary):
    """Return the tf-idf vector of every document.

    For a word ``w`` in a document ``d``::

        tf(w, d) = count of w in d / total number of tokens in d
        idf(w)   = log(N / (df(w) + 1))

    where ``N`` is ``len(doc_freq)`` and ``df(w)`` is the number of documents
    containing ``w``. Because of the ``+ 1`` in the denominator, a word that
    occurs in every document gets a negative idf and a word that occurs in
    ``N - 1`` documents gets an idf of zero.

    Parameters
    ----------
    doc_freq : list of mappings (word -> count)
        One mapping per document, e.g. the first value returned by
        ``get_freq``.
    vocabulary : list
        Words defining the columns of the vectors. If a word is listed more
        than once, its first position is used.

    Returns
    -------
    list of numpy.ndarray
        One float vector of length ``len(vocabulary)`` per document. Words of
        a document that are absent from ``vocabulary`` are skipped.
    """
    n_docs = len(doc_freq)

    # word -> column lookup built once instead of a linear ``list.index``
    # search per word; ``setdefault`` keeps the first position of duplicates.
    column_of = {}
    for position, word in enumerate(vocabulary):
        column_of.setdefault(word, position)

    # Document frequency of every word, computed in a single pass instead of
    # rescanning all documents for each word of each document.
    containing_docs = Counter()
    for doc in doc_freq:
        containing_docs.update(doc.keys())

    M = []
    for doc in doc_freq:
        arr = np.zeros(len(vocabulary))
        total_terms = sum(doc.values())  # invariant for the whole document

        for word, count in doc.items():
            column = column_of.get(word)
            if column is None:
                # Word is outside the requested vocabulary: no column for it.
                continue
            tf = count / total_terms
            idf = math.log(n_docs / (containing_docs[word] + 1))
            arr[column] = tf * idf

        M.append(arr)
    return M

# Render the documentation entry for `form_matrix` (signature and docstring).
show_doc(form_matrix)

<h4 id="form_matrix" class="doc_header"><code>form_matrix</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>form_matrix</code>(**`doc_freq`**, **`vocabulary`**)

Returns a matrix with tf-idf vectors.

In [None]:
#export
def get_query_vec(preprocessed_query, vocab, doc_freq):
    """Return the tf-idf vector of a query, laid out like the document vectors.

    Uses the same weighting as ``form_matrix``::

        tf(w)  = count of w in the query / total number of query tokens
        idf(w) = log(N / (df(w) + 1))

    where ``N`` is ``len(doc_freq)`` and ``df(w)`` is the number of documents
    containing ``w``.

    Parameters
    ----------
    preprocessed_query : iterable of hashable tokens
        The tokens of the query.
    vocab : list
        Words defining the vector positions. If a word is listed more than
        once, its first position is used.
    doc_freq : list of mappings (word -> count)
        One mapping per corpus document; only key membership is used here.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocab)``. Query words that are not in
        ``vocab`` contribute no entry, but they still count towards the total
        number of query tokens used as the tf denominator.

    Raises
    ------
    ValueError
        From ``math.log`` when ``doc_freq`` is empty and the query contains a
        word of ``vocab`` (the logarithm of zero is undefined).
    """
    counter = Counter(preprocessed_query)
    vector = np.zeros(len(vocab))
    total_terms = sum(counter.values())
    n_docs = len(doc_freq)

    # word -> position lookup; ``setdefault`` keeps the first position of
    # duplicates, matching ``list.index``.
    column_of = {}
    for position, word in enumerate(vocab):
        column_of.setdefault(word, position)

    # Iterate over distinct words: repeated tokens would only recompute and
    # rewrite the same value.
    for word, count in counter.items():
        column = column_of.get(word)
        if column is None:
            # No slot for this word, so there is nothing to compute.
            continue

        # Term frequency is a ratio (count / total), as in ``form_matrix``.
        tf = count / total_terms
        freq = sum(1 for doc in doc_freq if word in doc)
        idf = math.log(n_docs / (freq + 1))
        vector[column] = tf * idf

    return vector

In [None]:
#export
def get_cos_sim(matrix, vector, top_k=10):
    """Return the indices of the documents most similar to the query vector.

    Similarity is the cosine of the angle between each document vector and
    ``vector``. When either of the two has zero norm the cosine is undefined;
    the similarity is then taken to be 0 so that such documents are not
    ranked ahead of real matches (a NaN score would sort to the end of
    ``argsort`` and therefore to the front of the reversed ranking).

    Parameters
    ----------
    matrix : sequence of 1-D numpy.ndarray
        One vector per document, e.g. the result of ``form_matrix``.
    vector : 1-D numpy.ndarray
        Query vector with the same length as the document vectors.
    top_k : int or None, default 10
        Maximum number of indices to return; ``None`` returns the full
        ranking.

    Returns
    -------
    numpy.ndarray
        Positions in ``matrix`` ordered from most to least similar, at most
        ``top_k`` of them.
    """
    query_norm = np.linalg.norm(vector)  # loop-invariant

    cos_sim = []
    for vec in matrix:
        denominator = np.linalg.norm(vec) * query_norm
        if denominator == 0:
            # All-zero document or query: avoid 0/0 -> NaN.
            cos_sim.append(0.0)
        else:
            cos_sim.append(np.dot(vec, vector) / denominator)

    # argsort is ascending; reverse it to put the best match first.
    ranking = np.argsort(np.array(cos_sim))[::-1]
    return ranking[:top_k]

In [None]:
# Run nbdev's notebook-to-script conversion; the notebooks it processed are
# listed in the "Converted ..." output below.
from nbdev.export import notebook2script; notebook2script()

Converted 00_preprocess.ipynb.
Converted 01_tfidf.ipynb.
Converted index.ipynb.
