In [9]:
import pandas as pd
import numpy as np
from operator import itemgetter 
import collections
import itertools
import time
import pickle

# Import preprocessed datasets
Useful ref.:
- https://stackoverflow.com/questions/3294889/iterating-over-dictionaries-using-for-loops

In [10]:
# Load the training documents from a CSV in the working directory.
# Later cells read the 'id' and 'text' columns of this frame.
df_train = pd.read_csv('train_docs.csv')

In [11]:
# Total number of training documents; passed to compute_idf as the corpus size.
N=len(df_train)

In [12]:
#df_train

In [1]:
# Preview the first rows of the loaded training documents.
df_train.head()

# Index creation
- dictionary: corpus vocabulary from all documents
- input: dictionary, all documents
- logic: iterate over each document's text, update the dictionary for each token, and keep track of the tokens already added for that document
- output: dictionary (k,v) where k= word, v= list of documents the term is in

## Functions

In [14]:
from collections import defaultdict

In [15]:
def build_index(docs_df):
    """Build an inverted index mapping each term to the documents containing it.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Frame with an ``id`` column (document identifier) and a ``text``
        column (document text, tokenized on whitespace).

    Returns
    -------
    collections.defaultdict
        ``{term: [doc_id, ...]}``. A document id is listed at most once per
        term, in the order the documents appear in ``docs_df``.
    """
    out = defaultdict(list)
    # Pair the two columns positionally so the function also works when the
    # frame does not carry the default 0..n-1 index (e.g. after filtering).
    for d_id, d_text in zip(docs_df['id'], docs_df['text']):
        if pd.isna(d_text):
            # Missing text would otherwise be stringified and indexed under
            # the literal token 'nan'.
            continue
        # dict.fromkeys de-duplicates the tokens while keeping first-seen order.
        for w in dict.fromkeys(str(d_text).split()):
            out[w].append(d_id)
    return out

In [16]:
def get_docs_count(index, term):
    """Return the number of documents whose posting list contains ``term``.

    Parameters
    ----------
    index : mapping
        Inverted index ``{term: [doc_id, ...]}``.
    term : str
        Term to look up.

    Returns
    -------
    int
        Length of the posting list, or 0 when ``term`` is not in the index.
    """
    # Use .get rather than index[term]: on a defaultdict, subscripting an
    # unseen term inserts an empty posting list and silently grows the
    # vocabulary as a side effect of a read.
    return len(index.get(term, ()))

## Get index
- output is a dictionary: 
- key: the term
- value: list of the documents in which the term appears

In [17]:
# Build the inverted index over the training documents. Several helpers
# below (compute_idf, isInDoc, get_most_similar_docs) read this module-level name.
index= build_index(df_train)

In [2]:
# Spot check: number of training documents that contain the term 'statin'.
get_docs_count(index, 'statin')

In [3]:
# Vocabulary size: number of distinct terms in the index.
len(index)

In [20]:
# Persist the inverted index. The context manager closes the file handle
# even if pickling fails, so the file is always flushed and released.
with open("index.p", "wb") as index_file:
    pickle.dump(index, index_file)

In [21]:
# Snapshot of the vocabulary terms as a list, in the index's iteration order.
index_array = list(index)

In [4]:
# Vocabulary size again, for comparison with len(index_array) in the next cell.
len(index)

In [5]:
# Number of terms copied into index_array.
len(index_array)

# Tf

In [101]:
#df_train['id'][0]
#df_train['text'][0]

In [26]:
def raw_frequency(term, doc):
    """Count exact occurrences of ``term`` among the whitespace tokens of ``doc``.

    Anything that is not a ``str`` (for example a missing value) is treated
    as a document without tokens, so the result is 0.
    """
    if not isinstance(doc, str):
        return 0
    return doc.split().count(term)

In [28]:
#raw_frequency('statin', df_train['text'][0])

In [29]:
from collections import Counter

In [30]:
def get_most_freq_term(doc):
    """Return the occurrence count of the most frequent token in ``doc``.

    Despite the name, the value returned is the count, not the token itself.

    Parameters
    ----------
    doc : str
        Document text, tokenized on whitespace.

    Returns
    -------
    int
        Highest token count in the document, or 0 when ``doc`` has no tokens
        or is not a string (e.g. a missing value).
    """
    if not isinstance(doc, str):
        return 0
    # One counting pass over the tokens instead of re-scanning the whole
    # document once per token.
    token_counts = Counter(doc.split())
    # default=0 covers empty / whitespace-only documents.
    return max(token_counts.values(), default=0)

In [31]:
#get_most_freq_term(df_train['text'][0])

In [32]:
def compute_tf(term, doc):
    """Log-scaled term frequency of ``term`` in ``doc``.

    Computes ``(1 + log10(count of term)) / (1 + log10(count of the most
    frequent token))``. Returns 0 when the term does not occur in the document.
    """
    occurrences = raw_frequency(term, doc)
    if occurrences <= 0:
        return 0
    top_count = get_most_freq_term(doc)
    return (1 + np.log10(occurrences)) / (1 + np.log10(top_count))

In [33]:
#compute_tf('statin', df_train['text'][0])
#compute_tf('cancer', df_train['text'][0])
#compute_tf('breast', df_train['text'][0])

## Idf

In [34]:
def compute_idf(term, N):
    """Return the inverse document frequency ``log10(N / df)`` of ``term``.

    The document frequency ``df`` is looked up in the module-level ``index``.

    Parameters
    ----------
    term : str
        Term to score.
    N : int
        Total number of documents in the corpus.

    Returns
    -------
    float
        ``log10(N / df)``, or 0.0 when no document contains the term.
    """
    doc_count = get_docs_count(index, term)
    if doc_count == 0:
        # A term with no postings cannot match any document; give it zero
        # weight instead of failing on the division by zero.
        return 0.0
    return np.log10(N / doc_count)

In [36]:
#compute_idf('statin', N)

## Tf-Idf

In [37]:
##check if the term is contained in the document
def isInDoc(term, doc, docID):
    """Return True when ``docID`` appears in the index postings for ``term``.

    Membership is decided from the module-level ``index`` only. ``doc`` is
    kept in the signature for existing callers but is not used.
    """
    # Use .get rather than index[term]: subscripting a defaultdict with an
    # unseen term would insert an empty posting list into the shared index.
    return docID in index.get(term, ())

In [38]:
#print(isInDoc('hahaha', df_train['text'][0], df_train['id'][0]))
#print(isInDoc('statin', df_train['text'][0], df_train['id'][0]))

In [39]:
def compute_tf_idf(term, doc, docID):
    """Tf-idf weight of ``term`` for the document ``docID`` with text ``doc``.

    Returns 0 when the index does not list ``docID`` for ``term``; otherwise
    the product of compute_tf and compute_idf (using the module-level ``N``).
    """
    if not isInDoc(term, doc, docID):
        return 0
    return compute_tf(term, doc) * compute_idf(term, N)

In [40]:
#compute_tf_idf('statin', df_train['text'][0], df_train['id'][0])
#compute_tf_idf('hahaha', df_train['text'][0], df_train['id'][0])

# Term weights matrix
- size: [N, len(index)]
- rows: documents
- columns: term weights (tf-idf)

In [41]:
# testing with a dictionary data structure
def compute_term_weights_matrix(documents, vocab_index):
    """Build ``{doc_id: [tf-idf weight per vocabulary term]}`` for all documents.

    Each row of ``documents`` supplies an ``id`` and a ``text``. The weights
    for a document follow the iteration order of ``vocab_index``.
    """
    weights_by_doc = defaultdict(list)
    for _, record in documents.iterrows():
        doc_id = record['id']
        doc_text = record['text']
        scores = [compute_tf_idf(term, doc_text, doc_id) for term in vocab_index]
        # Only touch the mapping when there is something to store, so an
        # empty vocabulary leaves no entry behind for the document.
        if scores:
            weights_by_doc[doc_id].extend(scores)
    return weights_by_doc

In [43]:
#row=[] 
#row.append(2.343)

In [6]:
# Time the full matrix build. perf_counter is a monotonic, high-resolution
# clock meant for measuring intervals, so the elapsed value is not affected
# by system clock adjustments.
start = time.perf_counter()

matrix_whole = compute_term_weights_matrix(df_train, index)

end = time.perf_counter()
print(end - start)

In [7]:
# Number of document rows in the weights matrix.
len(matrix_whole)

In [8]:
# Number of training documents, for comparison with len(matrix_whole) above.
len(df_train)

In [9]:
# Persist the weights matrix. The context manager closes the file handle
# even if pickling fails.
with open("saved_weghts_matrix.p", "wb") as matrix_file:
    pickle.dump(matrix_whole, matrix_file)

In [49]:
# Reload the saved weights matrix. pickle.load can execute arbitrary code,
# so only load files produced by this notebook.
with open("saved_weghts_matrix.p", "rb") as matrix_file:
    matrix_test = pickle.load(matrix_file)

In [10]:
# Display the reloaded matrix.
# NOTE(review): this renders every document row in full; consider showing
# only a summary such as len(matrix_test).
matrix_test

# Vectorization

In [52]:
def vectorize(query, index):
    """Encode ``query`` as a binary vector over the terms of ``index``.

    Parameters
    ----------
    query : str or iterable of str
        Query text (tokenized on whitespace) or an already tokenized query.
    index : iterable of str
        Vocabulary terms; iterating a dict yields its keys.

    Returns
    -------
    list of int
        One entry per term of ``index``, in iteration order: 1 when the term
        is one of the query tokens, otherwise 0.
    """
    # Compare whole tokens. Testing ``word in query`` against the raw string
    # is a substring test, so e.g. 'art' would match the query 'heart'.
    tokens = set(query.split()) if isinstance(query, str) else set(query)
    return [1 if word in tokens else 0 for word in index]

# Vector and Matrix Normalization

In [53]:
def get_euclidean_norm(vector):
    """Return the Euclidean (L2) norm of ``vector``.

    Zero components are skipped while summing the squares; an empty or
    all-zero vector has norm 0.
    """
    squared_total = sum(
        np.square(component) for component in vector if component != 0
    )
    return np.sqrt(squared_total)

In [54]:
def normalize_vector(vector):
    """Scale ``vector`` to unit Euclidean length.

    Parameters
    ----------
    vector : sequence of numbers

    Returns
    -------
    list
        Each component divided by the vector's Euclidean norm. A vector whose
        norm is 0 (empty or all zeros) is returned as a list of 0.0 values of
        the same length.
    """
    euclid_norm = get_euclidean_norm(vector)
    if euclid_norm == 0:
        # Dividing by a zero norm would turn every component into NaN.
        return [0.0 for _ in vector]
    return [v / euclid_norm for v in vector]

In [55]:
#def normalize_matrix(matrix):
    #for key, value in matrix.items():
     #   matrix[key]= normalize_vector(value)
    #return matrix

# Euclidean Distance and Cosine

In [11]:
import cosine_similarity as cosine
import euclidean_distance as euclid

## Euclidean Distance
### Euclidean distance between normalized vectors

In [58]:
def compute_euclidean_distance(vector1, vector2):
    """Return the Euclidean distance between ``vector1`` and ``vector2``.

    Components are paired by position over the length of ``vector1``.
    """
    squared_gaps = (
        np.square(left - vector2[pos]) for pos, left in enumerate(vector1)
    )
    return np.sqrt(sum(squared_gaps))

## Cosine Distance
### Cosine distance (with normalized vectors)

In [59]:
def compute_cosine_distance_normalized(vector1, vector2):
    """Return half the squared Euclidean distance between the two vectors.

    For unit-length inputs this equals ``1 - cosine similarity``. The inputs
    are not checked or normalized here.
    """
    euclidean = compute_euclidean_distance(vector1, vector2)
    return np.square(euclidean) / 2

### Cosine distance (with unnormalized vectors)

In [60]:
def dot_product(vector1, vector2):
    """Return the sum of position-wise products of the two vectors.

    Positions are taken over the length of ``vector1``.
    """
    return sum(vector1[pos] * vector2[pos] for pos in range(len(vector1)))

In [61]:
def compute_cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)``.

    Returns
    -------
    float
        The similarity, or 0.0 when either vector has zero norm (the cosine
        is undefined there and the division would otherwise yield NaN).
    """
    norm_product = get_euclidean_norm(vector1) * get_euclidean_norm(vector2)
    if norm_product == 0:
        return 0.0
    return dot_product(vector1, vector2) / norm_product

In [62]:
def compute_cosine_distance_unnormalized(vector1, vector2):
    """Return ``1 - cosine similarity`` for two vectors of arbitrary length."""
    similarity = compute_cosine_similarity(vector1, vector2)
    return 1 - similarity

## Most similar documents 

In [160]:
def get_most_similar_docs(query, term_weights_matrix):
    """Rank the documents that share at least one term with ``query``.

    Parameters
    ----------
    query : str
        Free-text query, tokenized on whitespace.
    term_weights_matrix : mapping
        ``{doc_id: [term weight, ...]}`` with one weight per term of the
        module-level ``index``, in the index's iteration order.

    Returns
    -------
    list
        Document ids ordered from most similar (smallest Euclidean distance
        to the normalized query vector) to least similar. Empty when no
        document in the matrix contains any query term.
    """
    query_terms = query.split()

    # Candidate documents: those the index lists for any query term and that
    # are present in the matrix. A dict keeps first-seen order, so documents
    # at equal distance are ranked reproducibly.
    # index.get is used instead of index[term] because subscripting a
    # defaultdict inserts unseen terms, which would lengthen the vocabulary
    # and make the query vector longer than the stored document vectors.
    relevant_docs = {}
    for term in query_terms:
        for doc_id in index.get(term, ()):
            if doc_id in term_weights_matrix:
                relevant_docs[doc_id] = None

    if not relevant_docs:
        return []

    # Pass the token list rather than the raw string so whole terms are
    # matched (a raw string would be matched by substring).
    normalized_query = normalize_vector(vectorize(query_terms, index))

    # NOTE(review): document vectors are used as stored; only the query is
    # normalized here. Confirm whether the matrix is expected to hold
    # pre-normalized rows.
    distance_by_doc = {
        doc_id: compute_euclidean_distance(
            normalized_query, term_weights_matrix[doc_id]
        )
        for doc_id in relevant_docs
    }

    # Ascending distance: the closest (most similar) documents come first.
    return sorted(distance_by_doc, key=distance_by_doc.get)

In [113]:
def get_topk_docs(query, doc_dict, k):
    """Return the first ``k`` document ids from get_most_similar_docs."""
    ranked = get_most_similar_docs(query, doc_dict)
    return ranked[:k]

In [12]:
#get_most_similar_docs("stopping heart disease in childhood", matrix_test)

In [109]:
#get_topk_docs("stopping heart disease in childhood", matrix_test, 24)