In [45]:
import numpy as np
import pandas as pd
from collections import defaultdict

In [65]:
class LinkedList:
    """A single node of a singly linked list.

    Attributes:
        val: The payload stored in this node (defaults to an empty string).
        next: The following node in the chain, or None when this is the last node.
    """

    def __init__(self, val="", next=None):
        # `next` shadows the builtin inside this method only; the parameter
        # name is kept so keyword callers keep working.
        self.val, self.next = val, next

### Term document incidence matrix

In the term-document incidence matrix, each row represents a distinct term and each column represents a document in the collection; an entry is 1 if the term occurs in that document and 0 otherwise.

In [46]:
# Toy corpus: three short, lower-case documents used by the examples that follow.
Doc1, Doc2, Doc3 = (
    "tigers are my favorite among various animals",
    "tigers generally run faster than humans",
    "animals are generally stronger than humans",
)

In [47]:
# Gather the documents into one corpus list; a document's position in this
# list is what identifies it in the structures built from `docs`.
docs = [Doc1, Doc2, Doc3]
docs

['tigers are my favorite among various animals',
 'tigers generally run faster than humans',
 'animals are generally stronger than humans']

In [48]:
# Vocabulary: every distinct whitespace-separated token seen in any document.
unique_terms = set()
for doc in docs:
    # Adding a whole document's tokens at once; the set drops repeats.
    unique_terms.update(doc.split())
unique_terms

{'among',
 'animals',
 'are',
 'faster',
 'favorite',
 'generally',
 'humans',
 'my',
 'run',
 'stronger',
 'than',
 'tigers',
 'various'}

In [49]:
# Build the term-document incidence matrix as a dict: each term maps to a list
# with one 0/1 entry per document, in the same order as `docs`.
#
# Membership is tested against each document's *tokens*, not the raw string:
# `term in some_string` is a substring test, so a term like "an" would be
# wrongly marked present in a document that only contains "than" or "humans".
# Tokenizing once up front also avoids re-splitting every document per term.
doc_token_sets = [set(doc.split()) for doc in docs]

doc_term_matrix = {}
for term in unique_terms:
    doc_term_matrix[term] = [int(term in tokens) for tokens in doc_token_sets]
doc_term_matrix

{'among': [1, 0, 0],
 'tigers': [1, 1, 0],
 'my': [1, 0, 0],
 'animals': [1, 0, 1],
 'are': [1, 0, 1],
 'faster': [0, 1, 0],
 'than': [0, 1, 1],
 'favorite': [1, 0, 0],
 'generally': [0, 1, 1],
 'various': [1, 0, 0],
 'stronger': [0, 0, 1],
 'run': [0, 1, 0],
 'humans': [0, 1, 1]}

In [50]:
# Object-dtype array holding the document strings from `docs`.
docs_arr = np.array(docs,dtype='object')
# Incidence rows (one 0/1 entry per document) for the terms "tigers" and "animals".
v1 = np.array(doc_term_matrix['tigers'])
v2 = np.array(doc_term_matrix['animals'])

In [51]:
print(v1," ",v2)

[1 1 0]   [1 0 1]


In [52]:
# Documents containing both "tigers" and "animals": the element-wise AND of the
# two incidence rows keeps a 1 only where both terms occur.
v1 & v2

array([1, 0, 0])

In [53]:
# Incidence row (one 0/1 entry per document) for the term "faster".
v3 = np.array(doc_term_matrix['faster'])
v3

array([0, 1, 0])

In [54]:
# Documents containing both "tigers" and "faster": the element-wise AND of the
# two incidence rows keeps a 1 only where both terms occur.
v1 &  v3

array([0, 1, 0])

### Inverted Index

In [55]:
docs

['tigers are my favorite among various animals',
 'tigers generally run faster than humans',
 'animals are generally stronger than humans']

In [56]:
# Split each document on whitespace, keeping one token list per document.
tokens_doc_1, tokens_doc_2, tokens_doc_3 = (
    text.split() for text in (Doc1, Doc2, Doc3)
)

In [57]:
tokens_doc_1

['tigers', 'are', 'my', 'favorite', 'among', 'various', 'animals']

In [58]:
tokens_doc_2

['tigers', 'generally', 'run', 'faster', 'than', 'humans']

In [59]:
tokens_doc_3

['animals', 'are', 'generally', 'stronger', 'than', 'humans']

In [60]:
# Concatenate the per-document token lists, keeping document order and duplicates.
all_tokens = [*tokens_doc_1, *tokens_doc_2, *tokens_doc_3]

In [61]:
all_tokens

['tigers',
 'are',
 'my',
 'favorite',
 'among',
 'various',
 'animals',
 'tigers',
 'generally',
 'run',
 'faster',
 'than',
 'humans',
 'animals',
 'are',
 'generally',
 'stronger',
 'than',
 'humans']

In [63]:
# Sort the token list in place, which places duplicate tokens next to each other.
all_tokens.sort()

In [68]:
# Collapse duplicate tokens and rebind `all_tokens` to the sorted list of distinct terms.
all_tokens = sorted(set(all_tokens))

In [69]:
all_tokens

['among',
 'animals',
 'are',
 'faster',
 'favorite',
 'generally',
 'humans',
 'my',
 'run',
 'stronger',
 'than',
 'tigers',
 'various']

In [72]:
# Tokenize each document once and label it by its 1-based position in `docs`
# ("Document 1", "Document 2", ...), so the index covers however many
# documents the corpus holds instead of exactly three hard-coded ones.
labelled_token_sets = [
    (f"Document {doc_number}", set(doc.split()))
    for doc_number, doc in enumerate(docs, start=1)
]

# Inverted index: each term maps to its postings list, i.e. the labels of the
# documents containing that term, in document order.
res = {}
for token in all_tokens:
    res[token] = [label for label, tokens in labelled_token_sets if token in tokens]

In [71]:
res

{'among': ['Document 1'],
 'animals': ['Document 1', 'Document 3'],
 'are': ['Document 1', 'Document 3'],
 'faster': ['Document 2'],
 'favorite': ['Document 1'],
 'generally': ['Document 2', 'Document 3'],
 'humans': ['Document 2', 'Document 3'],
 'my': ['Document 1'],
 'run': ['Document 2'],
 'stronger': ['Document 3'],
 'than': ['Document 2', 'Document 3'],
 'tigers': ['Document 1', 'Document 2'],
 'various': ['Document 1']}