In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

print("Boolean Retrieval Model Using Bitwise operations on Term Document Incidence Matrix")

# Sort the corpus to maintain consistent order of documents
corpus = ['this is a document', 'this document is the second document', 
          'an this is the third document', 'Is this is the First Document']
corpus.sort()

print(f"This is corpus: {corpus}")

vectorizer = CountVectorizer()
x = vectorizer.fit_transform(corpus)
df = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())

print("This generated data frame:")
print(df)
print("Query processing on term document incidence matrix\n")

# AND
print("1. Find all document indices for query 'this' AND 'first'")
alldata = df[(df['this'] == 1) & (df['first'] == 1)]
print(f"Document indices where both 'this' AND 'first' are present are: {alldata.index.tolist()}\n")

# OR
print("2. Find all document indices for query 'this' OR 'first'")
alldata = df[(df['this'] == 1) | (df['first'] == 1)]
print(f"Document indices where either 'this' OR 'first' are present are: {alldata.index.tolist()}\n")

# NOT
print("3. Find all document indices for query 'NOT' 'is'")
alldata = df[(df['is'] == 0)]
print(f"Document indices where 'is' term is not present are: {alldata.index.tolist()}\n")


# Note :-
# in this the documents are 3,2,1,0


Boolean Retrieval Model Using Bitwise operations on Term Document Incidence Matrix
This is corpus: ['Is this is the First Document', 'an this is the third document', 'this document is the second document', 'this is a document']
This generated data frame:
   an  document  first  is  second  the  third  this
0   0         1      1   2       0    1      0     1
1   1         1      0   1       0    1      1     1
2   0         2      0   1       1    1      0     1
3   0         1      0   1       0    0      0     1
Query processing on term document incidence matrix

1. Find all document indices for query 'this' AND 'first'
Document indices where both 'this' AND 'first' are present are: [0]

2. Find all document indices for query 'this' OR 'first'
Document indices where either 'this' OR 'first' are present are: [0, 1, 2, 3]

3. Find all document indices for query 'NOT' 'is'
Document indices where 'is' term is not present are: []

