### Boolean Model

In [None]:
# Toy corpus for the Boolean retrieval demo: each string is one document
# written as whitespace-separated term tokens.
documents = [
    "term1 term2 term3 term4 term5",
    "term1 term3 term5 term6",
    "term2 term3 term4",
    "term1 term2 term3",
    "term4 term5 term6",
]

In [None]:
# Vocabulary used as the columns of the term-document matrix.
terms = ["term1", "term2", "term3"]
# Example queries; the operands are separated by " AND " / " OR " and are
# split on exactly those separators when the queries are evaluated.
query_and = "term1 AND term3"
query_or = "term1 OR term2"

In [None]:
# Build the binary term-document incidence matrix: one row per document and
# one column per entry of `terms`. A cell is 1 when the term is one of the
# document's whitespace-separated tokens and 0 otherwise.
# Membership is tested against the token set, not the raw string, so a term
# such as "term1" cannot falsely match a longer token such as "term10".
matrix = []
for doc in documents:
    doc_tokens = set(doc.split())
    matrix.append([1 if term in doc_tokens else 0 for term in terms])

In [None]:
# Show the incidence matrix: one row per document, one 0/1 entry per term in `terms`.
matrix

[[1, 1, 1], [1, 0, 1], [0, 1, 1], [1, 1, 1], [0, 0, 0]]

In [None]:
# Evaluate the two Boolean queries against the incidence matrix.
# Only query operands that appear in `terms` take part, because the matrix has
# no column for any other term.

# AND: a document matches when every participating term is present.
# All columns are checked per document, so an empty intermediate result can
# never be mistaken for "no term processed yet" and refilled by a later term.
query_terms_and = query_and.split(" AND ")
and_columns = [i for i, term in enumerate(terms) if term in query_terms_and]
matching_docs_and = [
    j
    for j, row in enumerate(matrix)
    if and_columns and all(row[i] == 1 for i in and_columns)
]

# OR: a document matches when at least one participating term is present.
# Each document index is emitted once, in ascending order.
query_terms_or = query_or.split(" OR ")
or_columns = [i for i, term in enumerate(terms) if term in query_terms_or]
matching_docs_or = [
    j
    for j, row in enumerate(matrix)
    if any(row[i] == 1 for i in or_columns)
]

In [None]:
# Report the matching documents for both queries, labelled Doc1, Doc2, ...
# (1-based) followed by the document text.
print("Matching Documents for AND Operation:")
for doc_index in matching_docs_and:
    print(f"Doc{doc_index + 1}: {documents[doc_index]}")

print("\nMatching Documents for OR Operation:")
# Deduplicate, then sort: iteration order of a set is not guaranteed, and the
# listing should always come out in document order.
for doc_index in sorted(set(matching_docs_or)):
    print(f"Doc{doc_index + 1}: {documents[doc_index]}")

Matching Documents for AND Operation:
Doc1: term1 term2 term3 term4 term5
Doc2: term1 term3 term5 term6
Doc4: term1 term2 term3

Matching Documents for OR Operation:
Doc1: term1 term2 term3 term4 term5
Doc2: term1 term3 term5 term6
Doc3: term2 term3 term4
Doc4: term1 term2 term3


### Extended Boolean Model

In [None]:
import pandas as pd

# Index terms (rows of the weight table), the document collection and the query
# for the Extended Boolean Model example.
keywords = ["mount", "everest", "earth", "mountain", "kalsubai", "fuji"]
documents = [
    """D1: Mount Everest is Earth highest mountain above sea level located in the subrange of the
     Himalayas. Mount Everest attracts many climbers, some of them are highly experienced mountaineers.""",
    """D2: Kalsubai is a is mountain in the western Ghats located in the Indian State.
    The mountain range lies within the Kalsubai Harishcndragad wildlife sanctuary.""",
    "D3: Mount Fuji is a very distinctive feature of the geography of Japan. The mountain stands about 100km."
]
query = "mount kalsubai"

# Punctuation stripped from both ends of every token, so that a keyword
# followed by "." or "," is still counted as an occurrence.
TOKEN_EDGE_PUNCTUATION = ".,:;!?\"'()"

# Tokenise: lower-case, split on whitespace, trim edge punctuation and drop
# tokens that consisted of punctuation only.
document_terms = []
for doc in documents:
    doc_terms = [token.strip(TOKEN_EDGE_PUNCTUATION) for token in doc.lower().split()]
    document_terms.append([token for token in doc_terms if token])

# Lower-case the query as well: the documents are lower-cased and the keywords
# are lower-case, so a capitalised query term would otherwise never match.
query_terms = query.lower().split()

# Raw term frequencies: one row per keyword, one column per document.
freq_matrix = [
    [doc_terms.count(term) for doc_terms in document_terms]
    for term in keywords
]

df = pd.DataFrame(freq_matrix, columns=[f"D{i+1}" for i in range(len(documents))], index=keywords)

# 1 for keywords that occur in the query, 0 otherwise.
query_presence = [1 if term in query_terms else 0 for term in keywords]
df['Query'] = query_presence

# Divide every entry by the maximum of its own column (max-normalisation).
# A column that is entirely zero has maximum 0; divide it by 1 instead so it
# stays all-zero rather than becoming NaN through 0 / 0.
max_values = df.max().replace(0, 1)
new_df = df.divide(max_values, axis=1)
print(new_df)


           D1   D2   D3  Query
mount     1.0  0.0  1.0    1.0
everest   1.0  0.0  0.0    0.0
earth     0.5  0.0  0.0    0.0
mountain  0.5  1.0  1.0    0.0
kalsubai  0.0  1.0  0.0    1.0
fuji      0.0  0.0  1.0    0.0


In [None]:
# Independent copy of the normalised weights holding only the document
# columns (the Query column is left out).
temp_df = new_df.loc[:, ["D1", "D2", "D3"]].copy()

In [None]:
# Display the normalised weights restricted to the document columns D1-D3.
temp_df

Unnamed: 0,D1,D2,D3
mount,1.0,0.0,1.0
everest,1.0,0.0,0.0
earth,0.5,0.0,0.0
mountain,0.5,1.0,1.0
kalsubai,0.0,1.0,0.0
fuji,0.0,0.0,1.0


In [None]:
# Document frequency of each keyword: the number of document columns in which
# its normalised weight is greater than zero, stored as an integer 'Count' column.
doc_columns = ["D1", "D2", "D3"]
new_df['Count'] = new_df[doc_columns].gt(0).sum(axis=1).astype(int)


print("Updated DataFrame with Count:")
print(new_df)


Updated DataFrame with Count:
           D1   D2   D3  Query  Count
mount     1.0  0.0  1.0    1.0      2
everest   1.0  0.0  0.0    0.0      1
earth     0.5  0.0  0.0    0.0      1
mountain  0.5  1.0  1.0    0.0      3
kalsubai  0.0  1.0  0.0    1.0      1
fuji      0.0  0.0  1.0    0.0      1


In [None]:
import math

# Inverse document frequency per keyword: idf = log10(1 + N / count), where
# `count` is the keyword's 'Count' entry, rounded to 3 decimals.
cc = new_df['Count']
N = 3  # number of documents; must match the number of document columns (D1..D3)
idf = []

for count in cc:
    if count > 0:
        res = round(math.log10(1 + (N / count)), 3)
    else:
        # A keyword found in no document would make N / count a division by
        # zero; give it weight 0.0 so it contributes nothing downstream.
        res = 0.0
    idf.append(res)

print(idf)

new_df['idf'] = idf

print('df after idf')
print(new_df)


[0.398, 0.602, 0.602, 0.301, 0.602, 0.602]
df after idf
           D1   D2   D3  Query  Count    idf
mount     1.0  0.0  1.0    1.0      2  0.398
everest   1.0  0.0  0.0    0.0      1  0.602
earth     0.5  0.0  0.0    0.0      1  0.602
mountain  0.5  1.0  1.0    0.0      3  0.301
kalsubai  0.0  1.0  0.0    1.0      1  0.602
fuji      0.0  0.0  1.0    0.0      1  0.602


In [None]:
# Normalised idf: every keyword's idf divided by the largest idf in the table,
# stored in the 'Gi' column.
idf_values = new_df['idf']
max_idf = max(idf_values)
new_df['Gi'] = idf_values.div(max_idf)

print("Updated DataFrame with Gi column:")
print(new_df)

Updated DataFrame with Gi column:
           D1   D2   D3  Query  Count    idf       Gi
mount     1.0  0.0  1.0    1.0      2  0.398  0.66113
everest   1.0  0.0  0.0    0.0      1  0.602  1.00000
earth     0.5  0.0  0.0    0.0      1  0.602  1.00000
mountain  0.5  1.0  1.0    0.0      3  0.301  0.50000
kalsubai  0.0  1.0  0.0    1.0      1  0.602  1.00000
fuji      0.0  0.0  1.0    0.0      1  0.602  1.00000


In [None]:
# Working table without the intermediate 'Count' and 'idf' columns;
# `new_df` itself is left unchanged.
helper_columns = ['Count', 'idf']
new_df_copy = new_df.drop(columns=helper_columns)
print(new_df_copy)


           D1   D2   D3  Query       Gi
mount     1.0  0.0  1.0    1.0  0.66113
everest   1.0  0.0  0.0    0.0  1.00000
earth     0.5  0.0  0.0    0.0  1.00000
mountain  0.5  1.0  1.0    0.0  0.50000
kalsubai  0.0  1.0  0.0    1.0  1.00000
fuji      0.0  0.0  1.0    0.0  1.00000


In [None]:
import numpy as np

In [None]:
# Weight every column except 'Gi' by the keyword's Gi factor (row-wise).
# The unscaled values are read from `new_df`, which this cell never modifies,
# instead of multiplying `new_df_copy` by itself in place. Re-running the cell
# therefore yields the same table rather than applying Gi a second time.
scaled_columns = [column for column in new_df_copy.columns if column != 'Gi']
new_df_copy[scaled_columns] = new_df[scaled_columns].mul(new_df_copy['Gi'], axis=0)
print(new_df_copy)

               D1   D2       D3    Query       Gi
mount     0.66113  0.0  0.66113  0.66113  0.66113
everest   1.00000  0.0  0.00000  0.00000  1.00000
earth     0.50000  0.0  0.00000  0.00000  1.00000
mountain  0.25000  0.5  0.50000  0.00000  0.50000
kalsubai  0.00000  1.0  0.00000  1.00000  1.00000
fuji      0.00000  0.0  1.00000  0.00000  1.00000


In [None]:
# Remove the 'Gi' and 'Query' columns so that only the document columns remain.
columns_to_remove = ['Gi', 'Query']
new_df_copy = new_df_copy.drop(columns=columns_to_remove)
print(new_df_copy)

               D1   D2       D3
mount     0.66113  0.0  0.66113
everest   1.00000  0.0  0.00000
earth     0.50000  0.0  0.00000
mountain  0.25000  0.5  0.50000
kalsubai  0.00000  1.0  0.00000
fuji      0.00000  0.0  1.00000


In [None]:
# Select the weight rows of the query terms. The terms come from `query_terms`
# (the tokenised query) rather than a hard-coded list, so this cell follows any
# change to the query. Terms without a row in the table are skipped, since
# looking them up would raise a KeyError.
query_rows = [term for term in query_terms if term in new_df_copy.index]
query_term_values = new_df_copy.loc[query_rows]

print("Values for query terms:")
print(query_term_values)


Values for query terms:
               D1   D2       D3
mount     0.66113  0.0  0.66113
kalsubai  0.00000  1.0  0.00000


In [None]:
# Flip the table so that documents become rows and query terms become columns.
transposed_df = query_term_values.transpose()
transposed_df

Unnamed: 0,mount,kalsubai
D1,0.66113,0.0
D2,0.0,1.0
D3,0.66113,0.0
