In [6]:
import os
from collections import defaultdict
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Set directory containing the documents
directory = "/path/to/documents"

# Initialize stemmer
stemmer = PorterStemmer()

# Initialize dictionary to store term counts for each document
doc_term_counts = defaultdict(lambda: defaultdict(int))

# Initialize set to store unique terms
unique_terms = set()

# Loop through each file in the directory
for filename in os.listdir(directory):
    with open(os.path.join(directory, filename), "r") as f:
        # Tokenize the document
        tokens = word_tokenize(f.read())
        
        # Stem each token and update the term counts for the document
        for token in tokens:
            stemmed_token = stemmer.stem(token.lower())
            doc_term_counts[filename][stemmed_token] += 1
            unique_terms.add(stemmed_token)

# Sort the unique terms in alphabetical order
sorted_terms = sorted(list(unique_terms))

# Create the document by term matrix
doc_term_matrix = []
for doc in doc_term_counts:
    row = [doc_term_counts[doc][term] for term in sorted_terms]
    doc_term_matrix.append(row)


FileNotFoundError: [WinError 3] The system cannot find the path specified: '/path/to/documents'

In [1]:
import numpy as np

def modify_query(Q, R, NR, alpha, beta):
    # compute the centroid vectors for relevant and non-relevant documents
    R_centroid = np.mean(R, axis=0)
    NR_centroid = np.mean(NR, axis=0)
    
    # update the query vector using Rocchio's formula
    Q_new = alpha * Q + beta * R_centroid - beta * NR_centroid
    
    return Q_new


In [2]:
Q = np.array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0])
R = np.array([[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
              [1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
              [0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0],
              [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]])
NR = np.array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
              [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
              [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0]])

alpha = 0.5
beta = 0.25

Q_new = modify_query(Q, R, NR, alpha, beta)

print("Original query: ", Q)
print("New query: ", Q_new)


Original query:  [0 0 0 0 0 1 0 0 0 0 1 2 0 0 0]
New query:  [ 0.125      -0.02083333 -0.10416667  0.0625      0.0625      0.52083333
  0.0625      0.0625      0.0625      0.0625      0.47916667  0.89583333
  0.          0.          0.0625    ]


In [None]:
import numpy as np

def compute_cosine_similarity(dtm, q):
    # Compute the dot product of the document-term matrix and the query vector
    dot_product = np.dot(dtm, q)

    # Compute the Euclidean length of the document-term matrix and the query vector
    dtm_norm = np.sqrt(np.sum(dtm ** 2, axis=1))
    q_norm = np.sqrt(np.sum(q ** 2))

    # Compute the cosine similarity between the document-term matrix and the query vector
    cosine_sim = dot_product / (dtm_norm * q_norm)

    return cosine_sim

# Compute the ranked list of returned documents for Q
q = np.array([0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 3, 0, 0, 0])
cosine_sim = compute_cosine_similarity(dtm, q)
ranked_list_Q = np.argsort(cosine_sim)[::-1]

# Compute the ranked list of returned documents for Q1
q1 = np.array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1])
cosine_sim = compute_cosine_similarity(dtm, q1)
ranked_list_Q1 = np.argsort(cosine_sim)[::-1]


In [3]:
import numpy as np

def compute_t2t_association_matrix(doc_term_matrix):
    # compute the transpose of document-term matrix
    term_doc_matrix = doc_term_matrix.T
    
    # compute the term-term matrix
    t2t_matrix = term_doc_matrix.dot(doc_term_matrix)
    
    # compute the diagonal matrix of term frequency
    diag_tf = np.diag(np.sum(doc_term_matrix, axis=0))
    
    # compute the Jaccard coefficient
    jaccard_coeff = np.divide(t2t_matrix, diag_tf + diag_tf.T - t2t_matrix)
    
    # set NaN and diagonal elements to 0
    jaccard_coeff = np.nan_to_num(jaccard_coeff)
    np.fill_diagonal(jaccard_coeff, 0)
    
    # normalize the matrix
    norm_factor = np.linalg.norm(jaccard_coeff, axis=1, keepdims=True)
    norm_jaccard_coeff = np.divide(jaccard_coeff, norm_factor)
    
    return norm_jaccard_coeff


In [4]:
# example document-term matrix
doc_term_matrix = np.array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                            [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                            [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
                            [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
                            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0],
                            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]])

# compute term-by-term association matrix
t2t_matrix = compute_t2t_association_matrix(doc_term_matrix)

# display the result
print(t2t_matrix)


[[ 0.          0.          0.          0.         -1.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.         -0.57735027 -0.57735027  0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.         -0.57735027]
 [ 0.         -0.70710678  0.         -0.70710678  0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.         -0.70710678 -0.70710678  0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [-1.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.
  -0.4472136  -0.4472136  -0.4472136  -0.4472136  -0.4472136   0.
   0.        

  jaccard_coeff = np.divide(t2t_matrix, diag_tf + diag_tf.T - t2t_matrix)


In [5]:
import heapq

def expand_query(S, q, n):
    expanded_query = set(q)
    for term in q:
        # Get the n most similar terms to the current term
        similarities = S[term]
        most_similar = heapq.nlargest(n, similarities, key=similarities.get)
        expanded_query.update(most_similar)
    return list(expanded_query - set(q))

