# Load CFCorpus dataset

In [471]:
# store each doc in a seperated file
doc_id_dict = {}
raw_document = []

with open('./NFCorpus/test.docs', 'r') as f:
    lines = f.readlines()
    index = 0
    for line in lines:
        id, doc = line.split('\t')
        
        doc_id_dict[id] = index
        index += 1
        raw_document.append(doc)
        
    f.close()

# Preprocessing

In [472]:
import nltk
import os
from nltk.stem import PorterStemmer
import string
from nltk.corpus import stopwords


term_freq = {}# số lần xuất hiện của các term cả tài liệu Cranfield
doc_freq = {}# số tài liệu xuất hiện của các term mà ta xét
ps = PorterStemmer()
noise = string.punctuation + '0123456789'


nltk.download('stopwords')
stop_words = stopwords.words('english')

def preProcessing(content):
  # loại bỏ ký hiệu và số tách thành mãng
  words = content.translate(str.maketrans("", "", noise)).split()
  # stemming
  words = [ps.stem(i) for i in words]
  # loại bỏ stopwords
  words = [word for word in words if word not in stop_words]
  return words




[nltk_data] Downloading package stopwords to /Users/macos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [473]:
document = []
for doc in raw_document:
    document.append(preProcessing(doc))


In [474]:
document[0]

['statin',
 'breast',
 'cancer',
 'surviv',
 'nationwid',
 'cohort',
 'studi',
 'finland',
 'abstract',
 'recent',
 'studi',
 'suggest',
 'statin',
 'establish',
 'drug',
 'group',
 'prevent',
 'cardiovascular',
 'mortal',
 'delay',
 'prevent',
 'breast',
 'cancer',
 'recurr',
 'effect',
 'diseasespecif',
 'mortal',
 'remain',
 'unclear',
 'evalu',
 'risk',
 'breast',
 'cancer',
 'death',
 'statin',
 'user',
 'populationbas',
 'cohort',
 'breast',
 'cancer',
 'patient',
 'studi',
 'cohort',
 'includ',
 'newli',
 'diagnos',
 'breast',
 'cancer',
 'patient',
 'finland',
 'num',
 'num',
 'num',
 'case',
 'identifi',
 'finnish',
 'cancer',
 'registri',
 'inform',
 'statin',
 'diagnosi',
 'obtain',
 'nation',
 'prescript',
 'databas',
 'cox',
 'proport',
 'hazard',
 'regress',
 'method',
 'estim',
 'mortal',
 'statin',
 'user',
 'statin',
 'timedepend',
 'variabl',
 'total',
 'num',
 'particip',
 'statin',
 'median',
 'followup',
 'num',
 'year',
 'diagnosi',
 'rang',
 'num',
 'num',
 'year

# LSI MODEL

## Decomposition

In [475]:
#import modules
from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import TfidfModel
import matplotlib.pyplot as plt
import numpy as np


def prepare_corpus(doc_clean):
    """
    Input  : clean document
    Purpose: create term dictionary of our corpus and convert the list of documents (corpus) into Document Term Matrix using tf-idf weighting
    Output : term dictionary and Document Term Matrix
    """
    # Creating the term dictionary of our corpus, where every unique term is assigned an index.
    dictionary = corpora.Dictionary(doc_clean)

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

    # Apply tf-idf weighting to the Document Term Matrix
    tfidf = TfidfModel(doc_term_matrix)
    doc_term_matrix_tfidf = tfidf[doc_term_matrix]

    return dictionary, doc_term_matrix_tfidf

# Pass doc_clean as the parameter to prepare_corpus function
dictionary, doc_term_matrix = prepare_corpus(document)


In [476]:
# Convert doc_term_matrix into a dense numpy array
num_terms = len(dictionary)
num_docs = len(doc_term_matrix)
matrix = np.zeros((num_terms, num_docs), dtype=np.float32)  # Transpose the dimensions

for i, doc in enumerate(doc_term_matrix):
    for term_id, term_freq in doc:
        matrix[term_id, i] = term_freq

In [477]:
matrix.shape

(19424, 3162)

In [478]:
def svd_decomposition(matrix):
    """
    Perform Singular Value Decomposition (SVD) on the given matrix M.

    Parameters:
        matrix: Input matrix of shape (m x n)

    Returns:
        S: Matrix of left singular vectors of shape (m x n)
        sigma: Singular values as a 1-D array of length min(n, n)
        U: Transpose of the matrix of right singular vectors of shape (n x n)
    """
    # Perform SVD
    S, sigma, U_t = np.linalg.svd(matrix, full_matrices=False)


    return S, sigma, U_t

In [479]:
S, sigma, U_t = svd_decomposition(matrix)

## Find optimal k

In [480]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, stop, step):
        model = LsiModel(doc_term_matrix, num_topics= num_topics, id2word = dictionary)  # train model
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values

In [481]:
def plot_graph(doc_clean,start, stop, step):
    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    x = range(start, stop, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(("coherence_values"), loc='best')
    plt.show()

start,stop,step=100,1000,100
plot_graph(document, start, stop, step)

In [482]:
k = 450

## Reduce dimension

In [483]:
def reduce_dimension(S, sigma, UT, k):
    """
    Reduce the dimensionality of matrices U, S, and V based on a given value of k.

    Parameters:
        U: Matrix of left singular vectors of shape (m x n)
        S: Singular values as a 1-D array of length n
        V: Matrix of right singular vectors of shape (n x n)
        k: Number of singular values/vectors to retain

    Returns:
        U_red: Reduced matrix U of shape (m x k)
        S_red: Reduced array of singular values of length k
        V_red: Reduced matrix V of shape (k x n)
    """
    S_red = S[:, :k]
    sigma_red = sigma[:k]
    UT_red = UT[:k, :]
    return S_red, sigma_red, UT_red

In [484]:
S_red, sigma_red, UT_red = reduce_dimension(S, sigma, U_t, k)

# Calculate query matrix

In [485]:
import re
# store each doc in a seperated file
raw_query = {}

with open('./NFCorpus/test.all.queries', 'r') as f:
    lines = f.readlines()    
    for line in lines:
        id, query = line.split('\t')        
                
        raw_query[id] = query
        
    f.close()

In [486]:
# Sort the queries based on the numeric part of the IDs in ascending order
sorted_queries = [content for _, content in sorted(raw_query.items(), key=lambda x: int(re.findall(r'\d+', x[0])[0]))]

In [487]:
queries = []
for query in sorted_queries:        
    preProcess_query = preProcessing(query)
    query_new = " ".join(preProcess_query)
    queries.append(query_new)
    

In [499]:
queries[0]

'cholesterol statin drug caus breast cancer doe breast cancer cholesterol mani potenti mechan cholesterol boost breast cancer growth exampl bodi make estrogen correl increas risk breast cancer cholesterol also packag cholesterol ldl see video cholesterol feed breast cancer cell appear increas cancer prolifer decreas patient surviv cholesterol major compon “ lipid raft ” compar normal counterpart cancer cell higher level cholesterolrich lipid raft plasma membran may import cancer cell surviv may serv human cancer develop term tumor migrat invas elev level cholesterolrich lipid raft found breast cancer cell hypothesi reduc blood cholesterol level “ may disrupt lipid raft format therebi inhibit breast cancer develop ” thi suggest cholesterol target may use cancer therapi control laboratori experi shown phytosterol seed nut dietari relev level appear inhibit growth sever type tumor cell includ breast cancer cell includ estrogenreceptor neg estrogenreceptor posit cancer therapeut implic “ p

In [489]:
def cal_vector_K(S_red, sigma_red):
  vector_K = np.dot(S_red, np.diag(sigma_red))

  return vector_K

def cal_vector_D(sigma_red, UT_red):
  vector_D = np.dot(np.diag(sigma_red), UT_red)

  return vector_D

In [490]:
D = cal_vector_D(sigma_red, UT_red)
K = cal_vector_K(S_red, sigma_red)

In [491]:
def cal_vector_querry(query, K):
    query_terms = query.split()
    query_vector = np.zeros(K.shape[1])  # Initialize query vector as zeros

    for term in query_terms:
        if term in dictionary.token2id:
            term_index = dictionary.token2id[term]
            query_vector += K[term_index]

    return query_vector

In [492]:
list_q = []
for query in queries:
  q = cal_vector_querry(query, K)
  list_q.append(q)


# CALCULATE SIMILARITY

In [493]:
import numpy as np
from numpy.linalg import norm

def calculate_cosine_similarity(query, document):
    # Compute the dot product of the query and document vectors
    dot_product = np.dot(query, document)

    # Compute the norms of the query and document vectors
    query_norm = norm(query)
    document_norm = norm(document)

    # Compute the cosine similarity
    similarity = dot_product / (query_norm * document_norm)

    return similarity

In [494]:
predict_results = []
k = 12
for q_idx in range(0, len(list_q)):
  doc_similarity = []

  for doc_idx in range(0, len(document)):
    sim = calculate_cosine_similarity(list_q[q_idx], D[:, doc_idx])
    doc_similarity.append((doc_idx, sim))

  ranked_doc = sorted(doc_similarity, key=lambda x: x[1], reverse=True)
  top_k_docs = [item[0] for item in ranked_doc[:k]]

  predict_results.append(top_k_docs)
  # predict_results.append(ranked_doc)

In [495]:
len(predict_results)

323

# EVALUATING

In [496]:
result_file = "./NFCorpus/test.2-1-0.qrel"  # Replace with the actual path to your result file

query_doc_ids = {}
with open(result_file, "r") as file:
    for line in file:
        line = line.strip().split("\t")
        query_id = line[0]
        doc_id = line[2]
        if query_id in query_doc_ids:
            query_doc_ids[query_id].append(doc_id)
        else:
            query_doc_ids[query_id] = [doc_id]

In [497]:
real_results = []
for res in query_doc_ids:
    converted_doc_id = []    
    for doc_id in query_doc_ids[res]:
        converted_doc_id.append(doc_id_dict[doc_id])
    
    real_results.append(converted_doc_id)

In [498]:
# Kiểm tra tài liều đó có nằm trong tài liệu thực không truy vẫn thực không ---> danh sách true false của tài liệu truy vấn của mình(OK)
# tính từng cặp r và p ----> bảng r p
# tính AP của từng câu truy vấn
# cuối cùng tính MAP
# lấy dữ liệu RES để tính MAP nhưng chưa xong--------


#Hàm lấy index true
def index_true(lis):
  return [i for i in range(len(lis)) if lis[i] == True]

# Hàm này dùng để tính R và P của các câu truy vấn
def RP(real_results, predict_results):

#Tính R và P của từng câu truy vấn
  check_results = [[predict_results[i][j] in real_results[i] for j in range(len(predict_results[i]))] for i in range(len(predict_results))]# kiểm tra xem câu tìm được có đúng kết quả không
  index_true_results = [index_true(i) for i in check_results]#đây là index của các dự đoán đúng dùng để tính độ chính xác Precision
  len_results = [len(i) for i in real_results]#số kết quả "thực" tìm được của mỗi câu truy vấn để tính độ phủ Recall
  R_P_results = [[((j+1)/len_results[i], (j+1)/(index_true_results[i][j]+1)) for j in range(len(index_true_results[i]))] for i in range(len(index_true_results))]#kết quả R và P của mỗi câu truy vấn
  return R_P_results

def Recall(real_results, predict_results):
  check_results = [[predict_results[i][j] in real_results[i] for j in range(len(predict_results[i]))] for i in range(len(predict_results))]# kiểm tra xem câu tìm được có đúng kết quả không
  len_results = [len(i) for i in real_results]#số kết quả "thực" tìm được của mỗi câu truy vấn
  recall = [(check_results[i].count(True))/len_results[i] for i in range(len(check_results)) if check_results[i]]
  return sum(recall)/len(real_results)

def Precision(real_results, predict_results):
  check_results = [[predict_results[i][j] in real_results[i] for j in range(len(predict_results[i]))] for i in range(len(predict_results))]# kiểm tra xem câu tìm được có đúng kết quả không
  len_predict = [len(i) for i in predict_results]#số kết quả "dự đoán" tìm được của mỗi câu truy vấn
  precision = [(check_results[i].count(True))/len_predict[i] for i in range(len(check_results)) if check_results[i]]
  return sum(precision)/len(real_results)

R_P_results = RP(real_results, predict_results)



# đây là 11 điểm nội suy của TREC
R = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

#Tính AP của từng câu truy vấn từ R_P_results
# P nội suy là Max(Pr') trong đó r'>=r(đang xét)

#hàm check_r dùng để kiểm tra và lấy ra index của r trong rp
# Lấy index của R đang xét để tìm giá trị P max từ index đó trở về sau
def check_r(r, rp):
  for i in range(len(rp)):
    if rp[i][0]>=r:
      return i
  return -1
# Lấy P max
def max_P(check, rp):
  maxx = 0
  for i in rp[check:]:
    if i[1]>maxx:
      maxx = i[1]
  return maxx


def MAP_11(R_P_results, R):

  AP_results = []
  for i in range(len(R_P_results)):
    ap = 0
    for j in range(11):
      Check = check_r(R[j],R_P_results[i])
      if Check != -1:
        ap = ap + max_P(Check,R_P_results[i])
    AP_results.append(ap/11)

  MAP = sum(AP_results)/225
  return MAP

MAP_11(R_P_results, R), Recall(real_results, predict_results), Precision(real_results, predict_results)


(0.0811179325724781, 0.08044672827609692, 0.13235294117647062)