In [8]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
# nltk.download('stopwords')
# nltk.download('punkt')

# Load the document corpus and the two question sets.
document_path = '/kaggle/input/ir-hw1/documents_data.csv'
train_path = '/kaggle/input/ir-hw1/train_question.csv'
test_path = '/kaggle/input/ir-hw1/test_question.csv'

# For a quick smoke run, add nrows=... to the documents read or slice test_df.
documents_df, train_df, test_df = (
  pd.read_csv(csv_path, sep=',')
  for csv_path in (document_path, train_path, test_path)
)

print("document from:", document_path)
print("train from:", train_path)
print("test from:", test_path)

# Text preprocessing: English stop words plus HTML tag names left over once
# the angle brackets have been stripped from the markup.
stop_words = set(stopwords.words('english')) | {
  "h1", "h2", "h3", "h4", "h5", "td", "tr", "li", "ul", "table", "p",
}

def preprocess(text, stop_words):
  """Lowercase, strip punctuation, tokenize and drop stop words.

  Args:
    text: Raw text (document HTML or a question). Missing values (``None`` or
      float NaN, which pandas produces for empty CSV cells) yield an empty
      token list; any other non-string value is converted with ``str()``.
    stop_words: Container of lowercase tokens to discard.

  Returns:
    List of the remaining tokens, in order of appearance.
  """
  if not isinstance(text, str):
    # NaN is the only float that is not equal to itself; it has no .lower().
    if text is None or (isinstance(text, float) and text != text):
      return []
    text = str(text)

  text = text.lower()
  # \W+ turns every run of non-word characters into one space; note that
  # underscores count as word characters and are therefore kept.
  text = re.sub(r'\W+', ' ', text)
  tokens = word_tokenize(text)
  return [token for token in tokens if token not in stop_words]

# Tokenize every text column once; extra positional args are forwarded to preprocess.
documents_df['tokens'] = documents_df['Document_HTML'].apply(preprocess, args=(stop_words,))
train_df['tokens'] = train_df['Question'].apply(preprocess, args=(stop_words,))
test_df['tokens'] = test_df['Question'].apply(preprocess, args=(stop_words,))

print("preprocess finish")

# print("documents_df : \n", documents_df, "\n")
# print("train_df : \n", train_df, "\n")
# print("test_df : \n", test_df, "\n")

# Build the vocabulary
def build_vocab(documents):
  """Map each distinct token to a consecutive integer id.

  Ids start at 0 and follow first-seen order across ``documents``, an
  iterable of token lists.
  """
  token_to_id = {}
  for tokens in documents:
    for term in tokens:
      # len() is evaluated before insertion, so a new term gets the next free id.
      token_to_id.setdefault(term, len(token_to_id))
  return token_to_id

# The vocabulary covers the documents plus the training questions.
combined_tokens = documents_df['tokens'].tolist() + train_df['tokens'].tolist()
vocab = build_vocab(combined_tokens)

# print("vocab : \n", vocab, "\n")

# Compute term frequency
def compute_tf(doc_tokens, vocab):
  """Return a dense raw-count vector for ``doc_tokens``.

  The result has one float slot per vocabulary entry, indexed by the ids in
  ``vocab``; tokens missing from ``vocab`` are ignored.
  """
  counts = np.zeros(len(vocab))
  for term, freq in Counter(doc_tokens).items():
    position = vocab.get(term)
    if position is not None:
      counts[position] = freq
  return counts

# Compute inverse document frequency
def compute_idf(doc_tokens_list, vocab):
  """Return the smoothed IDF weight of every vocabulary entry.

  For a term appearing in ``df`` of the ``N`` token lists the weight is
  ``log((N - df + 0.5) / (df + 0.5) + 1)``. The result is indexed by the
  ids in ``vocab``.
  """
  total_docs = len(doc_tokens_list)
  doc_freq = np.zeros(len(vocab))
  for doc in doc_tokens_list:
    # A term counts once per document no matter how often it repeats.
    for term in set(doc).intersection(vocab):
      doc_freq[vocab[term]] += 1
  ratio = (total_docs - doc_freq + 0.5) / (doc_freq + 0.5)
  return np.log(ratio + 1)

# Document frequencies are counted over the documents only, while vocab also
# holds tokens that occur only in the training questions (their count stays 0).
idf_vector = compute_idf(documents_df['tokens'], vocab)

document from: /kaggle/input/ir-hw1/documents_data.csv
train from: /kaggle/input/ir-hw1/train_question.csv
test from: /kaggle/input/ir-hw1/train_question.csv
preprocess finish


In [9]:
#####################
# For BM25
# BM25 score computation
def bm25(query, doc_tokens, idf_vector, vocab, avg_doc_len, k1=1.5, b=0.75):
  """Score one document against a query with BM25.

  Only the distinct query terms are visited, so the cost per document does
  not grow with the vocabulary size.

  Args:
    query: Iterable of query tokens. Repeated tokens count once; tokens
      missing from ``vocab`` are ignored.
    doc_tokens: List of the document's tokens.
    idf_vector: Sequence of IDF weights indexed by the ids in ``vocab``.
    vocab: Mapping from token to integer id.
    avg_doc_len: Mean document length in tokens. A value of 0 is treated as
      "no length normalization" instead of raising ZeroDivisionError.
    k1: Term-frequency saturation parameter.
    b: Length-normalization strength.

  Returns:
    The BM25 score as a float (0.0 when no query term occurs in the document).
  """
  doc_counts = Counter(doc_tokens)
  # Visit terms in vocabulary-id order so the floating-point sum is built in
  # the same order as a scan over the whole vocabulary would build it.
  query_terms = sorted({term for term in query if term in vocab}, key=vocab.__getitem__)

  length_ratio = len(doc_tokens) / avg_doc_len if avg_doc_len else 0.0
  length_norm = k1 * (1 - b + b * length_ratio)

  bm25_score = 0.0
  for term in query_terms:
    tf = doc_counts[term]  # Counter yields 0 for terms absent from the document
    if tf == 0:
      continue  # a zero term frequency contributes nothing to the score
    bm25_score += idf_vector[vocab[term]] * ((tf * (k1 + 1)) / (tf + length_norm))
  return bm25_score

# Score every document with BM25 and return the three best matches
def get_top_3_similar_docs_bm25(documents_df, query_tokens, vocab, idf_vector, avg_doc_len):
    """Return the 'Document ID's of the three highest-scoring documents.

    Each row of ``documents_df`` is scored against ``query_tokens`` with
    ``bm25``. The ids are printed and returned as one space-separated string,
    best match first.
    """
    scores = np.array([
        bm25(query_tokens, doc_tokens, idf_vector, vocab, avg_doc_len)
        for doc_tokens in documents_df['tokens']
    ])
    best_positions = np.argsort(-scores)[:3]  # negate so the sort is descending
    best_ids = documents_df['Document ID'].iloc[best_positions].values
    print(best_ids)
    return " ".join(str(doc_id) for doc_id in best_ids)

# Find the most similar documents for every test question
total_doc_tokens = sum(map(len, documents_df['tokens']))
avg_doc_len = total_doc_tokens / len(documents_df)
test_df['answer_bm25'] = [
  get_top_3_similar_docs_bm25(documents_df, question_tokens, vocab, idf_vector, avg_doc_len)
  for question_tokens in test_df['tokens']
]

# Save the results as CSV
output_df = test_df[['Question ID', 'answer_bm25']].rename(
  columns={'Question ID': 'index', 'answer_bm25': 'answer'}
)

output_df.to_csv('output_bm25.csv', index=False)

[21 11  1]
[22 39 14]
[23 38 29]
[24 40 15]
[25 15 16]
[26 33 12]
[27 23 29]
[28 12 26]
[29  5 14]
[30 38  4]
[31  1 38]
[32 17  2]
[33  9 10]
[34  4 22]
[35 25 33]
[36 32 33]
[37 18 25]
[38  2 30]
[39 22 20]
[40 30 38]


In [10]:
########################
# For Vector Model
# Turn a token list into a vector
def compute_tfidf(doc_tokens, vocab, idf_vector):
  """Return the TF-IDF vector of ``doc_tokens``.

  The raw term counts from ``compute_tf`` are multiplied element-wise by
  ``idf_vector``.
  """
  return compute_tf(doc_tokens, vocab) * idf_vector

# Dense TF-IDF matrices: one row per document / test question and one column
# per vocabulary entry.
document_tfidf_vectors = np.array([compute_tfidf(doc, vocab, idf_vector) for doc in documents_df['tokens']])
test_tfidf_vectors = np.array([compute_tfidf(q, vocab, idf_vector) for q in test_df['tokens']])

# Similarity computation
def cosine_similarity(vec1, vec2):
  """Return the cosine of the angle between ``vec1`` and ``vec2``.

  Returns 0.0 when either vector has zero length, so an all-zero vector
  never produces a division by zero.
  """
  norms = (np.linalg.norm(vec1), np.linalg.norm(vec2))
  if 0 in norms:
    return 0.0
  return np.dot(vec1, vec2) / (norms[0] * norms[1])

def get_top_3_similar_docs_tfidf(test_tfidf_vectors, document_tfidf_vectors, documents_df):
  """Rank documents by cosine similarity for every query vector.

  For each row of ``test_tfidf_vectors`` the three most similar rows of
  ``document_tfidf_vectors`` are found, their 'Document ID's are printed,
  and a space-separated id string (best match first) is collected.

  Returns:
    List with one id string per query, in input order.
  """
  doc_ids = documents_df['Document ID']
  answers = []
  for query_vec in test_tfidf_vectors:
    scores = np.array([cosine_similarity(query_vec, doc_vec) for doc_vec in document_tfidf_vectors])
    best_positions = np.argsort(-scores)[:3]  # negate so the sort is descending
    best_ids = doc_ids.iloc[best_positions].values
    print(best_ids)
    answers.append(" ".join(str(doc_id) for doc_id in best_ids))
  return answers

# Rank documents for every test question with the TF-IDF vector model
test_df['answer_tfidf'] = get_top_3_similar_docs_tfidf(
  test_tfidf_vectors, document_tfidf_vectors, documents_df
)

# Write the results
output_df = test_df[['Question ID', 'answer_tfidf']].rename(
  columns={'Question ID': 'index', 'answer_tfidf': 'answer'}
)

output_df.to_csv('output_tfidf.csv', index=False)

[21 11 19]
[22 39 14]
[23 38 17]
[24 34 40]
[25 16 15]
[26 14 39]
[27 24 23]
[28 13 12]
[29  5 30]
[30 23 38]
[31 38 14]
[32 17  7]
[33 10 18]
[34  4 16]
[35 38 25]
[36 38 14]
[37 28 26]
[24 38 30]
[39 22  1]
[40 20 38]
