## cos_rank.ipynb
This notebook implements the following tasks:
1. Loads precomputed document and query embeddings from pickle files.
2. Computes cosine similarity between queries and documents using `sklearn.metrics.pairwise.cosine_similarity`.
3. Retrieves the top 10 most relevant documents for each query based on similarity scores.
4. Prepares a submission file in CSV format containing query IDs and the corresponding top document IDs.

### Output
- A submission file (`submission.csv`) is generated, containing query IDs and the top 10 document IDs for each query.

In [None]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the precomputed document embeddings from disk.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('../pkl/m3_chunk_128/m3_chunk_128_embedding.pkl', 'rb') as f:
    doc_embeddings_dict = pickle.load(f)

# Fix the key order once, then build the matrix in that same order so that
# row i of doc_embeddings belongs to doc_ids[i].
doc_ids = [*doc_embeddings_dict.keys()]
doc_embeddings = np.array([doc_embeddings_dict[key] for key in doc_ids])
# The matrix now holds the vectors; drop the reference to the loaded mapping.
del doc_embeddings_dict

In [None]:
# Load the precomputed query embeddings from disk.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('../pkl//m3_chunk_128/m3_query_embedding.pkl', 'rb') as f:
    query_embeddings_dict = pickle.load(f)

# Fix the key order once, then build the matrix in that same order so that
# row i of query_embeddings belongs to query_ids[i].
query_ids = [*query_embeddings_dict.keys()]
query_embeddings = np.array([query_embeddings_dict[key] for key in query_ids])
# The matrix now holds the vectors; drop the reference to the loaded mapping.
del query_embeddings_dict

In [None]:
# Pairwise cosine similarity: one row per query, one column per document.
similarity_scores = cosine_similarity(X=query_embeddings, Y=doc_embeddings)

In [None]:
# similarity_scores.shape -> (2000, 268022), i.e. (number of queries, number of documents)

In [None]:
# Number of documents to keep for each query
top_k = 10
# argsort orders ascending, so the scores are negated to rank every row from
# highest to lowest similarity; the first top_k column indices are then kept.
retrieved_docs = (-similarity_scores).argsort(axis=1)[:, :top_k]
retrieved_docs.shape

In [None]:
# Inspect the ranked document indices retrieved for the first query
retrieved_docs[0]

In [None]:
# Translate each query's ranked column indices back into document IDs
top_k_doc_ids = [[doc_ids[col] for col in ranked_row] for ranked_row in retrieved_docs]

In [None]:
# Build one submission row per query, in the order of query_ids.
#
# 'id' is the running row position (0, 1, 2, ...). It is produced by the
# comprehension instead of a hand-maintained global named `id`, which would
# shadow the builtin id() for every later cell in the notebook.
#
# 'docids' is the text of a Python list: the IDs are joined with spaces,
# split again on whitespace, and the resulting list is passed to str(), giving
# e.g. "['doc_a', 'doc_b']". It is NOT a space-separated string. The IDs must
# be str, otherwise str.join raises TypeError.
submission = [
    {
        'id': row_id,
        'docids': str(' '.join(top_k_doc_ids[row_id]).split()),
    }
    for row_id in range(len(query_ids))
]

# Write submission.csv. The column order is pinned so the header is still
# emitted when there are no queries.
submission_df = pd.DataFrame(submission, columns=['id', 'docids'])
submission_df.to_csv('submission.csv', index=False)