In [None]:
import warnings
warnings.filterwarnings('ignore')
import pyterrier as pt

if not pt.java.started():
    pt.java.init()

import os
import torch
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', False)


dataset_name = 'msmarco-passage'
eval_ds_name = 'dev'
dataset = pt.get_dataset(f'irds:{dataset_name}')
eval_dataset = pt.get_dataset(f'irds:{dataset_name}/{eval_ds_name}')
topics = eval_dataset.get_topics()
qrels = eval_dataset.get_qrels()

work_name = "retrievability-bias"
root_dir = f'/root/{work_name}'
nfs_save = f'/nfs/datasets/cxj/{work_name}'
if not os.path.exists(nfs_save):
    os.makedirs(nfs_save)

In [None]:
# !pip install transformers sentence-transformers

In [108]:
topics.head(2)

Unnamed: 0,qid,query
0,1048578,cost of endless pools swim spa
1,1048579,what is pcnt


In [None]:
df = pd.DataFrame(dataset.get_corpus_iter(verbose=True))

In [None]:
topics.columns.to_list()

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

In [None]:
# Load the model and tokenizer
model_name = "facebook/contriever-msmarco"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)  # Move model to GPU

In [None]:
# Function to encode text into dense vectors
# def encode_texts(texts):
#     # Tokenize and move input tensors to GPU
#     inputs = tokenizer.batch_encode_plus(texts, padding=True, truncation=True, return_tensors="pt").to(device)
#     print(type(inputs))
#     print(inputs)
#     with torch.no_grad():
#         # Generate embeddings and move them back to CPU for further processing
#         embeddings = model(**inputs).last_hidden_state.mean(dim=1).cpu()
#     return embeddings

In [None]:
def calc_embeddings(inputs):
    # Tokenize the inputs
    tokenized_inputs = tokenizer(
        inputs,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)  # Move tokenized inputs to GPU
    
    # print(tokenized_inputs)
    # Compute embeddings
    with torch.no_grad():
        model_output = model(**tokenized_inputs)
        token_embeddings = model_output.last_hidden_state 
    
    # Aggregate embeddings (e.g., mean pooling)
    attention_mask = tokenized_inputs["attention_mask"]  # To ignore padding tokens in the aggregation
    masked_embeddings = token_embeddings * attention_mask.unsqueeze(-1)  # Apply attention mask
    sum_embeddings = masked_embeddings.sum(dim=1)  # Sum over the sequence length
    sum_mask = attention_mask.sum(dim=1).unsqueeze(-1)  # Count non-padding tokens per sequence
    inputs_embeddings = sum_embeddings / sum_mask  # Mean pooling: divide by token counts
    
    # Display final embeddings (shape: batch_size, hidden_dim)
    print(inputs_embeddings.shape)  # Example: torch.Size([3, 768])
    # print(inputs_embeddings)  # Example: torch.Size([3, 768])
    return inputs_embeddings

In [None]:
# Define query and documents
queries = ["What is the capital of France?", "What is the capital of the UK?", "What is the capital of China?", "What is the capital of the United States?"]
documents = [
    "Paris is the capital city of France.",
    "France is a country in Europe.",
    "Berlin is the capital of Germany.",
    "Madrid is the capital of Spain.",
    "Paris is known for the Eiffel Tower.",
    "The Louvre is located in Paris, France.",
    "London is the capital of the United Kingdom.",
    "Rome is the capital of Italy.",
    "Paris is famous for its cuisine.",
    "The French language is spoken in Paris.",
]

In [None]:
queries = topics[:10]['query'].to_list()
documents = df[:100]['text'].to_list()

In [99]:
queries

['cost of endless pools swim spa',
 'what is pcnt',
 'what is pcb waste',
 'what is pbis',
 'what is paysky',
 'what is paydata',
 'what is pay range for warehouse specialist in minneapolis',
 'what is paula deen s brother',
 'what is paul gum disease',
 'what is patron']

In [None]:
# Convert to numpy (required by scikit-learn)

query_embeddings = calc_embeddings(queries).cpu().numpy()
document_embeddings = calc_embeddings(documents).cpu().numpy()

# Compute cosine similarities (shape: num_queries x num_docs)
cos_sim_matrix = cosine_similarity(query_embeddings, document_embeddings)

# Display the similarity matrix
# print("Cosine Similarity Matrix:")
# print(cos_sim_matrix)

# Retrieve the top-k documents for each query
top_k = 10
for i, query_similarities in enumerate(cos_sim_matrix):
    top_indices = query_similarities.argsort()[-top_k:][::-1]
    print(f"Query {i}: {queries[i]}")
    print(f"Top {top_k} documents for Query {i}:")
    top_scores = [query_similarities[idx] for idx in top_indices]
    print(top_scores)
    for idx in top_indices:
        print(f"  Document {idx}: Score = {query_similarities[idx]:.4f}")

In [None]:
for i in range(0, topics.shape[0],batch_size):
    queries = topics[i: i+batch_size]['text'].to_list():
    sub_df = df['text'].to_list()

In [102]:
df[0:5]['text'].to_list()

Unnamed: 0,text,docno
0,The presence of communication amid scientific minds was equally important to the success of the Manhattan Project as scientific intellect was. The only cloud hanging over the impressive achievement of the atomic researchers and engineers is what their success truly meant; hundreds of thousands of innocent lives obliterated.,0
1,The Manhattan Project and its atomic bomb helped bring an end to World War II. Its legacy of peaceful uses of atomic energy continues to have an impact on history and science.,1
2,Essay on The Manhattan Project - The Manhattan Project The Manhattan Project was to see if making an atomic bomb possible. The success of this project would forever change the world forever making it known that something this powerful can be manmade.,2
3,"The Manhattan Project was the name for a project conducted during World War II, to develop the first atomic bomb. It refers specifically to the period of the project from 194 … 2-1946 under the control of the U.S. Army Corps of Engineers, under the administration of General Leslie R. Groves.",3
4,"versions of each volume as well as complementary websites. The first website–The Manhattan Project: An Interactive History–is available on the Office of History and Heritage Resources website, http://www.cfo. doe.gov/me70/history. The Office of History and Heritage Resources and the National Nuclear Security",4


In [107]:
df.loc[4,'docno']

'4'

In [None]:
# batch_size = 4
# for i in range(0, len(queries), batch_size):
#     batch = queries[i:i+batch_size]
#     tokenized_batch = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
#     print(tokenized_batch)

In [None]:
# # Encode query and documents
# query_embedding = encode_texts([query])  # Query embedding
# document_embeddings = encode_texts(documents)  # Document embeddings

# # Compute cosine similarity
# cos_sim = cosine_similarity(query_embedding, document_embeddings)
# print(cos_sim.shape)
# print(cos_sim)

# # Rank documents by similarity
# top_k = 10
# top_indices = cos_sim[0].argsort()[-top_k:][::-1]
# # print(type(top_indices))
# # print(top_indices.shape)
# # print(cos_sim[0].argsort())
# # print(cos_sim[0].argsort()[-top_k:])
# # print(cos_sim[0].argsort()[-top_k:][::-1])


# print("Top 10 results:")
# for idx in top_indices:
#     print(f"{documents[idx]} (Score: {cos_sim[0][idx]:.4f})")
    # print(cos_sim[0].shape)
    # print(f"{documents[idx]} Score: {cos_sim[0][idx]}")

In [None]:
print(query_embedding.shape)

In [None]:
import numpy as np
a = pd.DataFrame(np.array([[1, 2], [3, 4], [5, 6], [7, 8]]), columns=['x', 'y'], dtype=float)
print('===a===')
print(a)
b = pd.DataFrame(np.array([[10, 20], [30, 40]]), columns=['x', 'y'], dtype=float)

print('===b===')
print(b)
# 不重置索引,上下拼接
# df = pd.concat([a, b], axis=0, join='inner', ignore_index=True)
# print('===df===')
# print(df)
# m,n = a.shape
# m0,n0 = b.shape

from sklearn.metrics.pairwise import cosine_similarity

r = cosine_similarity(b, a)



In [None]:
print(type(r))
print(r.shape)
print(r)

In [112]:
# Function to encode text into dense vectors
def encode_texts(texts):
    # Tokenize and move input tensors to GPU
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        # Generate embeddings and move them back to CPU for further processing
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).cpu()
    return embeddings

# Define query and documents
query = "What is the capital of France?"
documents = [
    "Paris is the capital city of France.",
    "France is a country in Europe.",
    "Berlin is the capital of Germany.",
    "Madrid is the capital of Spain.",
    "Paris is known for the Eiffel Tower.",
    "The Louvre is located in Paris, France.",
    "London is the capital of the United Kingdom.",
    "Rome is the capital of Italy.",
    "Paris is famous for its cuisine.",
    "The French language is spoken in Paris.",
]

# Encode query and documents
query_embedding = encode_texts([query])  # Query embedding
document_embeddings = encode_texts(documents)  # Document embeddings

# Compute cosine similarity
cos_sim = cosine_similarity(query_embedding, document_embeddings)

# Rank documents by similarity
top_k = 10
top_indices = cos_sim[0].argsort()[-top_k:][::-1]

# Display top-k results
print("Top 10 results:")
for idx in top_indices:
    print(f"{documents[idx]} (Score: {cos_sim[0][idx]:.4f})")

Top 10 results:
Paris is the capital city of France. (Score: 0.8204)
The Louvre is located in Paris, France. (Score: 0.5938)
Paris is known for the Eiffel Tower. (Score: 0.5878)
London is the capital of the United Kingdom. (Score: 0.5868)
The French language is spoken in Paris. (Score: 0.5623)
France is a country in Europe. (Score: 0.5574)
Rome is the capital of Italy. (Score: 0.5088)
Berlin is the capital of Germany. (Score: 0.5031)
Paris is famous for its cuisine. (Score: 0.5025)
Madrid is the capital of Spain. (Score: 0.4875)
