In [2]:
# !pip install torch
# !pip install transformers

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import os
import glob
import pandas as pd

In [4]:
# Turn off the tokenizers library's own parallelism before any tokenizer is created.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The tokenizer and the classifier are both loaded from the same checkpoint name.
model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
tokenizer_name = "amberoad/bert-multilingual-passage-reranking-msmarco"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Load the sequence classifier, move it to the chosen device and put it in eval mode.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)
model = model.eval()

In [5]:
def get_bert_scores(queries_to_docnos, docs_dict, queries_dict):
    """Score each (query, document) pair with the sequence-classification model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        queries_to_docnos: mapping of query id -> iterable of document ids to
            score against that query.
        docs_dict: mapping of document id -> document text.
        queries_dict: mapping of query id -> query text.

    Returns:
        dict mapping document id -> float score. When the model emits more than
        one logit, the score is the log-softmax value of the last class;
        with a single logit it is the raw logit. The result is keyed by
        document id only, so a document id listed under several queries keeps
        the score of the last query processed.

    Raises:
        KeyError: if a query id or document id is missing from its dict.
    """
    scores_dict = {}
    for qid in tqdm(queries_to_docnos, desc="BERT", total=len(queries_to_docnos)):
        for docno in queries_to_docnos[qid]:
            query_text = queries_dict[qid]
            doc_text = docs_dict[docno]
            # Encode the pair as one sequence, truncated to 512 tokens.
            ret = tokenizer.encode_plus(query_text,
                                        doc_text,
                                        max_length=512,
                                        truncation=True,
                                        return_token_type_ids=True,
                                        return_tensors='pt')

            # Inference only: no_grad stops autograd from recording a graph for
            # every pair, which the original code built and then threw away.
            # Autocast stays explicitly disabled, as before.
            with torch.no_grad(), torch.cuda.amp.autocast(enabled=False):
                input_ids = ret['input_ids'].to(device)
                tt_ids = ret['token_type_ids'].to(device)
                output, = model(input_ids, token_type_ids=tt_ids, return_dict=False)
                if output.size(1) > 1:
                    # Multi-class head: log-probability of the last class.
                    score = torch.nn.functional.log_softmax(output, 1)[0, -1].item()
                else:
                    # Single-logit head: use the logit itself.
                    score = output.item()

            scores_dict[docno] = score

    return scores_dict

In [12]:
# Read every CSV file in the working directory into a single DataFrame.
# glob returns paths in arbitrary, filesystem-dependent order; sorting them keeps
# the row order of `df` (and any later keep-first de-duplication) reproducible.
all_files = sorted(glob.glob(os.path.join("./", "*.csv")))
if not all_files:
    # pd.concat on an empty sequence fails with a generic message; say what is missing.
    raise FileNotFoundError("No CSV files found matching './*.csv'")
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [20]:
# Show the first row without the _id, bonus and edittion_time columns.
df.iloc[:1].drop(columns=["_id", "bonus", "edittion_time"])

Unnamed: 0,current_document,description,docno,group,position1,position2,position3,posted_document,query,query1,query2,query3,query_id,ranker,round_no,score1,score2,score3,username,variant
0,Folk remedies for soothing a sore throat have ...,What folk remedies are there for soothing a so...,ROUND01261C8DJE1B,C,20,30,20,Folk remedies for soothing a sore throat have ...,folk remedies sore throat,folk remedies sore throat,sore throat,sore throat herbal remedy,261,BERT,1,426201586379,607877588272,878544100000.0,8DJE1B,True


In [34]:
def create_data_structures(df):
    """Build the three lookup structures used for scoring from a results frame.

    Args:
        df: DataFrame with ``query_id``, ``query1``, ``docno`` and
            ``current_document`` columns.

    Returns:
        A tuple ``(queries_dict, docs_dict, queries_to_docnos)``:
        ``queries_dict`` maps query_id -> query1 and ``docs_dict`` maps
        docno -> current_document (for repeated keys the last row wins);
        ``queries_to_docnos`` maps each query_id to a list of its distinct
        docnos, in no particular order.
    """
    queries_dict = {qid: text for qid, text in zip(df.query_id, df.query1)}
    docs_dict = {docno: body for docno, body in zip(df.docno, df.current_document)}

    queries_to_docnos = {}
    for qid in set(df.query_id):
        rows_for_query = df[df.query_id == qid]
        queries_to_docnos[qid] = list(set(rows_for_query.docno))

    return queries_dict, docs_dict, queries_to_docnos

In [43]:
# queries_dict = {102: "Eurovision Song Contest", 154: "iPhone"}
# docs_dict = {"doc1": "The Eurovision Song Contest, often known simply as Eurovision or by its initialism ESC, is an international song competition organised annually by the European Broadcasting Union", 
#              "doc2": "London, the capital of England and the United Kingdom, is a 21st-century city with history stretching back to Roman times. At its centre stand the imposing Houses of Parliament, the iconic ‘Big Ben’ clock tower and Westminster Abbey, site of British monarch coronations. Across the Thames River, the London Eye observation wheel provides panoramic views of the South Bank cultural complex, and the entire city.", 
#              "doc3": "we like pizza.",
#              "doc4": "An iPhone is a line of smartphones developed and sold by Apple Inc., offering a range of advanced features, sleek design, and seamless integration with Apple's ecosystem, including access to the App Store, iCloud services, and exclusive software like Siri, making it a popular choice among smartphone users worldwide."}

# queries_to_docnos = {102: ["doc1", "doc3"], 154: ["doc2", "doc4"]}
# Build the lookups from round-5 rows (one row per distinct current_document),
# then score every query/document pair.
queries_dict, docs_dict, queries_to_docnos = create_data_structures(
    df.loc[df.round_no == 5].drop_duplicates(subset=['current_document'])
)
scores = get_bert_scores(
    queries_to_docnos=queries_to_docnos,
    docs_dict=docs_dict,
    queries_dict=queries_dict,
)

BERT: 100%|███████████████████████████████████████████████████████████████████████████| 30/30 [01:37<00:00,  3.25s/it]


In [46]:
# for query in queries_dict:
#     print("Query: ", queries_dict[query])
#     docs = sorted(queries_to_docnos[query], key=lambda x: -scores[x])
#     for i, docno in enumerate(docs):
#         print(f"{i+1} ---> {scores[docno]}, {docno}.\n{docs_dict[docno]}\n")
#     print()

In [47]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import random

# Neural network architecture
class Net(nn.Module):
    """Fully connected network with layer widths 4 -> 32 -> 16 -> 1.

    ReLU is applied after the first two linear layers; the last layer's output
    is returned without an activation.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=4, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=16)
        self.fc3 = nn.Linear(in_features=16, out_features=1)

    def forward(self, x):
        """Apply the three layers to ``x`` and return the final layer's output."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# Collect the distinct queries.
# Series.unique() keeps first-appearance order. list(set(...)) ordered the strings
# by their per-process hash, so the seeded shuffle/split further down produced a
# different train/validation split after every kernel restart.
queries = df.query1.unique().tolist()

# Define the initial weights.
# requires_grad=True makes this a leaf tensor that autograd can fill `.grad` on;
# the training loop reads `initial_weights.grad`.
initial_weights = torch.tensor([0.5, 0.5, 0.5, 0.5], dtype=torch.float32, requires_grad=True)

# Text generation driven by the weights (placeholder)
def generate_text(weights):
    """Return the text generated for ``weights``.

    Placeholder: the generation logic is not written yet, so ``weights`` is
    ignored and the same fixed string is returned on every call.
    """
    # TODO: implement the logic that generates text from the given weights.
    placeholder_text = "test text"
    return placeholder_text

# Function to calculate similarity score between a query and generated text
def calculate_similarity(query, text):
    """Return a similarity score between ``query`` and ``text``.

    The scoring logic is not implemented yet. The function raises instead of
    silently returning ``None``, so the gap is reported at the call site rather
    than as a dtype error when the scores are later packed into a tensor.

    Args:
        query: the query text.
        text: the generated text to compare against the query.

    Raises:
        NotImplementedError: always, until the scoring logic is written.
    """
    # TODO: implement the similarity scoring logic.
    raise NotImplementedError(
        "calculate_similarity() is a placeholder; implement the query/text scoring first"
    )

def _negative_mean_score(scores):
    """Return ``-mean(scores)`` as a scalar tensor, keeping autograd history when present.

    Args:
        scores: list of similarity scores, either one-element tensors or plain numbers.

    Returns:
        A 0-dim float32 tensor holding the negated mean of ``scores``.
    """
    if scores and all(torch.is_tensor(s) for s in scores):
        # torch.tensor(list_of_tensors) copies the values and cuts them off from
        # autograd, so backward() could never reach the weights; stacking keeps
        # each score attached to whatever produced it.
        stacked = torch.stack([s.reshape(()) for s in scores]).to(torch.float32)
    else:
        stacked = torch.tensor(scores, dtype=torch.float32)
    return -stacked.mean()


# Create the neural network model.
# Bound to `weight_net`, not `model`: `model` already names the BERT re-ranker that
# get_bert_scores() looks up as a global, and rebinding it here would break any
# later scoring call.
weight_net = Net()

# Use an optimizer to minimize the negative average score
optimizer = optim.Adam(weight_net.parameters())

# Random train-test split: first 20 shuffled queries train, the rest validate
random.seed(42)
random.shuffle(queries)
train_queries = queries[:20]
val_queries = queries[20:]

# Lists to store losses
train_losses = []
val_losses = []

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Generate texts using the current weights for train queries
    train_texts = [generate_text(initial_weights) for _ in train_queries]

    # Calculate similarity scores for train queries
    train_scores = [calculate_similarity(q, t) for q, t in zip(train_queries, train_texts)]

    # Negative average score is the train loss (maximizing similarity)
    train_loss = _negative_mean_score(train_scores)
    train_losses.append(train_loss.item())

    # Validation is for monitoring only, so no autograd graph is recorded for it
    with torch.no_grad():
        val_texts = [generate_text(initial_weights) for _ in val_queries]
        val_scores = [calculate_similarity(q, t) for q, t in zip(val_queries, val_texts)]
        val_loss = _negative_mean_score(val_scores)
    val_losses.append(val_loss.item())

    # Backpropagation and weight update for train loss
    if not train_loss.requires_grad:
        # backward() would raise a RuntimeError here anyway; name the actual cause.
        raise RuntimeError(
            "train loss has no autograd graph: calculate_similarity() must return "
            "tensors that depend on trainable parameters"
        )
    train_loss.backward()
    optimizer.step()

    # Manual gradient step on the weights, using the optimizer's learning rate.
    # `.grad` stays None when the loss does not depend on initial_weights, so the
    # step is skipped instead of failing on `float * None`.
    with torch.no_grad():
        if initial_weights.grad is not None:
            initial_weights -= optimizer.param_groups[0]['lr'] * initial_weights.grad
            initial_weights.grad.zero_()

    # Print progress
    if (epoch + 1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item()}, Val Loss: {val_loss.item()}")

# Get the optimized weights
optimized_weights = initial_weights

# Plotting the losses
epochs = range(1, num_epochs + 1)
plt.plot(epochs, train_losses, label='Train Loss')
plt.plot(epochs, val_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

print("Optimized Weights:", optimized_weights)
