In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
import pandas as pd
from sentence_transformers import InputExample
import random
from datasets import Dataset
from sentence_transformers import SentenceTransformer, losses, models, InputExample, util
from sentence_transformers.util import cos_sim
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from torch.utils.data import DataLoader
import ast
from collections import defaultdict
from sklearn.model_selection import train_test_split
import torch

In [3]:
queries_df = pd.read_csv("https://raw.githubusercontent.com/Ron-DS-AI/Information_Retrieval/refs/heads/main/qrels/queries_rnd3_stacked.csv")
judgments_df = pd.read_csv("https://raw.githubusercontent.com/Ron-DS-AI/Information_Retrieval/refs/heads/main/qrels/qrels.csv")

In [4]:
METADATA_FILES = [
    "https://github.com/Ron-DS-AI/Information_Retrieval/raw/refs/heads/main/corpus_data/metadata_final_pt1.csv",
    "https://github.com/Ron-DS-AI/Information_Retrieval/raw/refs/heads/main/corpus_data/metadata_final_pt2.csv"
]

In [5]:
def load_corpus():
    meta_dfs = []
    tag_frequency = defaultdict(int)

    # Load and process metadata
    for url in METADATA_FILES:
        df = pd.read_csv(url, index_col='cord_uid')

        # Parse tag lists
        df['tags'] = df['tags'].apply(
            lambda x: ast.literal_eval(x) if pd.notnull(x) and x.startswith('[') else []
        )

        # Parse publish_time
        df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce', dayfirst=True)

        # Count tags
        for tags in df['tags']:
            for tag in tags:
                tag_frequency[tag] += 1

        meta_dfs.append(df)

    metadata = pd.concat(meta_dfs)
    return metadata

In [6]:
corpus_df = load_corpus()

  df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce', dayfirst=True)


In [7]:
corpus_df = corpus_df[corpus_df["summarised_abstracts"].notnull()]
corpus_df = corpus_df[corpus_df["summarised_abstracts"].str.strip() != ""]

In [8]:
corpus_df.reset_index(inplace=True)

In [9]:
# Clean: Keep only judgments with cord-id present in the corpus
valid_cord_ids = set(corpus_df["cord_uid"])
judgments_df = judgments_df[judgments_df["cord-id"].isin(valid_cord_ids)]

In [10]:
len(corpus_df['cord_uid'])

8574

In [11]:
len(judgments_df['cord-id'])

17715

In [12]:
corpus_df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,pdf_json_files,pmc_json_files,url,s2_id,referenced_by_count,JournalName_DOI,tags,TagCount,summarised_abstracts,combined_text
0,iua8c4hy,300ff0def92740f7e1142036f9e6ae30845e13d2,medrxiv,covid-19 outbreak in oman: model-driven impact...,10.1101/2020.04.02.20050666,,,medrxiv,motivated by the rapid spread of covid-19 all ...,2020-06-04,...,document_parses/pdf_json/300ff0def92740f7e1142...,,https://doi.org/10.1101/2020.04.02.20050666,215782459.0,10,,"[methodologies or experimental designs, theore...",4,motivated by the rapid spread of covid-19 all ...,covid-19 outbreak in oman: model-driven impact...
1,xph08fwv,9a3e8c974bc68c29b0273ee771553046d7f37930,medrxiv,automated and semi-automated contact tracing: ...,10.1101/2020.04.14.20063636,,,medrxiv,introduction traditional approaches to case-fi...,NaT,...,document_parses/pdf_json/9a3e8c974bc68c29b0273...,,http://medrxiv.org/cgi/content/short/2020.04.1...,216055053.0,2,,"[literature review, methodologies or experimen...",4,introduction traditional approaches to case-fi...,automated and semi-automated contact tracing: ...
2,aosmo568,ca6949e3e039a0e4098644b0b2df90606ae3a7ee,medrxiv,efficacy of remdesivir versus placebo for the ...,10.1101/2020.04.09.20059196,,,medrxiv,background: in spite of the global containment...,NaT,...,document_parses/pdf_json/ca6949e3e039a0e409864...,,http://medrxiv.org/cgi/content/short/2020.04.0...,215782112.0,1,,"[literature review, methodologies or experimen...",4,background: in spite of the global containment...,efficacy of remdesivir versus placebo for the ...
3,ujomta30,d831dbf38025a44e0436fd11c52af4db9eb7c5b0,medrxiv,hydroxychloroquine versus covid-19: a rapid sy...,10.1101/2020.04.14.20065276,,,medrxiv,background: coronavirus disease 2019 (covid-19...,NaT,...,document_parses/pdf_json/d831dbf38025a44e0436f...,,http://medrxiv.org/cgi/content/short/2020.04.1...,216035656.0,15,,"[literature review, methodologies or experimen...",4,background: coronavirus disease 2019 (covid-19...,hydroxychloroquine versus covid-19: a rapid sy...
4,xwlzq3m3,44eec5c0f84d7069033431dcb5a6adeb01b8ee0c,elsevier; medline; pmc,clinical characteristics and diagnostic challe...,10.1016/j.jfma.2020.04.007,pmc7161491,32307322.0,els-covid,background/purpose current studies on pediatri...,NaT,...,document_parses/pdf_json/44eec5c0f84d706903343...,document_parses/pmc_json/pmc7161491.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/32307322/;...,215787843.0,157,journal of the formosan medical association,"[literature review, methodologies or experimen...",4,background/purpose current studies on pediatri...,clinical characteristics and diagnostic challe...


In [13]:
corpus_df['combined_sum_text'] = corpus_df['title'] + " [SEP] " + corpus_df['summarised_abstracts']

In [14]:
corpus_df.reset_index(inplace=True)

In [15]:
doc_texts = corpus_df[["cord_uid", "combined_sum_text"]]

In [16]:
judgments = judgments_df.merge(doc_texts, left_on="cord-id", right_on="cord_uid", how="left")

In [17]:
judgments.head()

Unnamed: 0,topic-id,iteration,cord-id,judgement,cord_uid,combined_sum_text
0,1,0.5,010vptx3,2,010vptx3,"the sars, mers and novel coronavirus (covid-19..."
1,1,1.0,02f0opkr,1,02f0opkr,an outbreak of covid19 caused by a new coronav...
2,1,1.0,04ftw7k9,0,04ftw7k9,current issue in tourism: the evolution of tra...
3,1,1.0,05qglt1f,0,05qglt1f,full spectrum of covid-19 severity still being...
4,1,1.0,0604jed8,0,0604jed8,chapter 4 human viruses: emergence and evoluti...


In [18]:
unique_topic_ids = judgments["topic-id"].unique()

In [19]:
# Split topic-ids
train_ids, temp_ids = train_test_split(unique_topic_ids, test_size=0.3, random_state=42)
val_ids, test_ids = train_test_split(temp_ids, test_size=0.5, random_state=42)

In [20]:
# Split into docs into train, val, test, ensuring no overlap of topic ids across sets
train_judgments = judgments[judgments["topic-id"].isin(train_ids)]
val_judgments   = judgments[judgments["topic-id"].isin(val_ids)]
test_judgments  = judgments[judgments["topic-id"].isin(test_ids)]

In [25]:
# Split into queries into train, val, test, ensuring no overlap of topic ids across sets
train_queries = queries_df[queries_df["topic-id"].isin(train_ids)]
val_queries   = queries_df[queries_df["topic-id"].isin(val_ids)]
test_queries  = queries_df[queries_df["topic-id"].isin(test_ids)]

In [21]:
# Load SBERT base model
base_model = SentenceTransformer("all-mpnet-base-v2", device='cuda')
print(torch.cuda.is_available())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

True


In [22]:
# Get unique docs from training judgments
train_docs = train_judgments[["cord-id", "combined_sum_text"]].drop_duplicates()
doc_texts = train_docs["combined_sum_text"].tolist()
doc_ids = train_docs["cord-id"].tolist()

In [23]:
# Encode all docs
doc_embeddings = base_model.encode(doc_texts, convert_to_tensor=True, batch_size=32, show_progress_bar=True)
doc_id_to_embedding = dict(zip(doc_ids, doc_embeddings))

Batches:   0%|          | 0/216 [00:00<?, ?it/s]

## **Triplet Sampling with Hard Negatives**

In [24]:
def build_triplets_with_hard_negatives(judgments_df, queries_df, k=3):
    topic_to_queries = queries_df.groupby("topic-id")["query"].apply(list).to_dict()
    triplets = []

    for topic_id, group in judgments_df.groupby("topic-id"):
        if topic_id not in topic_to_queries:
            continue

        positives = group[group["judgement"] > 0]
        negatives = group[group["judgement"] == 0]

        if positives.empty or negatives.empty:
            continue

        # Get embeddings for negatives
        neg_ids = negatives["cord-id"].tolist()
        neg_embeddings = torch.stack([doc_id_to_embedding[cid] for cid in neg_ids if cid in doc_id_to_embedding])

        for query in topic_to_queries[topic_id]:
            query_embedding = base_model.encode(query, convert_to_tensor=True)

            # Get top-k similar negatives = hard negatives
            scores = cos_sim(query_embedding, neg_embeddings)[0]
            top_k_indices = torch.topk(scores, k=min(k, len(scores)), largest=True).indices.tolist()
            hard_neg_samples = negatives.iloc[top_k_indices]

            for _, pos_row in positives.iterrows():
                for _, neg_row in hard_neg_samples.iterrows():
                    triplets.append(InputExample(
                        texts=[
                            query,
                            str(pos_row["combined_sum_text"]),
                            str(neg_row["combined_sum_text"])
                        ]
                    ))

    return triplets

In [26]:
train_triplets = build_triplets_with_hard_negatives(train_judgments, train_queries)

In [28]:
# Load SBERT base model
model = SentenceTransformer("all-mpnet-base-v2", device='cuda')
print(torch.cuda.is_available())

True


In [29]:
# Training triplets loader
train_dataloader = DataLoader(train_triplets, shuffle=True, batch_size=16)

# Triplet loss
train_loss = losses.TripletLoss(model)

## Set up IR Validation Evaluator

In [30]:
topic_to_queries = (
    queries_df[queries_df["topic-id"].isin(val_judgments["topic-id"].unique())]
    .groupby("topic-id")[["query-id", "query"]]
    .apply(lambda df: list(zip(df["query-id"], df["query"])))
    .to_dict()
)

In [31]:
val_queries_dict = {
    qid: qtext
    for query_list in topic_to_queries.values()
    for qid, qtext in query_list
}

In [32]:
topic_rels = (
    val_judgments[val_judgments["judgement"] > 0]
    .groupby("topic-id")["cord-id"]
    .apply(list)
    .to_dict()
)

In [33]:
val_relevant_docs = {
    qid: topic_rels[tid]
    for tid, qlist in topic_to_queries.items()
    for qid, _ in qlist
    if tid in topic_rels
}

In [34]:
val_doc_ids = val_judgments["cord-id"].unique()
val_corpus_df = corpus_df[corpus_df["cord_uid"].isin(val_doc_ids)]
val_corpus_dict = dict(zip(val_corpus_df["cord_uid"], val_corpus_df["combined_sum_text"]))

In [35]:
val_evaluator = InformationRetrievalEvaluator(
    queries=val_queries_dict,
    corpus=val_corpus_dict,
    relevant_docs=val_relevant_docs,
    name="val",
    show_progress_bar=True
)

## Train Model

In [36]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=val_evaluator,
    evaluation_steps=500,
    epochs=3,
    warmup_steps=100,
    output_path="output_hard_negatives/",
    save_best_model=True,
    show_progress_bar=True
)

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mstephcedwards[0m ([33mstephcedwards-queen-mary-university-of-london[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Val Cosine Accuracy@1,Val Cosine Accuracy@3,Val Cosine Accuracy@5,Val Cosine Accuracy@10,Val Cosine Precision@1,Val Cosine Precision@3,Val Cosine Precision@5,Val Cosine Precision@10,Val Cosine Recall@1,Val Cosine Recall@3,Val Cosine Recall@5,Val Cosine Recall@10,Val Cosine Ndcg@10,Val Cosine Mrr@10,Val Cosine Map@100
500,3.8945,No log,0.533333,0.8,0.8,0.866667,0.533333,0.511111,0.426667,0.44,0.003394,0.009687,0.014055,0.029813,0.449079,0.653968,0.142422
1000,3.4286,No log,0.6,0.8,0.8,0.8,0.6,0.422222,0.373333,0.313333,0.004187,0.008663,0.012391,0.021021,0.354655,0.666667,0.06867
1500,3.3713,No log,0.4,0.733333,0.733333,0.8,0.4,0.4,0.426667,0.34,0.00252,0.007669,0.014178,0.02264,0.36464,0.577778,0.078693
1821,3.3713,No log,0.266667,0.733333,0.8,0.8,0.266667,0.311111,0.306667,0.353333,0.002033,0.00634,0.010314,0.023687,0.342007,0.505556,0.073285
2000,3.3154,No log,0.466667,0.733333,0.733333,0.8,0.466667,0.377778,0.373333,0.313333,0.002995,0.007975,0.012926,0.021913,0.336766,0.595556,0.074118
2500,3.3154,No log,0.333333,0.8,0.866667,0.866667,0.333333,0.4,0.4,0.34,0.001881,0.008316,0.014703,0.024549,0.348341,0.538889,0.082533
3000,3.3079,No log,0.333333,0.666667,0.733333,0.933333,0.333333,0.355556,0.32,0.32,0.002231,0.007077,0.010491,0.021708,0.32555,0.534524,0.065532
3500,3.2885,No log,0.333333,0.6,0.733333,1.0,0.333333,0.288889,0.24,0.266667,0.002017,0.006111,0.008698,0.020608,0.276062,0.523968,0.05108
3642,3.2885,No log,0.266667,0.533333,0.8,0.933333,0.266667,0.288889,0.293333,0.26,0.001542,0.005949,0.010208,0.019892,0.263231,0.453492,0.048252
4000,3.288,No log,0.133333,0.466667,0.8,0.933333,0.133333,0.244444,0.24,0.226667,0.000978,0.005056,0.009048,0.01787,0.216349,0.358519,0.035418


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.46s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.47s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.46s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.45s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.41s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.41s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.42s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.38s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.40s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.42s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.44s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.41s/it]


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:16<00:00, 16.43s/it]


In [37]:
!ls /content/output_hard_negatives/

1_Pooling			   README.md
2_Normalize			   sentence_bert_config.json
config.json			   special_tokens_map.json
config_sentence_transformers.json  tokenizer_config.json
eval				   tokenizer.json
model.safetensors		   vocab.txt
modules.json


In [38]:
!du -sh /content/output_hard_negatives/

419M	/content/output_hard_negatives/


In [40]:
!zip -r fine_tuned_sbert_hard_neg_mining.zip output_hard_negatives/

  adding: output_hard_negatives/ (stored 0%)
  adding: output_hard_negatives/config.json (deflated 47%)
  adding: output_hard_negatives/sentence_bert_config.json (deflated 4%)
  adding: output_hard_negatives/config_sentence_transformers.json (deflated 34%)
  adding: output_hard_negatives/tokenizer_config.json (deflated 75%)
  adding: output_hard_negatives/special_tokens_map.json (deflated 85%)
  adding: output_hard_negatives/tokenizer.json (deflated 71%)
  adding: output_hard_negatives/eval/ (stored 0%)
  adding: output_hard_negatives/eval/Information-Retrieval_evaluation_val_results.csv (deflated 64%)
  adding: output_hard_negatives/model.safetensors (deflated 8%)
  adding: output_hard_negatives/2_Normalize/ (stored 0%)
  adding: output_hard_negatives/README.md (deflated 70%)
  adding: output_hard_negatives/modules.json (deflated 62%)
  adding: output_hard_negatives/1_Pooling/ (stored 0%)
  adding: output_hard_negatives/1_Pooling/config.json (deflated 57%)
  adding: output_hard_negati

## **Ranking Evaluation**

In [41]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("output_hard_negatives/")

In [42]:
# Group queries by topic-id
topic_to_queries = (
    queries_df[queries_df["topic-id"].isin(test_judgments["topic-id"].unique())]
    .groupby("topic-id")[["query-id", "query"]]
    .apply(lambda df: list(zip(df["query-id"], df["query"])))
    .to_dict()
)

# query-id → query text
test_queries_dict = {
    qid: qtext
    for query_list in topic_to_queries.values()
    for qid, qtext in query_list
}

# topic-id → list of relevant doc IDs
topic_rels = (
    test_judgments[test_judgments["judgement"] > 0]
    .groupby("topic-id")["cord-id"]
    .apply(list)
    .to_dict()
)

# query-id → list of relevant doc IDs (copied from topic rels)
test_relevant_docs = {
    qid: topic_rels[tid]
    for tid, qlist in topic_to_queries.items()
    for qid, _ in qlist
    if tid in topic_rels
}

In [43]:
# cord-id → combined_sum_text
test_doc_ids = test_judgments["cord-id"].unique()
test_corpus_df = corpus_df[corpus_df["cord_uid"].isin(test_doc_ids)]
test_corpus_dict = dict(zip(test_corpus_df["cord_uid"], test_corpus_df["combined_sum_text"]))

In [44]:
evaluator = InformationRetrievalEvaluator(
    queries=test_queries_dict,
    corpus=test_corpus_dict,
    relevant_docs=test_relevant_docs,
    name="test",
    show_progress_bar=True
)

In [45]:
# Run the evaluation
scores = evaluator(model, output_path=None)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Corpus Chunks: 100%|██████████| 1/1 [00:14<00:00, 14.63s/it]


In [46]:
metrics_df = pd.DataFrame([scores])
metrics_df = metrics_df.T.reset_index()
metrics_df.columns = ['Metric', 'Value']

In [47]:
metrics_df.sort_values("Metric", inplace=True)
metrics_df

Unnamed: 0,Metric,Value
0,test_cosine_accuracy@1,0.555556
3,test_cosine_accuracy@10,0.944444
1,test_cosine_accuracy@3,0.777778
2,test_cosine_accuracy@5,0.888889
14,test_cosine_map@100,0.103467
13,test_cosine_mrr@10,0.68858
12,test_cosine_ndcg@10,0.425985
4,test_cosine_precision@1,0.555556
7,test_cosine_precision@10,0.4
5,test_cosine_precision@3,0.462963


## **Save fine-tuned SBERT model**

In [48]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `Marketing Model Dev` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might hav

In [49]:
from sentence_transformers import SentenceTransformer

In [50]:
model = SentenceTransformer("output_hard_negatives/")  # trained model path

In [53]:
model.push_to_hub("StephKeddy/sbert-IR-covid-search-v2")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'https://huggingface.co/StephKeddy/sbert-IR-covid-search-v2/commit/60410542223bf6407468d6e02fabee6f8a91d2bb'

## **Indexing corpus**

Topic-ids used for training have been removed for building final deployed IR search engine

In [55]:
smodel = SentenceTransformer("StephKeddy/sbert-IR-covid-search-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/205 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/26.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [78]:
train_doc_ids = train_judgments["cord-id"].unique()

In [79]:
filtered_corpus_df = corpus_df[~corpus_df["cord_uid"].isin(train_doc_ids)]

In [80]:
filtered_corpus_df = filtered_corpus_df.dropna(subset=["combined_sum_text"])
filtered_corpus_df["combined_sum_text"] = filtered_corpus_df["combined_sum_text"].astype(str)

In [81]:
texts = filtered_corpus_df["combined_sum_text"].tolist()
ids = filtered_corpus_df["cord_uid"].tolist()

In [58]:
import numpy as np

In [82]:
embeddings = model.encode(texts, batch_size=32, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True)

Batches:   0%|          | 0/53 [00:00<?, ?it/s]

In [83]:
embedding_df = pd.DataFrame(embeddings, index=ids)
embedding_df.columns = [str(i) for i in range(embedding_df.shape[1])]
embedding_df.index.name = "cord_uid"
embedding_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
cord_uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sg5uwuqc,-0.001263,-0.149179,-0.021819,-0.054362,0.021815,-0.030304,0.019847,0.027123,-6.1e-05,0.030171,...,-0.056668,-0.037525,0.004336,-0.045513,-0.044402,0.008021,0.050045,-0.090546,-0.049929,-0.016833
fpjhkb4g,-0.001269,-0.150115,-0.022728,-0.05415,0.021769,-0.029546,0.019687,0.026787,-0.000277,0.030506,...,-0.056406,-0.037527,0.004756,-0.04571,-0.04378,0.007634,0.050603,-0.090579,-0.049902,-0.017017
eylpfrj3,-0.001487,-0.149395,-0.022791,-0.053842,0.022188,-0.02952,0.019577,0.026933,-0.000307,0.03091,...,-0.056443,-0.038163,0.004759,-0.045207,-0.044786,0.008269,0.050662,-0.089695,-0.050844,-0.016679
ve1fgnyg,-0.001214,-0.15057,-0.022694,-0.054216,0.022071,-0.029345,0.019301,0.026727,-0.00088,0.030536,...,-0.056053,-0.037454,0.004785,-0.045789,-0.043728,0.007828,0.050598,-0.090646,-0.049697,-0.01675
5vu2rerf,-0.001436,-0.150097,-0.022739,-0.053735,0.02182,-0.029774,0.01918,0.026595,-0.000288,0.03044,...,-0.056113,-0.037791,0.004968,-0.045455,-0.043695,0.007782,0.050307,-0.090698,-0.04988,-0.016982


In [84]:
len(embedding_df)

1671

In [85]:
half = len(embedding_df) // 2

In [86]:
embedding_part1 = embedding_df.iloc[:half]
embedding_part2 = embedding_df.iloc[half:]

In [87]:
embedding_part1.to_csv("embeddings_part1.csv.gz", compression="gzip")
embedding_part2.to_csv("embeddings_part2.csv.gz", compression="gzip")

In [89]:
embedding_ids = embedding_df.index.astype(str)

In [90]:
aligned_corpus_df = corpus_df[corpus_df["cord_uid"].isin(embedding_ids)]

In [91]:
print("Embedding count: ", len(embedding_df))
print("Corpus rows after alignment: ", len(aligned_corpus_df))

Embedding count:  1671
Corpus rows after alignment:  1671


In [92]:
aligned_corpus_df.head()

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,...,pmc_json_files,url,s2_id,referenced_by_count,JournalName_DOI,tags,TagCount,summarised_abstracts,combined_text,combined_sum_text
5,sg5uwuqc,850a96675598f6a5dd20c74e84479fbdd9a8e3c0,medrxiv,is the impact of social distancing on coronavi...,10.1101/2020.04.07.20049049,,,medrxiv,epidemiologists use mathematical models to pre...,2020-10-04,...,,http://medrxiv.org/cgi/content/short/2020.04.0...,215782536.0,1,,"[methodologies or experimental designs, theore...",4,epidemiologists use mathematical models to pre...,is the impact of social distancing on coronavi...,is the impact of social distancing on coronavi...
8,fpjhkb4g,e15bc28bc337f1a90a5baa63ffd699cc5abe1621,medrxiv,estimating the number of covid-19-related infe...,10.1101/2020.04.22.20075440,,,medrxiv,background: iran is one of the countries that ...,NaT,...,,http://medrxiv.org/cgi/content/short/2020.04.2...,216128413.0,5,,"[methodologies or experimental designs, theore...",4,background: iran is one of the countries that ...,estimating the number of covid-19-related infe...,estimating the number of covid-19-related infe...
11,eylpfrj3,85e7c8f8c4897244a7a937f1cd6270401e90cee7,arxiv,regression approach for modeling covid-19 spre...,,,,arxiv,the paper studies different regression approac...,2020-02-04,...,,https://arxiv.org/pdf/2004.01489v1.pdf,214794965.0,0,error: 404 client error: not found for url: ht...,"[methodologies or experimental designs, theore...",3,the paper studies different regression approac...,regression approach for modeling covid-19 spre...,regression approach for modeling covid-19 spre...
16,ve1fgnyg,b2c73930e23bd83d4be8fe9f166318811b53f89b,medrxiv,how high and long will the covid-19 wave be a ...,10.1101/2020.04.14.20064790,,,medrxiv,background an objective: in march 2020 the sar...,NaT,...,,https://doi.org/10.1101/2020.04.14.20064790,216055281.0,1,,"[methodologies or experimental designs, theore...",3,background an objective: in march 2020 the sar...,how high and long will the covid-19 wave be a ...,how high and long will the covid-19 wave be a ...
22,5vu2rerf,db3574a611f5e832218c5f14c6f85af06370caf7,arxiv,sirnet: understanding social distancing measur...,,,,arxiv,the sars-cov-2 infectious outbreak has rapidly...,NaT,...,,https://arxiv.org/pdf/2004.10376v1.pdf,216056251.0,0,error: 404 client error: not found for url: ht...,"[methodologies or experimental designs, theore...",3,the sars-cov-2 infectious outbreak has rapidly...,sirnet: understanding social distancing measur...,sirnet: understanding social distancing measur...


In [93]:
corpus_part1 = aligned_corpus_df.iloc[:half]
corpus_part2 = aligned_corpus_df.iloc[half:]

In [97]:
len(corpus_part1)

835

In [98]:
len(corpus_part2)

836

In [99]:
corpus_part1.to_csv("metadata_part1_final.csv", index=False)
corpus_part2.to_csv("metadata_part2_final.csv", index=False)

In [69]:
judgments_df.head()

Unnamed: 0,topic-id,iteration,cord-id,judgement
0,1,0.5,010vptx3,2
1,1,1.0,02f0opkr,1
2,1,1.0,04ftw7k9,0
3,1,1.0,05qglt1f,0
4,1,1.0,0604jed8,0


In [104]:
# Remove rows where topic-id is in the training set
filtered_judgments = judgments_df[~judgments_df["topic-id"].isin(train_ids)]

In [105]:
len(filtered_judgments)

5553

In [106]:
filtered_judgments.to_csv("DEMO_test_qrels.csv", index=False)

In [101]:
final_test_queries = pd.concat([val_queries, test_queries], ignore_index=True)

In [102]:
final_test_queries

Unnamed: 0,query-id,topic-id,query,query_type
0,9,9,coronavirus in Canada,short_form
1,13,13,how does coronavirus spread,short_form
2,16,16,how long does coronavirus survive on surfaces,short_form
3,20,20,coronavirus and ACE inhibitors,short_form
4,22,22,coronavirus heart impacts,short_form
5,44,9,how has COVID-19 affected Canada,question
6,48,13,what are the transmission routes of coronavirus?,question
7,51,16,how long does coronavirus remain stable on su...,question
8,55,20,are patients taking Angiotensin-converting enz...,question
9,57,22,are cardiac complications likely in patients w...,question


In [103]:
final_test_queries.to_csv("final_test_queries.csv", index=False)