In [1]:
import pandas as pd
from elasticsearch_dsl.connections import connections
from elasticsearch_dsl import Q, Search
import os
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from tqdm.notebook import tqdm

In [5]:
# Load the classifier dataset and shuffle it reproducibly (fixed seed), then
# renumber the rows 0..n-1 so the shuffled order is the new positional order.
# NOTE(review): read_pickle executes arbitrary code embedded in the file —
# presumably this pickle is produced in-house; confirm before loading a copy
# obtained from anywhere else.
df = (
    pd.read_pickle("../inspire_classifier_dataset_2024-06-01_2024-07-31.pkl")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Last expression of the cell: rendered as the cell output.
df.head(20)

Unnamed: 0,id,inspire_categories,category_sources,label,text
0,7543542,[Computing],[None],0,Telecom Extreme Transformation <ENDTITLE> The ...
1,7567862,[],[],0,Improving demand forecasting in open systems w...
2,7692507,[General Physics],[None],0,Induction of LTP mechanisms in dually innervat...
3,7695458,[Math and Math Physics],[arxiv],0,"Small Ramsey numbers for books, wheels, and ge..."
4,7704966,[],[],0,Anisotropy dependent spin orbit torque switchi...
5,7518703,"[Other, Computing]","[arxiv, arxiv]",0,HemoSet: The First Blood Segmentation Dataset ...
6,7632345,[Computing],[arxiv],0,Reducing the Memory Footprint of 3D Gaussian S...
7,7659434,[Computing],[arxiv],0,"""There Has To Be a Lot That We're Missing"": Mo..."
8,7522006,[Instrumentation],[None],0,High-efficiency vertically emitting coupler fa...
9,7705317,[Condensed Matter],[arxiv],0,Effect of electronic correlation on topologica...


In [145]:
class LiteratureSearch(Search):
    """Search that selects its cluster connection from the index name.

    The index ``"holdingpen-hep"`` is queried through ``connection_holdingpen``;
    every other index is queried through ``connection_inspirehep``. A caller
    can override the choice by passing ``using=...`` explicitly.

    Both connections are created when the class body is executed, so defining
    the class requires the ``ES_USERNAME`` and ``ES_PASSWORD`` environment
    variables to be set (a missing one raises ``KeyError``).
    """

    # NOTE(review): neither call passes an alias, so presumably both register
    # under the library's default alias and only the objects stored here keep
    # the two clusters apart — confirm before relying on the global registry.
    # Certificate verification is disabled (verify_certs=False) for both hosts.
    connection_holdingpen = connections.create_connection(
        hosts=["https://os-inspire-legacy-os1.cern.ch/es"],
        timeout=30,
        http_auth=(os.environ["ES_USERNAME"], os.environ["ES_PASSWORD"]),
        verify_certs=False,
        use_ssl=True,
    )
    connection_inspirehep = connections.create_connection(
        hosts=["https://os-inspire-prod.cern.ch/es"],
        timeout=30,
        http_auth=(os.environ["ES_USERNAME"], os.environ["ES_PASSWORD"]),
        verify_certs=False,
        use_ssl=True,
    )

    def __init__(self, index, **kwargs):
        """Create a search on ``index`` bound to the matching connection.

        Args:
            index: Index name (or whatever ``Search`` accepts as ``index``).
                Only the exact string ``"holdingpen-hep"`` selects the
                holdingpen connection; anything else selects the inspirehep one.
            **kwargs: Extra keyword arguments forwarded to ``Search.__init__``.
                An explicit ``using`` takes precedence over the connection
                derived from ``index``.
        """
        if index == "holdingpen-hep":
            connection = LiteratureSearch.connection_holdingpen
        else:
            connection = LiteratureSearch.connection_inspirehep
        # Only fill in `using` when the caller did not supply one, then pass
        # every remaining keyword through instead of silently discarding it
        # (previously anything other than `using` was dropped).
        kwargs.setdefault("using", connection)
        super().__init__(index=index, **kwargs)

# Label names; the position in the list is the integer stored in the "label"
# column (0 is used below to select the rejected rows).
labels = ["rejected", "non_core", "core"]

# Source frames. The corrections are applied to a copy of the test set, so
# `df_test` itself is left untouched.
df_test = pd.read_pickle("../classifier/data/test_data_corrected.df")
df_train_valid = pd.read_pickle("../classifier/data/train_valid_data.df")
DF_TO_CORRECT = df_test.copy()

# Rows currently labelled 0, reshuffled with a fixed seed and re-indexed from 0.
df_rejected = DF_TO_CORRECT.loc[DF_TO_CORRECT["label"].eq(0)]
df_rejected_sample = (
    df_rejected
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)



In [146]:

# Re-label "rejected" records that nevertheless exist in the literature index.
#
# For every rejected record:
#   1. look it up by id in the holdingpen-hep index, keeping it only if it has
#      arXiv eprints;
#   2. search records-hep for any record sharing one of those arXiv ids;
#   3. on a match, replace the row's id with the matched control number and set
#      its label to 2 (core) or 1 (non-core).
search_next = LiteratureSearch("holdingpen-hep")
search_hep = LiteratureSearch("records-hep")

# Snapshot of the ids before any rewrite. Row masks are built from this
# snapshot, so they keep matching after DF_TO_CORRECT["id"] has been replaced
# by a control number (rebuilding the mask from the live column after the id
# rewrite matched nothing, which left every label unchanged).
original_ids = DF_TO_CORRECT["id"].copy()

counter = 0
for document_id in tqdm(df_rejected_sample["id"], total=df_rejected_sample.shape[0]):
    search_result = search_next.query(
        Q("match", _id=document_id) &
        Q("exists", field="metadata.arxiv_eprints")
    ).execute().hits
    if not search_result:
        continue

    # NOTE(review): the query filters on metadata.arxiv_eprints but the ids are
    # read from _extra_data.source_data.data — presumably both are always
    # populated together; a record lacking the latter raises KeyError here.
    arxiv_eprints = search_result.hits[0]["_source"]["_extra_data"]["source_data"]["data"]["arxiv_eprints"]
    arxiv_ids = [eprint["value"] for eprint in arxiv_eprints]
    search_result_hep = search_hep.query(
        Q("terms", arxiv_eprints__value=arxiv_ids)
    ).execute().hits
    if not search_result_hep:
        continue

    counter += 1
    print("arxiv_id", arxiv_ids)
    # Plain boolean array: selects the row(s) whose ORIGINAL id is document_id.
    row_mask = (original_ids == document_id).to_numpy()
    for hit_position, hit in enumerate(search_result_hep.hits):
        source = hit["_source"]
        control_number = source["control_number"]
        is_core = bool("core" in source and source["core"])
        # Only the first hit is written back, so the stored id and label always
        # describe the same record; further hits are printed for inspection.
        if hit_position == 0:
            DF_TO_CORRECT.loc[row_mask, "id"] = int(control_number)
            DF_TO_CORRECT.loc[row_mask, "label"] = 2 if is_core else 1
        print("id", control_number)
        print("core" if is_core else "No core")
        print("------")

# `counter` counts rejected records that were matched in records-hep.
print(f"Out of {df_rejected_sample.shape[0]} rejected records, {counter} were found in the records-hep index.")
# Written to the same path the test set was loaded from, so the file is
# overwritten with the corrected ids and labels.
DF_TO_CORRECT.to_pickle("../classifier/data/test_data_corrected.df")

  0%|          | 0/17531 [00:00<?, ?it/s]

arxiv_id ['2005.12653']
id 1797938
No core
------
arxiv_id ['1912.07272']
id 1770924
No core
------
arxiv_id ['1910.06224']
id 1827639
No core
------
arxiv_id ['0809.5043']
id 797925
core
------
arxiv_id ['2001.02897']
id 1774701
core
------
arxiv_id ['2011.10412']
id 1831959
core
------


KeyboardInterrupt: 

: 