# Local Embedding Model

In [12]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [13]:
# Instantiate the local sentence-embedding model from the BAAI/bge-m3 checkpoint.
embedding_model = SentenceTransformer(model_name_or_path="BAAI/bge-m3")

In [14]:
# Short example queries. NOTE: "Defination" is misspelled, but it is kept verbatim
# because these strings are what gets encoded; changing them changes the embeddings.
sentence_1 = [
    "What is BGE M3?",
    "Defination of BM25",
]
# Longer example passages describing the topics the queries ask about.
sentences_2 = [
    "BGE M3 is an embedding model supporting dense retrieval, lexical matching and multi-vector interaction.",
    "BM25 is a bag-of-words retrieval function that ranks a set of documents based on the query terms appearing in each document",
]

In [15]:
# Encode the example queries; the result is displayed as the cell output.
output = embedding_model.encode(sentences=sentence_1)
output

array([[-0.03411698, -0.04707836, -0.00089452, ...,  0.04828522,
         0.0075543 , -0.02961659],
       [-0.0104174 , -0.0447926 , -0.02429202, ..., -0.00819299,
         0.01503989,  0.01113798]], dtype=float32)

In [16]:
from src.database.qdrant import QdrantVectorDB
from dotenv import load_dotenv
import os

# Load environment variables through python-dotenv before they are read with
# os.getenv (presumably from a local .env file — confirm its location).
load_dotenv()

True

In [17]:
# Qdrant connection settings come from the environment; each is None when unset.
url = os.environ.get("QDRANT_URL")
api_key = os.environ.get("QDRANT_API_KEY")

# Build the project's vector-DB wrapper from the connection settings and the
# embedding model created earlier in the notebook.
qdrant = QdrantVectorDB(
    embedding_model=embedding_model,
    url=url,
    api_key=api_key,
)
qdrant

<src.database.qdrant.QdrantVectorDB at 0x3c74dabd0>

# Combine Scopus and arXiv datasets

In [2]:
import pandas as pd

In [5]:
# Load both source datasets, reading the identifier column as text.
# With dtype inference the arXiv ids come back as floats (the preview shows
# 704.0001 where an arXiv id would be written 0704.0001), so leading and trailing
# zeros are lost and a later astype(str) cannot restore them. Forcing `str` keeps
# the ids exactly as stored in the CSV and should also avoid the warning pandas
# emitted for the arXiv file (presumably a mixed-type DtypeWarning — confirm).
# NOTE(review): if the CSV on disk already lost the zeros, regenerate it upstream.
scopus_df = pd.read_csv("./scopus_unexplode_data.csv", dtype={"id": str})
arxiv_df = pd.read_csv("./cleaned_arxiv.csv", dtype={"id": str})

  arxiv_df = pd.read_csv("./cleaned_arxiv.csv")


In [6]:
# Preview the first five rows of the Scopus data.
scopus_df.head(n=5)

Unnamed: 0,authors,year,id,title,abstract,references,category,source
0,"Sureerat Thuekeaw, Kris Angkanaporn, Chackrit ...",2022,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"['85039040394', '85050697915', '84920164411', ...","[('Food Science', '1106', 'AGRI'), ('Physiolog...",scopus
1,"Abhijit V. Lele, Sarah Wahlster, Bhunyawee Alu...",2022,85121351780,Perceptions Regarding the SARS-CoV-2 Pandemic'...,Background: The SARS-CoV-2 (COVID-19) pandemic...,"['85104589379', '85083241171', '85078262578', ...","[('Surgery', '2746', 'MEDI'), ('Neurology (cli...",scopus
2,"Solaphat Hemrungrojn, Arisara Amrapala, Michae...",2022,85131660961,Construction of a short version of the Montrea...,Background: The Montreal Cognitive Assessment ...,"['84982975791', '84871671961', '85097597113', ...","[('Neuroscience (all)', '2800', 'NEUR')]",scopus
3,"Erik Johansson, Ferenc Tasnádi, Annop Ektarawo...",2022,85124670542,The effect of strain and pressure on the elect...,Different theoretical methodologies are employ...,"['0035282206', '0035508561', '0035858409', '18...","[('Physics and Astronomy (all)', '3100', 'PHYS')]",scopus
4,"Kunanya Masodsai, Rungchai Chaunchaiyakul,",2022,85143878806,Dynamic Cardiopulmonary and Metabolic Function...,The purpose of this study was to investigate a...,"['25444452457', '49949090130', '84860884417', ...","[('Physiology (medical)', '2737', 'MEDI')]",scopus


In [7]:
# Preview the first five rows of the arXiv data.
arxiv_df.head(n=5)

Unnamed: 0,id,authors,title,references,category,abstract,year,source
0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"Phys.Rev.D76:013009,2007",hep-ph,A fully differential calculation in perturba...,2008,arxiv
1,704.0005,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,"Illinois J. Math. 52 (2008) no.2, 681-689",math.CA math.FA,In this paper we show how to compute the $\L...,2013,arxiv
2,704.0007,"Alejandro Corichi, Tatjana Vukasinac and Jose ...",Polymer Quantum Mechanics and its Continuum Limit,"Phys.Rev.D76:044016,2007",gr-qc,A rather non-standard quantum representation...,2008,arxiv
3,704.0008,Damian C. Swift,Numerical solution of shock and ramp compressi...,"Journal of Applied Physics, vol 104, 073536 (2...",cond-mat.mtrl-sci,A general formulation was developed to repre...,2009,arxiv
4,704.0009,"Paul Harvey, Bruno Merin, Tracy L. Huard, Luis...","The Spitzer c2d Survey of Large, Nearby, Inste...","Astrophys.J.663:1149-1173,2007",astro-ph,We discuss the results from the combined IRA...,2010,arxiv


In [8]:
# Stack the Scopus rows on top of the arXiv rows under a fresh 0..n-1 index, then
# keep the shared columns in the order used for the combined dataset.
combine_df = pd.concat(
    objs=[scopus_df, arxiv_df],
    ignore_index=True,
).loc[:, ["id", "title", "abstract", "authors", "category", "year", "source", "references"]]
combine_df.head()

Unnamed: 0,id,title,abstract,authors,category,year,source,references
0,85131139456,Microencapsulated basil oil (Ocimum basilicum ...,Objective: Microencapsulation is a technique t...,"Sureerat Thuekeaw, Kris Angkanaporn, Chackrit ...","[('Food Science', '1106', 'AGRI'), ('Physiolog...",2022,scopus,"['85039040394', '85050697915', '84920164411', ..."
1,85121351780,Perceptions Regarding the SARS-CoV-2 Pandemic'...,Background: The SARS-CoV-2 (COVID-19) pandemic...,"Abhijit V. Lele, Sarah Wahlster, Bhunyawee Alu...","[('Surgery', '2746', 'MEDI'), ('Neurology (cli...",2022,scopus,"['85104589379', '85083241171', '85078262578', ..."
2,85131660961,Construction of a short version of the Montrea...,Background: The Montreal Cognitive Assessment ...,"Solaphat Hemrungrojn, Arisara Amrapala, Michae...","[('Neuroscience (all)', '2800', 'NEUR')]",2022,scopus,"['84982975791', '84871671961', '85097597113', ..."
3,85124670542,The effect of strain and pressure on the elect...,Different theoretical methodologies are employ...,"Erik Johansson, Ferenc Tasnádi, Annop Ektarawo...","[('Physics and Astronomy (all)', '3100', 'PHYS')]",2022,scopus,"['0035282206', '0035508561', '0035858409', '18..."
4,85143878806,Dynamic Cardiopulmonary and Metabolic Function...,The purpose of this study was to investigate a...,"Kunanya Masodsai, Rungchai Chaunchaiyakul,","[('Physiology (medical)', '2737', 'MEDI')]",2022,scopus,"['25444452457', '49949090130', '84860884417', ..."


In [9]:
# Summary statistics of abstract length, measured in whitespace-separated words.
# The vectorised .str accessor is used instead of a Python-level lambda: it yields
# NaN for a missing abstract rather than raising AttributeError on a float NaN,
# and avoids calling a Python function once per row.
combine_df["abstract"].str.split().str.len().describe()

count    888907.000000
mean        137.417608
std          62.787405
min           1.000000
25%          90.000000
50%         129.000000
75%         177.000000
max        1194.000000
Name: abstract, dtype: float64

In [10]:
# Store every identifier as a string so both sources share one id dtype.
combine_df = combine_df.astype({"id": str})

In [11]:
# Persist the combined dataset to disk without writing the row index.
combine_df.to_csv(
    path_or_buf="combined_data.csv",
    index=False,
)