In [55]:
from dotenv import load_dotenv
import pandas as pd
from datasets import load_dataset
import os
import openai
from tqdm.auto import tqdm
import pinecone

In [56]:
# Load variables from a .env file into the process environment, then read the
# two API keys from it. Either key is None when its variable is not set.
load_dotenv()
OPENAI_KEY, PINECONE_KEY = (
    os.getenv(var_name) for var_name in ("OPENAI_KEY", "PINECONE_KEY")
)

In [57]:
# Hand the key read from the environment to the OpenAI client module.
openai.api_key = OPENAI_KEY

In [58]:
# List the engines visible to this API key; the call also confirms the key is accepted.
openai.Engine.list()

<OpenAIObject list at 0x21818740a90> JSON: {
  "data": [
    {
      "created": null,
      "id": "babbage",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "davinci",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-davinci-edit-001",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "babbage-code-search-code",
      "object": "engine",
      "owner": "openai-dev",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-similarity-babbage-001",
      "object": "engine",
      "owner": "openai-dev",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "code-davinci-edit-001",
      "object": "engine",
      "

### We will use OpenAI's `text-embedding-ada-002` model for text embeddings

In [59]:
# Embedding model used for both the indexed abstracts and the search queries.
MODEL = "text-embedding-ada-002"

# Smoke test: embed two short strings in a single request and display the response.
res = openai.Embedding.create(
    engine=MODEL,
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
)
res

<OpenAIObject list at 0x21818705b80> JSON: {
  "data": [
    {
      "embedding": [
        -0.0031135426834225655,
        0.011766765266656876,
        -0.00509151816368103,
        -0.027159256860613823,
        -0.01633599027991295,
        0.03237545117735863,
        -0.016160769388079643,
        -0.0010808103252202272,
        -0.02583836019039154,
        -0.006641550455242395,
        0.02012345939874649,
        0.016672953963279724,
        -0.009178885258734226,
        0.02331787347793579,
        -0.010149340145289898,
        0.013458321802318096,
        0.02527226135134697,
        -0.016915567219257355,
        0.012056553736329079,
        -0.01636294648051262,
        -0.004303023684769869,
        -0.006402306258678436,
        -0.00437378603965044,
        0.020810864865779877,
        -0.010567175224423409,
        -0.003726816037669778,
        0.013626803644001484,
        -0.02635054476559162,
        -0.0004172029148321599,
        -0.0021852082572877407,
  

In [60]:
# Print the vector length of each of the two sample embeddings, one per line.
print(*(len(res['data'][position]['embedding']) for position in (0, 1)), sep="\n")

1536
1536


### Dataset Preparation

In [61]:
# Load only the first 15,000 records of the arXiv metadata snapshot
# (the "train[:15000]" split slice) with the generic JSON loader.
data_files = "arxiv-metadata-oai-snapshot.json"
arxiv_dataset = load_dataset(
    path="json",
    split="train[:15000]",
    data_files=data_files,
)
arxiv_dataset

Found cached dataset json (C:/Users/techi/.cache/huggingface/datasets/json/default-3c18e91474a8e8f0/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)


Dataset({
    features: ['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi', 'report-no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed'],
    num_rows: 15000
})

In [62]:
# Character count of the first abstract, to get a feel for the input size.
print(f"Length of Abstract: {len(arxiv_dataset[0]['abstract'])}")

Length of Abstract: 983


In [63]:
# Display the first record to see which fields each paper carries.
arxiv_dataset[0]

{'id': '0704.0001',
 'submitter': 'Pavel Nadolsky',
 'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'comments': '37 pages, 15 figures; published version',
 'journal-ref': 'Phys.Rev.D76:013009,2007',
 'doi': '10.1103/PhysRevD.76.013009',
 'report-no': 'ANL-HEP-PR-07-12',
 'categories': 'hep-ph',
 'license': None,
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from th

In [64]:
# Keep only the fields needed for search and for display of the results.
columns = arxiv_dataset.column_names
columns_to_keep = ["title", "doi", "abstract", 'id']

# Fail early with a clear message if the dataset lacks a column we rely on.
assert set(columns_to_keep) <= set(columns), (
    f"dataset is missing expected columns: {sorted(set(columns_to_keep) - set(columns))}"
)

# Plain difference (not symmetric difference): drop everything that is not
# explicitly kept. Built as a list in dataset order, which is the argument
# type `remove_columns` documents; re-running the cell yields an empty list.
columns_to_remove = [name for name in columns if name not in columns_to_keep]
arxiv_dataset = arxiv_dataset.remove_columns(columns_to_remove)
arxiv_dataset

Dataset({
    features: ['id', 'title', 'doi', 'abstract'],
    num_rows: 15000
})

In [65]:
# Display the first record again to confirm only the kept fields remain.
arxiv_dataset[0]

{'id': '0704.0001',
 'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
 'doi': '10.1103/PhysRevD.76.013009',
 'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboso

In [66]:
# Embed a real abstract (the first record) and check the resulting vector length.
sample0 = openai.Embedding.create(
    engine=MODEL,
    input=[arxiv_dataset[0]['abstract']],
)
len(sample0['data'][0]['embedding'])

1536

In [67]:
# Display the first (and only) embedding record of the response; the output is long.
sample0['data'][0]

<OpenAIObject embedding at 0x2181831ae50> JSON: {
  "embedding": [
    -0.02265041135251522,
    0.009236225858330727,
    -0.02350277081131935,
    -0.01939467526972294,
    0.0009353250497952104,
    0.021518588066101074,
    -0.01410585455596447,
    -0.01961824670433998,
    -0.05457896739244461,
    -0.022231217473745346,
    0.019296864047646523,
    0.030014239251613617,
    -0.03442974388599396,
    0.005470472387969494,
    -0.003968363162130117,
    -0.00634029833599925,
    0.022664383053779602,
    0.007706169970333576,
    0.0019492488354444504,
    -0.004960454069077969,
    0.018025310710072517,
    0.007161218672990799,
    -0.0030059651471674442,
    -0.02195175550878048,
    -0.011031770147383213,
    0.04136040434241295,
    0.025263382121920586,
    -0.0170891135931015,
    -0.03241761401295662,
    -0.006899222731590271,
    0.0197859238833189,
    0.01690746285021305,
    -0.009634459391236305,
    -0.004184946417808533,
    0.013707620091736317,
    -0.0120518067

#### Indexing

In [51]:
index_name = 'semantic-search'

# Open the Pinecone connection. The API key comes from app.pinecone.io and the
# environment name is shown next to the key in the Pinecone console.
pinecone.init(
    environment="us-west1-gcp",
    api_key=PINECONE_KEY,
)

# Create the index only if it does not exist yet. The dimension must match the
# length of the embedding vectors (1536 for the ada-002 model used here).
if index_name not in pinecone.list_indexes():
    pinecone.create_index(name=index_name, dimension=1536)

# Handle used for all subsequent upserts and queries.
index = pinecone.Index(index_name)

In [68]:
# Embed every abstract and upsert it into the Pinecone index, batch by batch.
# Each vector is stored under its row position (as a string) and carries the
# abstract, title and arXiv id as metadata.
batch_size = 32  # number of abstracts embedded and upserted per request
num_rows = len(arxiv_dataset)

for i in tqdm(range(0, num_rows, batch_size)):
    # end position of this batch (the last batch may be shorter)
    i_end = min(i + batch_size, num_rows)

    # Slice the rows once; this returns a dict of column -> list for the batch
    # only, instead of materialising each full column on every iteration.
    batch = arxiv_dataset[i:i_end]
    abstracts_batch = batch['abstract']
    titles_batch = batch['title']
    id_list = batch['id']

    # vector IDs are the row positions as strings
    ids_batch = [str(n) for n in range(i, i_end)]

    # create embeddings for the whole batch in a single API call
    res = openai.Embedding.create(input=abstracts_batch, engine=MODEL)
    embeds = [record['embedding'] for record in res['data']]

    # prep metadata: one dict per paper, aligned with ids_batch and embeds
    meta = [
        {'abstract': abstract, 'title': title, 'id': id_item}
        for abstract, title, id_item in zip(abstracts_batch, titles_batch, id_list)
    ]

    # upsert (id, vector, metadata) triples to Pinecone
    to_upsert = zip(ids_batch, embeds, meta)
    index.upsert(vectors=list(to_upsert))


100%|██████████| 469/469 [28:35<00:00,  3.66s/it]  


### Querying

In [69]:
query = "I-V characteristics of MgB2"

# Embed the query with the same model that produced the indexed vectors.
xq = openai.Embedding.create(
    engine=MODEL,
    input=query,
)['data'][0]['embedding']

In [70]:
# Ask the index for the 5 best matches to the query embedding and include the
# stored metadata (abstract, title, id) in the response.
res = index.query([xq], top_k=5, include_metadata=True)
res

{'matches': [{'id': '125',
              'metadata': {'abstract': '  The current-voltage (I-V) '
                                       'characteristics of various MgB2 films '
                                       'have been\n'
                                       'studied at different magnetic fields '
                                       'parallel to c-axis. At fields \\mu0H\n'
                                       'between 0 and 5T, vortex liquid-glass '
                                       'transitions were found in the I-V\n'
                                       'isotherms. Consistently, the I-V '
                                       'curves measured at different '
                                       'temperatures show\n'
                                       'a scaling behavior in the framework of '
                                       'quasi-two-dimension (quasi-2D) vortex\n'
                                       'glass theory. However, at \\mu0 H >= '
      

In [71]:
# One line per match: similarity score (two decimals) followed by the paper title.
for match in res['matches']:
    print("{:.2f}: {}".format(match['score'], match['metadata']['title']))

0.88: I-V characteristics of the vortex state in MgB2 thin films
0.85: Physical properties of the noncentrosymmetric superconductor
  Mg_10Ir_19B_16
0.84: Superconducting MgB2 thin films nano-bridges for cryo-electronic
  application
0.83: Collective Charge Excitations below the Metal-to-Insulator Transition in
  BaVS3
0.83: Peculiar Ferrimagnetism Associated with Charge Order in Layered
  Perovskite GdBaMn2O5


#### Translation

In [80]:
# Spanish-language query; it is translated to English before being embedded.
query1 = "Las características de corriente-voltaje de varias películas de MgB2"

In [81]:
# Translate the query to English with a completion model. The low temperature
# keeps the output close to a literal translation; no repetition penalties.
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=f"Translate this to English: {query1}",
    max_tokens=100,
    temperature=0.3,
    top_p=1.0,
    presence_penalty=0.0,
    frequency_penalty=0.0,
)

In [82]:
# Raw text of the first completion choice, i.e. the English translation.
response['choices'][0]['text']

'\n\nThe current-voltage characteristics of various MgB2 films.'

In [87]:
# The completion text starts with blank lines; strip the surrounding whitespace
# so that only the translated sentence itself is embedded.
translated_query = response['choices'][0]['text'].strip()
xq = openai.Embedding.create(input=translated_query, engine=MODEL)['data'][0]['embedding']

In [88]:
# Same search as before, now with the embedding of the translated query:
# 5 best matches, with stored metadata included in the response.
res = index.query([xq], top_k=5, include_metadata=True)
res

{'matches': [{'id': '125',
              'metadata': {'abstract': '  The current-voltage (I-V) '
                                       'characteristics of various MgB2 films '
                                       'have been\n'
                                       'studied at different magnetic fields '
                                       'parallel to c-axis. At fields \\mu0H\n'
                                       'between 0 and 5T, vortex liquid-glass '
                                       'transitions were found in the I-V\n'
                                       'isotherms. Consistently, the I-V '
                                       'curves measured at different '
                                       'temperatures show\n'
                                       'a scaling behavior in the framework of '
                                       'quasi-two-dimension (quasi-2D) vortex\n'
                                       'glass theory. However, at \\mu0 H >= '
      

In [89]:
# One line per match: similarity score (two decimals) followed by the paper title.
for match in res['matches']:
    print("{:.2f}: {}".format(match['score'], match['metadata']['title']))

0.90: I-V characteristics of the vortex state in MgB2 thin films
0.87: Superconducting MgB2 thin films nano-bridges for cryo-electronic
  application
0.86: MgB2 radio-frequency superconducting quantum interference device
  prepared by atomic force microscope lithography
0.85: Transport properties of microstructured ultrathin films of
  La0.67Ca0.33MnO3 on SrTiO3
0.85: Probing the electron-phonon coupling in MgB2 through magnetoresistance
  measurements in neutron irradiated thin films
