In [2]:
import pandas as pd
import datasets
import tqdm, os
from collections.abc import Iterable
from langchain_core.documents.base import Document

from datetime import datetime
from copy import deepcopy

from dotenv import load_dotenv
load_dotenv()

# Dataset repository on the Hugging Face Hub and the revision string handed to
# load_dataset (looks like a commit hash — presumably pins the data snapshot).
huggingface_path = "ErzhuoShao/SciSciGPT-SciSciCorpus"
revision = "475c99a8c2afab3c6a7e2e936d8b44c0137437b3"

# Load the "train" split, convert it to a DataFrame, and keep only the first
# 100 rows; every later cell works on this 100-row sample.
sciscicorpus = datasets.load_dataset(huggingface_path, revision=revision, split="train")
sciscicorpus = sciscicorpus.to_pandas().iloc[:100]

In [3]:
def _display_name(name):
    """Reverse the ", "-separated parts of a name: "Last, First" -> "First Last"."""
    return " ".join(name.split(", ")[::-1])


def filter_nan(d, max_text_chars=25000):
    """Turn one corpus record (a dict) into a cleaned metadata dict.

    Per-key handling:
      * 'section_summary', 'abstract', 'section_text_token_count', 'urldate'
        and 'number' are dropped.
      * 'section_text' is stored under 'text', truncated to ``max_text_chars``.
      * 'section_id' is coerced with ``int``.
      * 'date' (strings only) is kept and its leading "-"-separated field is
        stored as the integer 'year'; non-string dates are dropped.
      * 'author' (strings only) is kept, and every " and "-separated name
        additionally gets a boolean flag key ``"author: First Last"``;
        non-string values are dropped.
      * 'embedding' is passed through untouched (no missing-value check).
      * 'authors' strings are split on " and " into a list of "First Last" names.
      * Any other key is kept unless its value is a missing scalar
        (None / NaN / NaT / pd.NA).

    Args:
        d: Mapping of column name to value for a single record.
        max_text_chars: Maximum number of characters of 'section_text' to keep.
            Defaults to 25000, the value previously hard-coded here.

    Returns:
        A new dict; ``d`` itself is not modified.

    Raises:
        TypeError / ValueError: if 'section_text' is not sliceable or
            'section_id' cannot be converted by ``int`` (e.g. NaN).
    """
    dropped_keys = {
        'section_summary', 'abstract', 'section_text_token_count',
        'urldate', 'number',
    }

    d2 = {}
    for k, v in d.items():
        if k in dropped_keys:
            continue

        if k == 'section_text':
            d2["text"] = v[:max_text_chars]
        elif k == "section_id":
            d2["section_id"] = int(v)
        elif k == "date":
            # Missing dates arrive as NaN (a float), so only strings are kept.
            if isinstance(v, str):
                d2["date"] = v
                d2["year"] = int(v.split("-")[0])
        elif k == "author":
            if isinstance(v, str):
                d2["author"] = v
                # One boolean flag per author so each name is its own key.
                for name in v.split(" and "):
                    d2["author: {}".format(_display_name(name))] = True
        elif k == "embedding":
            d2[k] = v
        elif pd.api.types.is_scalar(v) and pd.isna(v):
            # pd.isna returns an array for list/array values, and testing that
            # array's truth value raises ValueError; only scalars can be
            # "missing" here, containers are kept as-is below.
            continue
        elif k == "authors" and isinstance(v, str):
            d2[k] = [_display_name(name) for name in v.split(" and ")]
        else:
            d2[k] = v
    return d2

# One Document per corpus row: the section summary is the page content (the text
# that gets embedded later), and the cleaned-up row becomes the metadata.
documents = [
    Document(page_content=record['section_summary'], metadata=filter_nan(record))
    for record in sciscicorpus.to_dict('records')
]

In [4]:
from langchain_openai import OpenAIEmbeddings
# Embedding client; the model name is handed straight to OpenAIEmbeddings.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed every document's page_content, 128 texts per embed_documents call.
page_contents = [doc.page_content for doc in documents]
embedding_list = []
batch_size = 128
for start in tqdm.trange(0, len(page_contents), batch_size):
    batch = page_contents[start:start + batch_size]
    embedding_list.extend(embeddings.embed_documents(batch))

# embedding_list[k] is the vector for documents[k]; it is also stored as a
# DataFrame column. Document objects created before this assignment were built
# from the frame as it was then and do not pick up this column.
sciscicorpus["embedding"] = embedding_list

100%|██████████| 1/1 [00:01<00:00,  1.22s/it]


In [5]:
from pinecone import Pinecone
# Target index name and namespace come from the environment; both are None
# when the corresponding variable is unset.
sciscicorpus_index = os.environ.get("SCISCICORPUS_INDEX")
sciscicorpus_namespace = os.environ.get("SCISCICORPUS_NAMESPACE")

# Pinecone() is called without arguments — presumably it reads its API key
# from the environment populated by load_dotenv(); confirm against the client docs.
pc = Pinecone()
index = pc.Index(sciscicorpus_index)

In [6]:
# Upsert every section into Pinecone, one record per request.
#
# The vector is taken from embedding_list (computed from documents[i].page_content,
# in the same order) instead of from document.metadata["embedding"]: `documents`
# is built before the "embedding" column is added to the DataFrame, so on a fresh
# Restart & Run All that metadata key is either missing (KeyError) or holds a
# vector that was already in the dataset rather than the one computed above.
assert len(embedding_list) == len(documents), "expected one embedding per document"

for i in tqdm.trange(len(documents)):
    document = documents[i]
    # The vector travels in "values"; keep it out of the metadata payload.
    # A filtered copy leaves documents[i] untouched, so this cell can be re-run.
    metadata = {
        key: value
        for key, value in document.metadata.items()
        if key != "embedding"
    }

    index.upsert(
        vectors=[
            {
                # Record id: "<url>::<section_id>".
                "id": metadata['url'] + '::' + str(metadata['section_id']),
                "values": embedding_list[i],
                "metadata": metadata
            }
        ],
        namespace=sciscicorpus_namespace
    )

100%|██████████| 100/100 [00:14<00:00,  6.87it/s]
