In [None]:
import pandas as pd
import datasets
import tqdm, os
from collections.abc import Iterable
from langchain_core.documents.base import Document

from datetime import datetime
from copy import deepcopy

# Dataset location written as "<repo_id>:<commit sha>"; the sha pins the exact snapshot.
huggingface_path = "ErzhuoShao/SciSciGPT-SciSciCorpus:475c99a8c2afab3c6a7e2e936d8b44c0137437b3"

# `load_dataset` takes the commit through its documented `revision=` argument, so the
# combined string is split instead of being passed as the repository path.
# A path without ":" yields an empty revision, which falls back to the default branch.
huggingface_repo, _sep, huggingface_revision = huggingface_path.partition(":")
sciscicorpus = datasets.load_dataset(
    huggingface_repo,
    revision=huggingface_revision or None,
    split="train",
).to_pandas()

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24946 [00:00<?, ? examples/s]

CPU times: user 7.6 s, sys: 12.2 s, total: 19.8 s
Wall time: 13.5 s


In [None]:
def filter_nan(d, max_text_chars=25000):
    """Turn one corpus record into a metadata dict with missing values removed.

    Args:
        d: Mapping of column name to value for a single row, e.g. one element
            of ``DataFrame.to_dict('records')``.
        max_text_chars: Maximum number of characters of ``section_text`` that
            are kept under the ``"text"`` key.

    Returns:
        A new dict in which:

        * ``section_summary``, ``abstract``, ``section_text_token_count``,
          ``urldate`` and ``number`` are dropped;
        * ``section_text`` is stored, truncated, as ``text`` (skipped when missing);
        * ``section_id`` is cast to ``int``;
        * a string ``date`` is kept, and its leading ``-``-separated component is
          stored as integer ``year`` when it parses as an integer;
        * a string ``author`` ("Last, First and Last, First") is kept and every
          name additionally gets an ``"author: First Last"`` key set to ``True``;
        * ``embedding`` is passed through unchanged;
        * a string ``authors`` is replaced by a list of "First Last" names;
        * every other field is kept unless its value is a missing scalar
          (``None``, NaN, ``pd.NA``, ``NaT``).

    Raises:
        ValueError, TypeError: if ``section_id`` cannot be converted to ``int``.
    """
    dropped_keys = {'section_summary', 'abstract', 'section_text_token_count', 'urldate', 'number'}

    def is_missing(value):
        # pd.isna() returns an array for list/array inputs, whose truth value is
        # ambiguous (ValueError); only scalars can be "missing" here.
        return pd.api.types.is_scalar(value) and pd.isna(value)

    def flip_name(name):
        # "Last, First" -> "First Last"
        return " ".join(name.split(", ")[::-1])

    d2 = {}
    for k, v in d.items():
        if k in dropped_keys:
            continue

        if k == 'section_text':
            if not is_missing(v):
                d2["text"] = v[:max_text_chars]
            continue

        if k == "section_id":
            d2["section_id"] = int(v)
            continue

        if k == "date":
            # Non-string dates (including missing ones) are dropped entirely.
            if isinstance(v, str):
                d2["date"] = v
                try:
                    d2["year"] = int(v.split("-")[0])
                except ValueError:
                    # Keep the raw date even when no year can be derived from it.
                    pass
            continue

        if k == "author":
            if isinstance(v, str):
                d2["author"] = v
                # One boolean flag per author, keyed by the display name.
                for raw_name in v.split(" and "):
                    d2["author: {}".format(flip_name(raw_name))] = True
            continue

        if k == "embedding":
            d2[k] = v
            continue

        if is_missing(v):
            continue

        d2[k] = v
        if k == "authors" and isinstance(v, str):
            d2["authors"] = [flip_name(raw_name) for raw_name in v.split(" and ")]
    return d2

# Build one Document per corpus row: the section summary is the page content
# and the cleaned record (see filter_nan) is the metadata.
documents = []
for record in sciscicorpus.to_dict('records'):
    record_metadata = filter_nan(record)
    documents.append(Document(page_content=record['section_summary'], metadata=record_metadata))

In [None]:
from getpass import getpass

from langchain_openai import OpenAIEmbeddings

# getpass keeps the key from being echoed into the saved notebook output.
openai_api_key = getpass("Please enter your OpenAI API key: ")
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=openai_api_key)

# Embed the page contents in batches of 128 documents per request.
page_contents = [i.page_content for i in documents]
embedding_list = []
for i in tqdm.trange(0, len(documents), 128):
    embedding_list += embeddings.embed_documents(page_contents[i:i+128])
sciscicorpus["embedding"] = embedding_list

# `documents` was built before these vectors existed, and the upload cell reads the
# vector from each document's metadata, so attach the fresh embeddings here as well.
# Without this a top-to-bottom run uploads stale vectors (or fails on the missing key).
for doc, vector in zip(documents, embedding_list):
    doc.metadata["embedding"] = vector

In [None]:
from pinecone import Pinecone
# Target index, and a namespace named after today's date (MMDDYYYY) so that
# each day's upload lands in its own namespace.
pinecone_index_name = "SciSciCorpus"
pinecone_namespace = f"{datetime.now():%m%d%Y}"

# No credentials are passed explicitly here; presumably the client reads them
# from the environment (e.g. PINECONE_API_KEY) — confirm before running.
pc = Pinecone()
index = pc.Index(pinecone_index_name)

In [None]:
# Upload every document as one vector, identified by "<url>::<section_id>".
# NOTE(review): this issues one request per record; Pinecone accepts several vectors
# per upsert call, so batching could shorten the run — check request-size limits first.
for i in tqdm.trange(len(documents)):
    # Shallow-copy only the metadata: the vector has to be removed from it before
    # upload, and the shared `documents` list must stay untouched. This avoids
    # deep-copying the full text and embedding of every record.
    metadata = dict(documents[i].metadata)
    embedding = metadata.pop("embedding")

    index.upsert(
        vectors=[
            {
                "id": metadata['url'] + '::' + str(metadata['section_id']),
                "values": embedding,
                "metadata": metadata
            }
        ],
        namespace=pinecone_namespace
    )

  0%|          | 0/22328 [00:00<?, ?it/s]

100%|██████████| 22328/22328 [35:16<00:00, 10.55it/s] 
