In [32]:
import datetime
import io
import json
import os
import uuid

import pandas
import pinecone
import requests
from dotenv import load_dotenv
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader, TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.pinecone import Pinecone
from langchain.vectorstores.supabase import SupabaseVectorStore
from supabase.client import Client, create_client
from tqdm import tqdm
load_dotenv()

True

In [38]:
# Supabase client shared by the cells below. Both credentials are read from the
# process environment, which load_dotenv() populated when the imports cell ran.
supabase: Client = create_client(
    supabase_url=os.getenv("SUPABASE_URL"),
    supabase_key=os.getenv("SUPABASE_SERVICE_KEY"),
)

In [39]:
# Shared diagnostic log. The PDF helpers defined below append the URLs they
# process, error markers and caught exception objects to this list.
logs = []

In [40]:

def clean_up_pdf_json(json_data):
    """Extract the body paragraphs from a list of parsed PDF elements.

    Each element is expected to be a mapping with a 'type' key and, for
    'Title' and 'NarrativeText' elements, a 'text' key (presumably the
    element list returned by the Unstructured API -- confirm against caller).

    Only 'NarrativeText' elements longer than 150 characters are kept.
    Scanning stops at the first 'Title' whose text contains "references" or
    "works cited" (case-insensitive), provided that title appears after
    position len(json_data) / 3.5, so an early mention of references (e.g. in
    a table of contents) does not cut the document short.

    Args:
        json_data: sequence of element mappings.

    Returns:
        list[str]: the retained paragraph texts, in document order. If an
        element cannot be processed, the failure is appended to ``logs`` and
        the paragraphs collected up to that point are returned.
    """
    result = []
    try:
        for i, node in enumerate(json_data):
            node_type = node['type']
            if node_type == 'Title':
                title = node['text'].lower()
                is_references_heading = 'references' in title or 'works cited' in title
                if is_references_heading and i > len(json_data) / 3.5:
                    # Reaching the bibliography is the normal end of the body,
                    # not an error, so stop quietly instead of raising.
                    break
            elif node_type == 'NarrativeText':
                text = node['text']
                # Short fragments are dropped; only paragraphs over 150 chars are kept.
                if len(text) > 150:
                    result.append(text)
    except Exception as e:
        # Best effort: record the genuine failure and keep what was collected.
        logs.append('error in clean_up_pdf_json')
        logs.append(e)
    return result

def clean_up_pdf(pdf_url):
    """Download a PDF and return its cleaned body paragraphs.

    The PDF is fetched from ``pdf_url``, uploaded to the hosted Unstructured
    API for partitioning, and the returned elements are filtered with
    ``clean_up_pdf_json``.

    The API key is read from the ``UNSTRUCTURED_API_KEY`` environment variable
    (for example via the notebook's ``.env`` file) rather than being embedded
    in the notebook. The key that used to be hardcoded here should be treated
    as leaked and rotated.

    Args:
        pdf_url: URL of the PDF to process.

    Returns:
        list[str]: paragraph texts; an empty list when the download or the
        Unstructured call does not return HTTP 200, or when the API response
        body is empty. Failures are recorded in ``logs``.

    Raises:
        RuntimeError: if ``UNSTRUCTURED_API_KEY`` is not set.
        requests.RequestException: on network-level failures.
        ValueError: if the API response body is not valid JSON.
    """
    # Log the URL actually being processed (the parameter, not a notebook global).
    logs.append(pdf_url)

    api_key = os.environ.get("UNSTRUCTURED_API_KEY")
    if not api_key:
        raise RuntimeError("UNSTRUCTURED_API_KEY is not set; add it to the environment or .env file")

    download = requests.get(pdf_url)
    if download.status_code != 200:
        logs.append('ERROR: pdf download status code is ' + str(download.status_code))
        return []

    headers = {
        'accept': 'application/json',
        'unstructured-api-key': api_key
    }

    # Upload straight from memory: no temp file to clean up and no file handle
    # left open. The unique name is kept as the multipart filename.
    filename = 'temp' + str(uuid.uuid4()) + '.pdf'
    files = {'files': (filename, download.content)}

    response = requests.post('https://api.unstructured.io/general/v0/general', headers=headers, files=files)
    if response.status_code != 200:
        logs.append('ERROR: unstructured status code is ' + str(response.status_code))
        logs.append(response.text)
        return []

    text = response.text
    if not text:
        return []
    return clean_up_pdf_json(json.loads(text))

In [41]:
# Pinecone destination (index + namespace) that receives the paper embeddings.
index_name = "cadmir"
namespace = "cad1"

# Connect using credentials taken from the environment.
pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment=os.environ.get("PINECONE_ENV"))

# Create the index on first use.
# NOTE(review): dimension 1536 presumably matches the OpenAIEmbeddings output size -- confirm.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(name=index_name, metric='cosine', dimension=1536)
    print("created index")

# Paper list to ingest (a 'DOI' column is read from it later) and the embedding model.
papersDF = pandas.read_csv('csv-microRNAAN-set (1).csv')
embedding = OpenAIEmbeddings()

# Bookkeeping lists for the ingestion run.
loader_not_works, no_url, works = [], [], []

In [43]:
# Ingest every open-access paper in papersDF: look the DOI up on Unpaywall,
# extract the PDF text, chunk it, embed the chunks into Pinecone and record the
# DOI in the Supabase "papers" table.
#
# Bookkeeping: `no_url` collects DOIs whose lookup failed or that have no PDF
# URL, `loader_not_works` collects DOIs whose extraction/embedding failed, and
# `works` collects DOIs that were stored successfully. Papers that Unpaywall
# reports as not open access are skipped without being counted.

# Rows whose index is <= this value are skipped.
# NOTE(review): presumably a manual resume point from an earlier run -- confirm.
resume_after_index = 4

# The splitter does not depend on the paper, so build it once.
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 0,
    length_function = len,
    is_separator_regex = False,
)

# The context manager closes the bar even when the run is interrupted, so a
# re-run does not leave a stale progress bar behind.
with tqdm(total=len(papersDF)) as pbar:
    for index, row in papersDF.iterrows():
        pbar.update(1)
        pbar.refresh()
        if index != 0:
            pbar.set_description(f"{(len(no_url) + len(loader_not_works)) / index * 100}% failure")
        if index <= resume_after_index:
            continue
        doi = row['DOI']
        try:
            req = requests.get(f"https://api.unpaywall.org/v2/{doi}?email=ryandu9221@gmail.com")
        except requests.RequestException as error:
            # A single network failure should not abort a multi-hour run.
            no_url.append(doi)
            print(error)
            continue
        if req.status_code != 200:
            no_url.append(doi)
            continue

        record = req.json()  # parse the response once
        if record.get('is_oa') is not True:
            continue
        oa = record.get('best_oa_location')
        if oa is None:
            no_url.append(doi)
            print("no oa " + doi)
            continue
        url = oa.get('url_for_pdf')
        if url is None:
            no_url.append(doi)
            print("no url " + doi)
            continue
        try:
            cleaned_text = clean_up_pdf(url)
            if not cleaned_text:
                # clean_up_pdf returns [] on failure; without this guard the paper
                # would be recorded as ingested although nothing was embedded.
                raise ValueError(f"no usable text extracted for {doi}")
            # NOTE(review): paragraphs are concatenated without a separator, so the
            # splitter cannot see paragraph boundaries -- confirm this is intended.
            cleaned_text_batch = ''.join(cleaned_text)

            chunks = splitter.create_documents([cleaned_text_batch])
            add_date = str(datetime.datetime.now().date())
            for chunk in chunks:
                chunk.metadata["doi"] = doi
                chunk.metadata["add_date"] = add_date
            embedded = Pinecone.from_documents(chunks, embedding, index_name=index_name, namespace=namespace)
            supabase.table("papers").insert({"doi": doi}).execute()
            works.append(doi)
        except Exception as error:
            loader_not_works.append(doi)
            print(error)

0.0% failure:   0%|          | 9/4924 [01:08<10:26:30,  7.65s/it]
0.0% failure:   0%|          | 8/4924 [00:36<7:22:38,  5.40s/it]

KeyboardInterrupt: 

# Test Cycle

In [None]:
# DOI of the single paper used by the test cells below.
test_doi = "10.1038/s41584-020-0426-0"

In [None]:
# Single-paper test: resolve test_doi through Unpaywall, load the PDF with the
# LangChain loaders, split it into chunks and tag each chunk with its DOI and
# the ingestion date.
req = requests.get(f"https://api.unpaywall.org/v2/{test_doi}?email=ryandu9221@gmail.com")
req.raise_for_status()  # fail with the HTTP error instead of on a malformed body

best_location = req.json().get('best_oa_location')
if best_location is None:
    # Without this check the chained .get() below fails with an opaque AttributeError.
    raise ValueError(f"no open-access location for {test_doi}")
url = best_location.get('url')

fileloader = OnlinePDFLoader(url)
loader = UnstructuredPDFLoader(fileloader.file_path)
splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 500,
    chunk_overlap  = 0,
    length_function = len,
    is_separator_regex = False,
)
chunks = loader.load_and_split(text_splitter=splitter)

add_date = str(datetime.datetime.now().date())
for chunk in chunks:
    chunk.metadata["doi"] = test_doi
    chunk.metadata["add_date"] = add_date
    # Drop the loader-provided "source" entry if present; the default keeps the
    # cell from raising KeyError when a chunk has no such key.
    chunk.metadata.pop("source", None)

In [None]:
# Embed the test chunks and store them in the Supabase "knowledge" table
# through LangChain's Supabase vector store.
embedded = SupabaseVectorStore.from_documents(
    chunks,
    embedding,
    client=supabase,
    table_name="knowledge",
)