In [1]:
from typing import Optional, List, Dict

import os
import openai
# Take the OpenAI API key from the environment instead of hard-coding it;
# the subscript lookup raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

from langchain.schema import Document
from langchain.document_loaders import UnstructuredExcelLoader
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.csv_loader import UnstructuredCSVLoader
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS, Chroma #pip install chromadb==0.4.15
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [2]:
import csv
# Columns whose "name: value" lines are joined into each document's text
# (page_content), i.e. the part that gets embedded.
columns_to_embed = ["주제","도메인","논문명","IF"]
# Columns attached to each document as metadata instead of being embedded.
columns_to_metadata = ["행","저널명"]

# Build one Document per CSV row.
docs = []
with open('/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference/GNN 논문리스트.csv', newline="", encoding='utf-8-sig') as csvfile:
    for row in csv.DictReader(csvfile):
        # csv.DictReader fills the missing trailing fields of a short row with
        # None. Treat those as empty strings so .strip() cannot raise
        # AttributeError and no None value ends up in the metadata.
        to_metadata = {
            col: (row[col] or "")
            for col in columns_to_metadata
            if col in row
        }
        to_embed = "\n".join(
            f"{col.strip()}: {(row[col] or '').strip()}"
            for col in columns_to_embed
            if col in row
        )
        docs.append(Document(page_content=to_embed, metadata=to_metadata))

# Split each document on newlines, targeting 500-character chunks (measured
# with len) and no overlap between consecutive chunks.
splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
)
documents = splitter.split_documents(docs)

In [9]:
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.agents import create_pandas_dataframe_agent

# Chat model handed to the self-query retriever below; temperature=0 is passed
# to keep its output as deterministic as the API allows.
llm = ChatOpenAI(temperature=0)

# Attributes the self-query retriever is told it may filter on.
# NOTE(review): the CSV loader cell only attaches "행" and "저널명" as metadata,
# while "주제", "도메인", "논문명" and "IF" are written into page_content. Filters
# generated on those four attributes may therefore match nothing — confirm and
# align the loader's metadata with this list.
metadata_field_info = [
    AttributeInfo(
        name="주제",
        description="Indicates the subject category of the paper",
        type="string",
    ),
    AttributeInfo(
        name="도메인",
        description="Indicates what data was used in the paper",
        type="string",
    ),
    AttributeInfo(
        name="논문명",
        # Fixed: this description used to be a copy of the "도메인" one.
        description="Indicates the title of the paper",
        type="string",
    ),
    AttributeInfo(
        name="IF",
        description="Indicates the impact factor of the journal in which the paper is published",
        type="float",
    ),
    AttributeInfo(
        name="저널명",
        description="Represents the journal in which the paper was published",
        type="string",
    ),
]
# https://www.neum.ai/post/llm-spreadsheets
# vectorstore = FAISS.from_documents(documents, OpenAIEmbeddings())

# Embed the split documents with OpenAI embeddings and store them in a Chroma
# collection under persist_directory.
# NOTE(review): re-running this cell against an existing persist_directory
# looks like it adds the same documents again (duplicate hits) — confirm, and
# consider loading the existing store instead of rebuilding it.
vectorstore = Chroma.from_documents(documents,
                                    OpenAIEmbeddings(),
                                    persist_directory='./Retrieval_Database/Vector_Database/paper_info_chroma_openai_128_sentence_splitter')

# Natural-language summary of the collection given to the query constructor.
# Fixed typo: "natural network" -> "neural network".
document_content_description = "Graph neural network paper list and information for each paper"

# Retriever built from the LLM, the vector store, the content description and
# the attribute list above; asks the store for 4 documents per query.
retriever = SelfQueryRetriever.from_llm(
    llm, vectorstore, document_content_description, metadata_field_info, search_kwargs={"k": 4},verbose=True
)

In [10]:
# Smoke-test the self-query retriever with a short keyword query; the returned
# list of documents is displayed as the cell output.
dropedge_query = "dropedge?"
retriever.get_relevant_documents(dropedge_query)

[Document(page_content='주제: Model\n도메인: \n논문명: DROPEDGE: TOWARDS DEEP GRAPH CONVOLU- TIONAL NETWORKS ON NODE CLASSIFICATION\nIF:', metadata={'저널명': 'arXiv ', '행': '29'}),
 Document(page_content='주제: Model\n도메인: \n논문명: A Graph Convolutional Incorporating GRU Network for Landslide Displacement Forecasting Based on Spatiotemporal Analysis of GNSS Observations\nIF: 5.349', metadata={'저널명': 'MDPI remote sensing', '행': '56'}),
 Document(page_content='주제: Model\n도메인: Traffic\n논문명: Graph-GAN: A spatial-temporal neural network for short-term passenger flow prediction in urban rail transit systems\nIF: 9.69', metadata={'저널명': 'transportation research part c', '행': '6'}),
 Document(page_content='주제: \n도메인: Covid\n논문명: Transportation, germs, culture: a dynamic graph model of COVID-19 outbreak\nIF: 3.54', metadata={'저널명': 'Quantitative Biology', '행': '73'})]

In [14]:
# Load the Excel workbook in "elements" mode.
loader = UnstructuredExcelLoader(
    "/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference/GNN 논문리스트.xlsx",
    mode="elements",
)
documents = loader.load()

# Split the loaded documents on newlines, targeting 256-character chunks with
# a 30-character overlap.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=256,
    chunk_overlap=30,
)
docs = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and index them in FAISS.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)

# "refine" retrieval-QA chain over the 4 documents the retriever returns;
# the source documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="refine",
    retriever=vectorstore.as_retriever(search_kwargs=dict(k=4)),
    return_source_documents=True,
)

In [10]:
# Load the CSV export of the paper list in "elements" mode.
loader = UnstructuredCSVLoader(
    "/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference/GNN 논문리스트.csv",
    mode="elements",
)
docs = loader.load()

In [15]:
# Print the HTML table stored in the first loaded document's metadata.
first_doc = docs[0]
print(first_doc.metadata["text_as_html"])

<table border="1" class="dataframe">
  <tbody>
    <tr>
      <td></td>
      <td>주제</td>
      <td>도메인</td>
      <td>논문명</td>
      <td>IF</td>
      <td>저널명</td>
    </tr>
    <tr>
      <td>1.0</td>
      <td></td>
      <td>Covid, weather</td>
      <td>The impact of weather condition and social activity on COVID-19 transmission in the United States</td>
      <td>6.789</td>
      <td>Journal of Environmental Management</td>
    </tr>
    <tr>
      <td>2.0</td>
      <td></td>
      <td>Covid</td>
      <td>A deep spatio-temporal meta-learning model for urban traffic revitalization index prediction in the COVID-19 pandemic</td>
      <td>5.603</td>
      <td>Advanced Engineering Informatics</td>
    </tr>
    <tr>
      <td>3.0</td>
      <td>Model</td>
      <td></td>
      <td>GNNExplainer: Generating Explanations for Graph Neural Networks</td>
      <td></td>
      <td>arXiv</td>
    </tr>
    <tr>
      <td>4.0</td>
      <td></td>
      <td>Covid</td>
      <td>A Spatial-Tem

In [6]:
# Ask the retrieval-QA chain for the titles of the papers in the Covid domain.
res = qa(dict(query='Covid 도메인의 논문이름 알려줘'))
# Open question: how should k be set? Should it be passed in as a Streamlit input?
(res["result"], res["source_documents"])

('\nThe Impact of Weather Condition and Social Activity on COVID-19 Transmission in the United States: An Epidemiological Neural Network Exploiting Dynamic Graph Structured Data Applied to the COVID-19 Outbreak, as published in the AAAI Digital Library.  Predicting the Dynamics of the COVID-19 Pandemic in the United States Using Graph Theory-Based Neural Networks, as published in MDPI IJERPH. The COVID-19 Infection Diffusion in the US and Japan: A Graph-Theoretical Approach, as published in MDPI Biology.',
 [Document(page_content='주제\n도메인\n논문명\nIF\n저널명\n1.0\nCovid, weather\nThe impact of weather condition and social activity on COVID-19 transmission in the United States\n6.789\nJournal of Environmental Management\n2.0\nCovid', metadata={'source': '/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference/GNN 논문리스트.xlsx', 'filename': 'GNN 논문리스트.xlsx', 'file_directory': '/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference', 'last_modified': '2023-10-22T20:42:38', 'filetype': 'application/v

https://teddylee777.github.io/langchain/langchain-tutorial-04/

엑셀 대신 CSV로 읽어 와 pandas 데이터프레임으로 만들면, 그 데이터프레임을 대상으로 추가 작업(질의 등)을 할 수 있음.

Predicting the Dynamics of the COVID-19 Pandemic in the United States Using Graph Theory-Based Neural Networks, 

The COVID-19 Infection Diffusion in the US and Japan: A Graph-Theoretical Approach, Published in MDPI Biology;

A Spatial-Temporal Graph Based Hybrid Infectious Disease Model with Application to COVID-19, Published on arXiv;

Spatial Visualization of Cluster-Specific COVID-19 Transmission Network in South Korea During the Early Epidemic Phase, Published on arXiv;

Traffic Model for COVID-19 Spread Prevention and Mitigation, Published on arXiv;

Prediction of the confirmed cases and deaths of global COVID-19 using artificial intelligence, Published in Environmental Science and Pollution Research;

COVID-19 Forecasting With Global AI-Powered Surveillance System, Published on arXiv;

Forecasting COVID-19 confirmed cases in South Korea using Spatio-Temporal Graph Neural Networks, Published on the Korea Contents Association;

The Prediction of COVID-19 Infection and Death Rates Using Artificial Intelligence, Published on ar… (출력이 여기서 잘림)

메모: 임베딩이 잘못됨