In [3]:
import sys
# sys.path.append('..')
sys.path.insert(0, '..')

### Set OpenAI key 

In [13]:
import os
import configparser

config = configparser.ConfigParser()
config.read('../../.secrets.ini')
OPENAI_API_KEY = config['OPENAI']['OPENAI_API_KEY']
YOUTUBE_KEY = config['YOUTUBE']['YOUTUBE_API_KEY']
NAVER_CLIENT_ID = config['NAVER']['NAVER_CLIENT_ID']
NAVER_CLIENT_SECRET = config['NAVER']['NAVER_CLIENT_SECRET']
GOOGLE_SEARCH_KEY = config['GOOGLE']['GOOGLE_API_KEY']
CSE_ID = config['GOOGLE']['CSE_ID']
SERPAPI_API_KEY = config['SERPAPI']['SERPAPI_API_KEY']


os.environ.update({'OPENAI_API_KEY': OPENAI_API_KEY})
os.environ.update({'YOUTUBE_KEY': YOUTUBE_KEY})
os.environ.update({'NAVER_CLIENT_ID': NAVER_CLIENT_ID})
os.environ.update({'NAVER_CLIENT_SECRET': NAVER_CLIENT_SECRET})
os.environ.update({'GOOGLE_SEARCH_KEY': GOOGLE_SEARCH_KEY})
os.environ.update({'CSE_ID': CSE_ID})
os.environ.update({'SERPAPI_API_KEY': SERPAPI_API_KEY})

In [20]:
from tools import SearchByURLTool
search_by_url_tool = SearchByURLTool()

In [21]:
import ast
import asyncio
import requests
from typing import Any, Dict, Optional, Type, Union
from pydantic import BaseModel, Field, root_validator

from langchain.tools import BaseTool
from bs4 import BeautifulSoup

from api import KostatAPI, GallupAPI, YoutubeAPI, GoogleSearchAPI, NaverSearchAPI, SerpApiSearch
from models.llm import UnifiedSummaryChunkChain, WebIntegrateSearchChain

class SearchToolInputSchema(BaseModel):
    query: str
    question: str

class SearchTool(BaseTool):
    category_api_dict = {
        'all': [KostatAPI(), YoutubeAPI(), GoogleSearchAPI(), NaverSearchAPI()],
        # 'all': [KostatAPI(), GallupAPI(), YoutubeAPI(), SerpApiSearch(), NaverSearchAPI()],
        'kostat': [KostatAPI()],
        'gallup': [GallupAPI()],
        'youtube': [YoutubeAPI()],
        'google': [GoogleSearchAPI()],
        # 'google': [SerpApiSearch()],
        'naver': [NaverSearchAPI()],
        # TODO: Add more APIs
    }
    name = "search_tool"
    description = "A tool to search on internet. The input consists of a 'query' and a 'question', where 'query' is the search query and 'question' is the information you want to get. For example, {'query': 'bitcoin price history', 'question': 'what was the price of bitcoin in 2022?'} would be the input if you want to know the price of bitcoin in 2022. Input must contain both a query and a question."
    args_schema: Optional[Type[BaseModel]] = SearchToolInputSchema
    """Pydantic model class to validate and parse the tool's input arguments."""
    summary_chunk_chain: Any = None
    search_by_url_tool: Any = None
    web_integrate_search_chain: Any = None

    def __init__(self, search_by_url_tool, summary_chunk_chain=None, summary_chunk_template=None, input_variables=None) -> None:
        super().__init__()
        if summary_chunk_chain is None:
            self.summary_chunk_chain = UnifiedSummaryChunkChain(verbose=True)
        else:
            self.summary_chunk_chain = summary_chunk_chain
        self.search_by_url_tool = search_by_url_tool
        self.web_integrate_search_chain = WebIntegrateSearchChain(verbose=True)

    def search(self, query, category='all', top_k=5):
        api_list = self.category_api_dict[category]
        result_dict = {}
        for api in api_list:
            result = api.search(query, top_k=top_k)
            result_dict[api.name] = result
        return result_dict

    async def async_search(self, query, category='all'):
        api_list = self.category_api_dict[category]
        result_dict = {}
        
        # Run all API searches concurrently using asyncio.gather
        results = await asyncio.gather(*[api.async_search(query) for api in api_list])

        # Map results to their respective API names
        for api, result in zip(api_list, results):
            result_dict[api.name] = result

        return result_dict

    def _parse_input(self, tool_input: Union[str, Dict]) -> Union[str, Dict[str, Any]]:
        """Convert tool input to pydantic model."""
        input_args = self.args_schema
        if isinstance(tool_input, str):
            input_dict = ast.literal_eval(tool_input)
            
            return input_dict
        else:
            if input_args is not None:
                result = input_args.parse_obj(tool_input)
                return {k: v for k, v in result.dict().items() if k in tool_input}
        return tool_input


    def _run(self, query, question=None) -> dict:
        query = query.replace('"', '').replace("'", '')
        if question is None:
            question = query
        search_results = self.search(query, category='google', top_k=5)['google_search']
        if len(search_results) == 0:
            return {
                "status": "fail",
                "message": "No results found."
            }
        # chunk = search_result_list[0]
        # url = chunk['data_path']
        # 여기서 chain을 사용하여 전체 검색결과에서 답을 찾을 수 있는지 확인한다.
        final_answer_type, final_answer, final_answer_url = self.web_integrate_search_chain.run(purpose=question, search_results=search_results)
        # 결과는 답이 나오거나 or 추가 검색을 위한 url이 나온다.
        if final_answer_type == 'A':
            return {
                "status": "success",
                "answer": final_answer,
                "url": final_answer_url
            }
        elif final_answer_type == 'B':
            url = final_answer_url
            single_webpage_result = self.search_by_url_tool._run(url)
            if single_webpage_result['status'] == 'error':
                return {
                    "status": "fail",
                    f"message": f"No results found on {url}"
                    }
            single_webpage_content = single_webpage_result['content']
            # 여기서 semantic search로 답과 관련된 부분을 찾기
            summary_result = self.summary_chunk_chain.run(chunk=single_webpage_content, question=question)
            question = question if question else query
            return summary_result
        elif final_answer_type == 'C':
            return {
                "status": "fail",
                "message": "No Useful results found."
            }

    async def _arun(self, query) -> dict:
        # Not implemented
        return await self.async_search(query, category='google')
    
class SearchByURLTool(BaseTool):
    name = "search_by_url_tool"
    description = "A tool to search on internet by url."
    args_schema: Optional[Type[BaseModel]] = None
    """Pydantic model class to validate and parse the tool's input arguments."""

    def _run(self, query: str) -> dict:
        # Trim single and double quotes
        url = query.strip("'\"")
        
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
            }
            response = requests.get(url, headers=headers)
            
            # Check if the request was successful
            response.raise_for_status()

            # Use BeautifulSoup to parse the content and extract text
            soup = BeautifulSoup(response.content, 'html.parser')
            text_content = soup.get_text(separator=' ', strip=True)
            
            return {
                "status": "success",
                "content": text_content
            }
        except requests.RequestException as e:
            # Handle any exceptions that occurred during the request
            return {
                "status": "error",
                "message": str(e)
            }

    async def _arun(self, query) -> dict:
        # You might want to use an async library like 'httpx' here for asynchronous requests
        pass

In [22]:
search_tool = SearchTool(search_by_url_tool=search_by_url_tool)

In [57]:
search_results = search_tool.search(query='미국 금리', category='google', top_k=5)['google_search']

In [58]:
web_integrate_search_template = '''
Based on your goals and the web search results.
Return the Answer Type(A,B or C) and Answer Description as output.
I want to you to return one of the Action Types below.
<Answer Types>
A. If the search results have an answer, return that answer and URL address where the answer was found.
B. If the answer is likely to be found on a specific page in the search results, return the url.
C. Return "No Useful Infomation" if there is no relevant answer in the search results.
</Answer Types>

Use the following format:
<format>
<Purpose> The purpose you need to solve or the question you need to answer </Purpose>
<Web Search Result> Web search results </Web Search Result>
<Thought> Thoughts for answering </Thought>
<Answer Type> A, B or C </<Answer Type>
<Answer Description> answer for each type of answer type A,B or C </Answer Description>
<Answer link> URL address where the answer was found </Answer link>
</format>

Begin!
<format>
<Purpose> {purpose} </Purpose>
<Web Search Result> {search_results} </Web Search Result>
<Thought> 
'''

In [27]:
from models.llm.chain import BaseChain
from typing import List, Dict

In [51]:
import re

class WebIntegrateSearchChain(BaseChain):
    def __init__(self, 
                web_integrate_search_template=None, 
                input_variables:List[str]=None,
                web_integrate_search_template_path='../openai_skt/models/templates/web_integrate_search_prompt.txt', 
                model='gpt-4', 
                verbose=False) -> None:
        super().__init__(template=web_integrate_search_template, input_variables=input_variables, template_path=web_integrate_search_template_path, model=model, verbose=verbose)

    def run(self, purpose:str=None, search_results:List[Dict[str,str]]=None):
        return super().run(purpose=purpose, search_results=search_results)
    
    async def arun(self, purpose:str=None, search_results:List[Dict[str,str]]=None):
        return await super().arun(purpose=purpose, search_results=search_results)

    def parse_input(self, purpose:str=None, search_results:List[Dict[str,str]]=None):
        search_results_text = ''
        for idx, search_result in enumerate(search_results):
            search_results_text += f"web_result_{idx}(title:{search_result['title']}\tdescription:{search_result['description']}\turl:{search_result['data_path']})\n"
        return {
            "purpose": purpose,
            "search_results": search_results_text
        }        

    def parse_output(self, output:str=None):
        print(output)
        match = re.search(r'<Answer Type>\s*(\w+)\s*</Answer Type>\s*<Answer Description>\s*([^<]+)\s*</Answer Description>\s*<Answer link>\s*([^<]+)\s*</Answer link>', output)
        if match:
            final_answer_type = match.group(1) or ''
            final_answer = match.group(2) or ''
            final_answer_url = match.group(3) or ''
        return final_answer_type, final_answer, final_answer_url

In [52]:
web_integrate_search_chain = WebIntegrateSearchChain(web_integrate_search_template, input_variables=['purpose', 'search_results'], verbose=True)

In [59]:
final_answer_type, final_answer, final_answer_url = web_integrate_search_chain.run(purpose='현재 미국 금리가 얼마야', search_results=search_results)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Based on your goals and the web search results.
Return the Answer Type(A,B or C) and Answer Description as output.
I want to you to return one of the Action Types below.
<Answer Types>
A. If the search results have an answer, return that answer and URL address where the answer was found.
B. If the answer is likely to be found on a specific page in the search results, return the url.
C. Return "No Useful Infomation" if there is no relevant answer in the search results.
</Answer Types>

Use the following format:
<format>
<Purpose> The purpose you need to solve or the question you need to answer </Purpose>
<Web Search Result> Web search results </Web Search Result>
<Thought> Thoughts for answering </Thought>
<Answer Type> A, B or C </<Answer Type>
<Answer Description> answer for each type of answer type A,B or C </Answer Description>
<Answer link> URL address where the answer was found </Answer link>
</forma


[1m> Finished chain.[0m
The search results do not directly provide the current interest rate in the United States. However, the second search result seems to be a site that provides historical data and predictions for US interest rates, which may contain the current interest rate.
</Thought>
<Answer Type> B </Answer Type>
<Answer Description> The current US interest rate is likely to be found on the page indicated by the second search result. </Answer Description>
<Answer link> https://ko.tradingeconomics.com/united-states/interest-rate </Answer link>
</format>


In [60]:
final_answer_type, final_answer, final_answer_url

('B',
 'The current US interest rate is likely to be found on the page indicated by the second search result. ',
 'https://ko.tradingeconomics.com/united-states/interest-rate ')