### AI Semantic Search

In [115]:
import pinecone
from sentence_transformers import SentenceTransformer, util
import requests

In [116]:
from bs4 import BeautifulSoup

In [117]:
model = SentenceTransformer('all-MiniLM-L6-v2')   # 384 dim

In [118]:
def get_html_content(url):
    response = requests.get(url)
    return response.content

In [119]:
def text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    for script in soup(['script']):
        script.extract()
    return soup.get_text()

In [120]:
def split_text_chunks(plain_text, max_chars= 2000):
    text_chunks = []
    current_chunk = ""
    for line in plain_text.split("\n"):
        if len(current_chunk) + len(line) + 1 <= max_chars:
            current_chunk += line + " "
        else: 
            text_chunks.append(current_chunk.strip())
            current_chunk = line + " "
    if current_chunk:
        text_chunks.append(current_chunk.strip())
    return text_chunks                

In [121]:
def scrape_text_from_url(url, max_chars= 2000):
    html_content = get_html_content(url)
    plain_text = text(html_content)
    text_chunks = split_text_chunks(plain_text, max_chars)
    return text_chunks

In [122]:
url = input("Enter the url: ")                    # https://en.wikipedia.org/wiki/V_(singer) 
plain_text_chunks = scrape_text_from_url(url)
print(plain_text_chunks)

["Sachin Tendulkar - Wikipedia                                    Jump to content         Main menu      Main menu move to sidebar hide    \t\tNavigation \t  Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate     \t\tContribute \t  HelpLearn to editCommunity portalRecent changesUpload file     Languages  Language links are at the top of the page across from the title.                    Search                Create accountLog in       Personal tools      Create account Log in     \t\tPages for logged out editors learn more   ContributionsTalk                           Contents move to sidebar hide     (Top)      1Early life and background        2Early career   \t\t\t\tToggle Early career subsection \t\t\t     2.1Yorkshire          3International career   \t\t\t\tToggle International career subsection \t\t\t     3.1Early tours        3.2Rise through the ranks      3.2.11994–1996: ODI matches        3.2.21998: Australian competition        3.2.31999: Asian Test

In [123]:
len(plain_text_chunks)

96

In [124]:
plain_text_chunks[10]

'Tendulkar was the youngest player to debut for India in Tests at the age of 16 years and 205 days, and also the youngest player to debut for India in ODI at the age of 16 years and 238 days.[72][73] Tendulkar made his Test debut against Pakistan in Karachi in November 1989 aged 16 years and 205 days. He scored 15 runs, being bowled by Waqar Younis, who also made his debut in that match. He was noted for how he handled numerous blows to his body at the hands of the Pakistani pace attack.[74] In the fourth and final Test match in Sialkot, he was hit on the nose by a bouncer bowled by Younis, but he declined medical assistance and continued to bat even as he his nose gushed blood.[75] In a 20-over exhibition game in Peshawar, held in parallel with the bilateral series, Tendulkar made 53 runs off 18 balls, including an over in which he scored 27 runs bowled by leg-spinner Abdul Qadir.[76] This was later called "one of the best innings I have seen" by the then Indian captain Krishnamachari

In [153]:
import pinecone

In [126]:
pinecone.init(api_key="a88f41cf-3264-48d8-8a04-953f06214d55", environment = "us-central1-gcp")
index_name = pinecone.Index('semantic-search-ai')
index_name

<pinecone.index.Index at 0x2c5420c14f0>

In [127]:
index_name.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 208}},
 'total_vector_count': 208}

In [128]:
def addData(corpusData,url):
    id  = index_name.describe_index_stats()['total_vector_count']
    for i in range(len(corpusData)):
        chunk=corpusData[i]
        chunkInfo=(str(id+i),
                model.encode(chunk).tolist(),
                {'title': url,'context': chunk})
        index_name.upsert(vectors=[chunkInfo])

In [129]:
plain_text_chunks[20]

"In 2007, in a Test series during India's tour of Bangladesh, Tendulkar returned to his opening slot and was chosen as the Man of the Series.[164] He continued by scoring 99 and 93 in the first two matches of the Future Cup against South Africa. During the second match, he also became the first to score 15,000 runs in ODIs.[165] He was the leading run scorer and was adjudged the Man of the Series.[166][167]   Tendulkar, upon reaching his 38th Test century against Australia in the 2nd Test at the SCG in 2008, where he finished not out on 154 On the second day of the Nottingham Test on 28 July 2007, Tendulkar became the third cricketer to complete 11,000 Test runs.[168] In the subsequent one-day series against England, Tendulkar was the leading run scorer from India[169] with an average of 53.42. In the ODI Series against Australia in October 2007 Tendulkar was the leading Indian run scorer with 278 runs.[170] Tendulkar was dismissed five times in 2007 between 90 and 100, including three

In [130]:
addData(plain_text_chunks,url)

In [131]:
def find_match(query,k):
    query_em = model.encode(query).tolist()
    result = index_name.query(query_em, top_k=k, includeMetadata=True)
    
    return [result['matches'][i]['metadata']['title'] for i in range(k)],[result['matches'][i]['metadata']['context'] for i in range(k)]

* Using OpenAI

In [133]:
import openai
openai.api_key="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"
def create_prompt(context,query):
    header = "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' \n"
    return header + context + "\n\n" + query + "\n"

def generate_answer(prompt):
    response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt,
    temperature=0,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop = [' END']
    )
    return (response.choices[0].text).strip()

In [135]:
query = "Who is Kim Tae-hyung?"
docs,res = find_match(query,1)

In [136]:
res

['Sachin Ramesh Tendulkar, AO (/ˌsʌtʃɪn tɛnˈduːlkər/ (listen); pronounced\xa0[sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsman and cricketer in the history of cricket.[4] He is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively.[5] He also holds the record for receiving the most man-of-the-match awards in international cricket.[6] Sachin was a Member of Parliament, Rajya Sabha by nomination from 2012 to 2018.[7][8] Tendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to represent Mumbai domestically and India internationally for over 24 years.[9] In 2002, halfway through his career, Wisden ranked him the second-greatest Test batsman of all time, behind Don Bradman, and the second-greatest ODI

In [137]:
context= "\n\n".join(res)
prompt = create_prompt(context,query)
print(prompt)

Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' 
Sachin Ramesh Tendulkar, AO (/ˌsʌtʃɪn tɛnˈduːlkər/ (listen); pronounced [sətɕin teːɳɖulkəɾ]; born 24 April 1973) is an Indian former international cricketer who captained the Indian national team. He is widely regarded as one of the greatest batsman and cricketer in the history of cricket.[4] He is the all-time highest run-scorer in both ODI and Test cricket with more than 18,000 runs and 15,000 runs, respectively.[5] He also holds the record for receiving the most man-of-the-match awards in international cricket.[6] Sachin was a Member of Parliament, Rajya Sabha by nomination from 2012 to 2018.[7][8] Tendulkar took up cricket at the age of eleven, made his Test match debut on 15 November 1989 against Pakistan in Karachi at the age of sixteen, and went on to 

### Semantic Search Using Quora CSV data

* Vector Database: Pinecone

In [154]:
import pandas as pd

In [155]:
data = pd.read_csv('questions.csv')

In [156]:
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [171]:
pinecone.init(api_key="a88f41cf-3264-48d8-8a04-953f06214d55", environment = "us-central1-gcp")

In [172]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

In [173]:
embedding = model.encode("This is sentence")
len(embedding)

768

In [169]:
index= pinecone.Index('semantic-search-ai')

In [170]:
question_list = []
for i,row in data.iterrows():
  question_list.append(
      (
        str(row['id']),
        model.encode(row['question1']).tolist(),
        {
            'is_duplicate': int(row['is_duplicate']),
            'question1': row['question1']
        }
      )
  )
  if len(question_list) == 50 or len(question_list) == len(data):
    index.upsert(vectors = question_list)
    question_list = []  

PineconeProtocolError: Failed to connect; did you specify the correct index name?

In [163]:
query = "How do I prepare for civil service?"
xq = model.encode([query]).tolist()
result = index.query(xq, top_k = 2, includeMetadata=True)
result

MaxRetryError: HTTPSConnectionPool(host='semantic-search-ai-unknown.svc.us-central1-gcp.pinecone.io', port=443): Max retries exceeded with url: /query (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002C541F44430>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))