### AI Semantic Search

In [29]:
import pinecone
from sentence_transformers import SentenceTransformer, util   
import requests
from tqdm.autonotebook import tqdm

In [30]:
from bs4 import BeautifulSoup

In [31]:
# Embedding model for the web-page section; addData() and find_match() read this
# global to encode page chunks and queries.
model = SentenceTransformer('all-MiniLM-L6-v2')   # 384 dim

* 'all-MiniLM-L6-v2' - This is a sentence-transformers model: it maps sentences and paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.

In [32]:
def get_html_content(url, timeout=10):
    """Download a page and return its raw body as bytes.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response body (``response.content``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Without this check an error page would be chunked and indexed
            as if it were the requested article.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content

In [33]:
def text(html_content):
    """Return the text of an HTML document without script or style contents.

    Args:
        html_content: HTML markup (str or bytes) to parse.

    Returns:
        The concatenated text of the remaining document, as produced by
        ``BeautifulSoup.get_text()``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # <style> bodies are CSS, not prose; leaving them in would put stylesheet
    # rules into the chunks that get embedded, just like <script> bodies.
    for tag in soup(['script', 'style']):
        tag.extract()
    return soup.get_text()

In [34]:
def split_text_chunks(plain_text, max_chars= 2000):
    """Pack the lines of ``plain_text`` into chunks of at most ``max_chars`` characters.

    Lines are joined with a single space. A new chunk is started whenever
    adding the next line (plus its separator) would exceed ``max_chars``.
    A single line that is longer than ``max_chars`` is cut into
    ``max_chars``-sized pieces so no chunk exceeds the limit.

    Args:
        plain_text: Text to split; lines are separated by ``"\\n"``.
        max_chars: Maximum length of each returned chunk. Must be >= 1.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    text_chunks = []

    def _flush(buffer):
        # Whitespace-only buffers (e.g. runs of blank lines) carry no content,
        # so they are dropped instead of being stored as "" chunks.
        stripped = buffer.strip()
        if stripped:
            text_chunks.append(stripped)

    current_chunk = ""
    for line in plain_text.split("\n"):
        if len(current_chunk) + len(line) + 1 <= max_chars:
            current_chunk += line + " "
            continue

        _flush(current_chunk)
        # The line does not fit even in an empty chunk: cut it into pieces.
        while len(line) + 1 > max_chars:
            _flush(line[:max_chars])
            line = line[max_chars:]
        current_chunk = line + " "

    _flush(current_chunk)
    return text_chunks

In [35]:
def scrape_text_from_url(url, max_chars= 2000):
    """Fetch ``url`` and return its text split into chunks.

    Pipeline: get_html_content() downloads the page, text() reduces the
    HTML to plain text, and split_text_chunks() cuts that text using
    ``max_chars`` as the chunk size.
    """
    page_text = text(get_html_content(url))
    return split_text_chunks(page_text, max_chars)

In [36]:
# Ask for the page to index (interactive: this cell blocks until a URL is typed),
# then download it and split its text into chunks.
url = input("Enter the url: ")                    # https://en.wikipedia.org/wiki/V_(singer) 
plain_text_chunks = scrape_text_from_url(url)
# Prints the whole chunk list, which is large for a long page.
print(plain_text_chunks)

["V (singer) - Wikipedia                                     Jump to content         Main menu      Main menu move to sidebar hide    \t\tNavigation \t  Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate     \t\tContribute \t  HelpLearn to editCommunity portalRecent changesUpload file     Languages  Language links are at the top of the page across from the title.                    Search                Create accountLog in       Personal tools      Create account Log in     \t\tPages for logged out editors learn more   ContributionsTalk                           Contents move to sidebar hide     (Top)      1Early life and education        2Career   \t\t\t\tToggle Career subsection \t\t\t     2.12013–present: BTS        2.22016–present: Solo activities          3Artistry        4Impact and influence        5Personal life   \t\t\t\tToggle Personal life subsection \t\t\t     5.1Health          6Discography   \t\t\t\tToggle Discography subsection \t\t\t     6.1Si

In [37]:
# Number of chunks produced for the scraped page.
len(plain_text_chunks)

28

In [38]:
# Spot-check one chunk from the middle of the page.
plain_text_chunks[10]

'^ Moon, Soo-bin (December 16, 2017). [아.입.뽀] ⑥ 독보적 밀리언셀러, 방탄소년단 뷔 #이 비주얼, 이 실력 현실이냐? [[A.Ib.Ppo] ⑥ Unprecedented Million Seller, BTS V #Visual, is this real?]. Economy Asia (in Korean). Archived from the original on January 31, 2018. Retrieved July 9, 2020 – via Naver.  ^ a b Jang, Eun-kyung (April 30, 2015). [더스타프로필] 방탄소년단 뷔, 비범한 \'상남자\' 서열 "그림으로 말해요" [[The Star Profile] BTS V, extraordinary \'Boy In Luv\' ranking "Tell Me in Pictures"]. The Star (in Korean). Archived from the original on May 3, 2015. Retrieved July 9, 2020.  ^ a b Jang, Yoon-jung (October 30, 2018). [방탄소년단 멤버해부] 방탄소년단, ‘뷔’ #얼굴천재#감성소년#연탄이 아빠 [[BTS Dissection] BTS, \'V\' #Face Genius #Emotional Boy #Yeontan Dad]. AJU News (in Korean). Archived from the original on May 24, 2019. Retrieved August 12, 2019.  ^ "V from BTS – all about the K-pop group\'s vocalist and lead dancer". South China Morning Post. June 30, 2018. Archived from the original on June 30, 2018. Retrieved July 9, 2020.  ^ Drysdale, Jennifer (September 2

In [39]:
import os

# The API key is read from the environment instead of being written into the
# notebook, where it would be leaked to anyone who can see the file or its
# history. Set PINECONE_API_KEY before starting the kernel.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment = "us-central1-gcp")
# NOTE: despite its name, `index_name` holds the Index client object, not a
# string; addData() and find_match() call methods on it.
index_name = pinecone.Index('semantic-search-ai')
index_name

<pinecone.index.Index at 0x28645507d60>

In [40]:
# Show the index statistics (dimension, vector count) before inserting anything.
index_name.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [41]:
def addData(corpusData,url, batch_size=100):
    """Embed text chunks and upsert them into the Pinecone index.

    Each chunk is stored under a numeric string id that continues from the
    index's current ``total_vector_count``, with metadata
    ``{'title': url, 'context': chunk}``.

    Chunks are encoded and upserted in batches rather than one at a time,
    which avoids one model call and one network request per chunk.

    Args:
        corpusData: Sequence of text chunks to index.
        url: Source URL, stored as the ``title`` metadata of every chunk.
        batch_size: Number of chunks encoded and upserted per request.
    """
    if not corpusData:
        return

    # Ids continue after the vectors that are already stored.
    # NOTE(review): this assumes existing ids are exactly 0..count-1; if
    # vectors were ever deleted, new ids could overwrite old ones — confirm.
    first_id = index_name.describe_index_stats()['total_vector_count']

    for start in range(0, len(corpusData), batch_size):
        batch = list(corpusData[start:start + batch_size])
        embeddings = model.encode(batch).tolist()
        vectors = [
            (str(first_id + start + offset),
             embedding,
             {'title': url, 'context': chunk})
            for offset, (chunk, embedding) in enumerate(zip(batch, embeddings))
        ]
        index_name.upsert(vectors=vectors)

In [42]:
# Spot-check another chunk before indexing.
plain_text_chunks[20]

'"Sweet Night\': "Billboard K-Pop 100: Week of March 21, 2020". Billboard. Archived from the original on October 6, 2020. Retrieved November 4, 2022. "Christmas Tree": "Billboard K-pop 100 (Week of January 29, 2022)". Billboard. Archived from the original on March 3, 2022. Retrieved January 25, 2022.  ^ "Billboard Canadian Hot 100: January 8, 2022". Billboard. Archived from the original on January 5, 2022. Retrieved January 5, 2022.  ^ "Le Top de la semaine\xa0: Top Singles Téléchargés – SNEP (Week 21, 2018)" (in French). Syndicat National de l\'Édition Phonographique. Retrieved May 29, 2018.[permanent dead link]  ^ Compiled HUN sources: "Singularity" (in Hungarian). Slagerlistak. Archived from the original on June 1, 2020. Retrieved September 28, 2018. "Inner Child". Association of Hungarian Record Companies. March 6, 2020. Archived from the original on March 17, 2020. Retrieved March 5, 2020. "It\'s Definitely You". Association of Hungarian Record Companies. March 13, 2020. Archived 

In [44]:
# Embed every chunk of the scraped page and store it in the index, tagged with its URL.
addData(plain_text_chunks,url)

In [45]:
def find_match(query,k):
    """Return the stored chunks most similar to ``query``.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of matches to request from the index.

    Returns:
        A tuple ``(titles, contexts)`` of two parallel lists taken from the
        ``title`` and ``context`` metadata of each match. The lists hold
        fewer than ``k`` items when the index returns fewer matches (for
        example when it contains fewer than ``k`` vectors).
    """
    query_em = model.encode(query).tolist()
    result = index_name.query(query_em, top_k=k, includeMetadata=True)

    # Iterate over what was actually returned; indexing with range(k) raised
    # IndexError whenever the index had fewer than k matches.
    matches = result['matches']
    titles = [match['metadata']['title'] for match in matches]
    contexts = [match['metadata']['context'] for match in matches]
    return titles, contexts

* Using OpenAI

In [46]:
import os

import openai

# The API key is read from the environment instead of being written into the
# notebook, where it would be leaked to anyone who can see the file or its
# history. Set OPENAI_API_KEY before starting the kernel.
openai.api_key = os.environ["OPENAI_API_KEY"]
def create_prompt(context,query):
    """Build the completion prompt: instructions, then context, then the question.

    The three parts are separated by newlines and the prompt ends with a
    trailing newline.
    """
    instructions = "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' \n"
    pieces = [instructions, context, "\n", query, "\n"]
    return "".join(pieces)

def generate_answer(prompt):
    """Send ``prompt`` to the OpenAI completion endpoint and return the answer text.

    The request uses temperature 0 and stops at the ' END' sequence; the
    text of the first returned choice is stripped of surrounding whitespace.
    """
    request_options = {
        "model": "text-davinci-003",
        "prompt": prompt,
        "temperature": 0,
        "max_tokens": 256,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": [' END'],
    }
    completion = openai.Completion.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.text.strip()

In [47]:
# Retrieve the single best-matching chunk for a sample question.
query = "Who is Kim Tae-hyung?"
# docs holds the source URLs ('title' metadata), res the matching chunk texts.
docs,res = find_match(query,1)

In [48]:
# Display the retrieved chunk text(s).
res

['Tools      Tools move to sidebar hide    \t\tActions \t  ReadView sourceView history     \t\tGeneral \t  What links hereRelated changesUpload fileSpecial pagesPermanent linkPage informationCite this pageWikidata item     \t\tPrint/export \t  Download as PDFPrintable version     \t\tIn other projects \t  Wikimedia Commons                      From Wikipedia, the free encyclopedia   South Korean singer (born 1995)   In this Korean name, the family name is  Kim. VV in June 2022BornKim Tae-hyung (1995-12-30) December 30, 1995 (age\xa027)Daegu, South KoreaEducationKorean Arts High SchoolGlobal Cyber UniversityOccupationsSingersongwriterAwards Hwagwan Order of Cultural Merit (2018)Musical careerGenresR&Bneo soulindie popK-popInstrument(s)VocalsYears active2013\xa0(2013)–presentLabelsBig HitMember ofBTS Musical artistKorean nameHangul김태형Hanja金泰亨Revised RomanizationGim Tae(-)hyeongMcCune–ReischauerKim T\'aehyŏngStage nameHangul뷔Revised RomanizationBwiMcCune–ReischauerPwi Signature Kim Tae-hy

In [49]:
# Join the retrieved chunks into one context block, separated by blank lines.
context= "\n\n".join(res)
prompt = create_prompt(context,query)
# NOTE(review): the prompt is only printed; generate_answer(prompt) is not
# called in the cells shown here.
print(prompt)

Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query' 
Tools      Tools move to sidebar hide    		Actions 	  ReadView sourceView history     		General 	  What links hereRelated changesUpload fileSpecial pagesPermanent linkPage informationCite this pageWikidata item     		Print/export 	  Download as PDFPrintable version     		In other projects 	  Wikimedia Commons                      From Wikipedia, the free encyclopedia   South Korean singer (born 1995)   In this Korean name, the family name is  Kim. VV in June 2022BornKim Tae-hyung (1995-12-30) December 30, 1995 (age 27)Daegu, South KoreaEducationKorean Arts High SchoolGlobal Cyber UniversityOccupationsSingersongwriterAwards Hwagwan Order of Cultural Merit (2018)Musical careerGenresR&Bneo soulindie popK-popInstrument(s)VocalsYears active2013 (2013)–presentLabel

### Semantic Search Using Quora CSV data

* Vector Database: Pinecone

In [50]:
import pandas as pd

In [51]:
# Load the question-pair dataset; the path is relative to the working directory.
df = pd.read_csv('questions.csv')

In [52]:
# Preview the first rows and the available columns.
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [53]:
# Preview the first question pairs flagged as duplicates (is_duplicate == 1).
df[df['is_duplicate']==1].head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [59]:
import os

# NOTE: `index_name` was bound to a pinecone.Index object earlier in the
# notebook and is rebound to a plain string here. addData() and find_match()
# read the global `index_name` and will stop working after this cell runs.
index_name = 'semantic-search-ai'
# The API key is read from the environment instead of being written into the
# notebook, where it would be leaked to anyone who can see the file or its
# history. Set PINECONE_API_KEY before starting the kernel.
# NOTE(review): pinecone.init() appears to configure the client globally
# rather than return a store object — confirm whether `document_store` is
# needed at all.
document_store = pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment = "us-central1-gcp"
)

In [60]:
# Vector size of 'all-mpnet-base-v2', the model used for the question data.
# The previous code read `len(embeds[0])`, but `embeds` is never defined, so
# creating a missing index raised NameError.
EMBEDDING_DIM = 768
# NOTE(review): the first part of the notebook uses an index with this same
# name together with a 384-dim model. If that index already exists it is
# reused as-is and 768-dim vectors will not fit — confirm a separate index
# name is not needed.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=EMBEDDING_DIM)
# connect to index
index = pinecone.Index(index_name)

In [61]:
# SentenceTransformer is already imported at the top of the notebook; this
# repeat import is redundant but harmless.
from sentence_transformers import SentenceTransformer
# Rebinds the global `model`: from here on, every cell and function that reads
# `model` (including addData() and find_match()) uses this model instead.
model = SentenceTransformer('all-mpnet-base-v2')

In [62]:
# Sanity check: encode one sentence and show the embedding length.
embedding = model.encode("This is sentence")
len(embedding)

768

In [None]:
# `index_name` is a plain string in this section; the Index client object is
# `index`, so the statistics must be requested from it.
index.describe_index_stats()

In [63]:
# Embed `question1` of every row and upsert the vectors in batches.
BATCH_SIZE = 50

# Rows without a question cannot be encoded, so they are skipped.
questions = df.dropna(subset=['question1'])

# Stepping through the frame in fixed-size slices guarantees that the last,
# shorter batch is sent too. The previous loop only flushed when the buffer
# held exactly 50 items (or the whole frame), so the remainder was dropped.
for start in tqdm(range(0, len(questions), BATCH_SIZE)):
    batch = questions.iloc[start:start + BATCH_SIZE]
    texts = batch['question1'].tolist()
    # One encode call per batch instead of one per row.
    embeddings = model.encode(texts).tolist()
    question_list = [
        (
            str(row_id),
            embedding,
            {
                'is_duplicate': int(is_duplicate),
                'question1': question
            }
        )
        for row_id, embedding, is_duplicate, question
        in zip(batch['id'], embeddings, batch['is_duplicate'], texts)
    ]
    index.upsert(vectors = question_list)

PineconeProtocolError: Failed to connect; did you specify the correct index name?

In [None]:
query = "How do I read and find my YouTube comments??"
# Encode the query as one flat vector, the same way find_match() does.
# Encoding `[query]` produced a nested list (a batch of one) instead of the
# single vector passed to index.query().
xq = model.encode(query).tolist()
result = index.query(xq, top_k = 2, includeMetadata=True)
result

MaxRetryError: HTTPSConnectionPool(host='semantic-search-ai-unknown.svc.us-central1-gcp.pinecone.io', port=443): Max retries exceeded with url: /query (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002C541F44430>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))

In [None]:
query = "How to levergae internet for business"
# Encode the query as one flat vector, the same way find_match() does.
# Encoding `[query]` produced a nested list (a batch of one) instead of the
# single vector passed to index.query().
xq = model.encode(query).tolist()
result = index.query(xq, top_k=2, includeMetadata=True)
result