### Semantic Search Using Quora CSV data

* Vector Database: Pinecone

In [1]:
import os

import pandas as pd
import pinecone
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm


In [2]:
# Read the Quora question-pairs CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='questions.csv')

In [3]:
# Preview the first five rows to see which columns the CSV provides.
df.head(n=5)

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# Preview the first rows that are labelled as duplicate question pairs.
df.loc[df['is_duplicate'].eq(1)].head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
11,11,23,24,How do I read and find my YouTube comments?,How can I see all my Youtube comments?,1
12,12,25,26,What can make Physics easy to learn?,How can you make physics easy to learn?,1
13,13,27,28,What was your first sexual experience like?,What was your first sexual experience?,1


In [5]:
index_name = 'semantic-search-ai'
# Credentials are read from the environment so the key never lands in the
# saved notebook. The key that was previously hardcoded in this cell is
# exposed and should be revoked in the Pinecone console.
# A missing PINECONE_API_KEY raises KeyError here instead of failing later.
# NOTE(review): `document_store` is kept only so any later reference still
# resolves; confirm whether pinecone.init returns anything useful.
document_store = pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    # Falls back to the environment this notebook originally targeted.
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-central1-gcp"),
)

In [6]:
# Width of the vectors stored in the index. 768 is the embedding length this
# notebook reports for the 'all-mpnet-base-v2' encoder. The previous
# `len(embeds[0])` referred to a name that is never defined, so the cell
# failed on a fresh kernel whenever the index did not exist yet.
EMBEDDING_DIM = 768

# Create the index only on first use; re-running the cell is a no-op otherwise.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=EMBEDDING_DIM)
# connect to index
index = pinecone.Index(index_name)

In [7]:
# Sentence encoder used for both indexing and querying.
# SentenceTransformer is imported in the notebook's import cell so that all
# dependencies are visible in one place.
model = SentenceTransformer('all-mpnet-base-v2')

In [8]:
# Smoke-test the encoder on a throwaway sentence; the cell output is the
# length of the resulting vector.
embedding = model.encode(sentences="This is sentence")
len(embedding)

768

In [9]:
# Display the index statistics (dimension, fullness, per-namespace and total
# vector counts) to see what is already stored before upserting.
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 8800}},
 'total_vector_count': 8800}

In [None]:
# Number of vectors sent to Pinecone per upsert request.
BATCH_SIZE = 50

# Walk the frame in fixed-size slices. The last slice may be shorter than
# BATCH_SIZE and is still upserted. The previous row-by-row loop only flushed
# when the buffer held exactly 50 rows (or the whole frame), so a trailing
# partial batch was silently dropped.
# NOTE(review): rows whose 'question1' is missing are passed to the encoder
# as-is; confirm the CSV has none or drop them beforehand.
for start in tqdm(range(0, len(df), BATCH_SIZE), desc="upserting"):
    batch = df.iloc[start:start + BATCH_SIZE]
    questions = batch['question1'].tolist()

    # Encode the whole slice in one call instead of one call per row.
    embeddings = model.encode(questions)

    # Each record is (id, vector, metadata), the same tuple layout as before.
    question_list = [
        (
            str(row_id),
            vector.tolist(),
            {
                'is_duplicate': int(is_duplicate),
                'question1': question,
            },
        )
        for row_id, vector, is_duplicate, question in zip(
            batch['id'], embeddings, batch['is_duplicate'], questions
        )
    ]
    index.upsert(vectors = question_list)

In [11]:
# Embed the query text with the same encoder used when indexing, then display
# the 2 closest stored questions together with their metadata.
query = "How do I read and find my YouTube comments??"
xq = model.encode([query]).tolist()  # one-element batch, converted to a nested list
result = index.query(xq, top_k = 2, includeMetadata=True)
result

{'matches': [{'id': '11',
              'metadata': {'is_duplicate': 1.0,
                           'question1': 'How do I read and find my YouTube '
                                        'comments?'},
              'score': 0.97971034,
              'values': []},
             {'id': '6468',
              'metadata': {'is_duplicate': 0.0,
                           'question1': 'I have just found out that a series '
                                        'of comments I wrote have been '
                                        'invisibly deleted. What do I do?'},
              'score': 0.452808589,
              'values': []}],
 'namespace': ''}

In [12]:
# Second lookup: a loosely worded query with no exact counterpart shown in the
# results, displaying the 2 closest stored questions with their metadata.
# NOTE(review): the query text is misspelled ("levergae"); it is left as-is
# because the displayed output was produced from this exact string.
query = "How to levergae internet for business"
xq = model.encode([query]).tolist()  # one-element batch, converted to a nested list
result = index.query(xq, top_k=2, includeMetadata=True)
result

{'matches': [{'id': '7850',
              'metadata': {'is_duplicate': 1.0,
                           'question1': 'How to start business on internet?'},
              'score': 0.663850427,
              'values': []},
             {'id': '5936',
              'metadata': {'is_duplicate': 0.0,
                           'question1': 'How can one start an internet '
                                        'broadband service provider company in '
                                        'India?'},
              'score': 0.5714643,
              'values': []}],
 'namespace': ''}