# NER based Semantic Search

## Environment setup

In [1]:
#load env variables
import os
from dotenv import load_dotenv

load_dotenv()


True

## PineCone DB Setup

In [None]:
#PineCone client Setup
from pinecone import Pinecone

PineCone_API_KEY = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=PineCone_API_KEY)

## Creating Index in PineCone
#pc.create_index(name="medium-data", dimension=768, spec= ServerlessSpec(cloud="aws", region="us-east-1"))

In [3]:
#Set pointer to createindex
idx = pc.Index("medium-data")

  from .autonotebook import tqdm as notebook_tqdm


## NER Engine

In [4]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch

In [5]:
#NER ENgine
model_id = 'dslim/bert-base-NER'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)

#NLP Pipeline
nlp = pipeline('ner',
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy= 'max',
                device = 'cpu')


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


## Retriever

In [6]:
from sentence_transformers import SentenceTransformer

In [7]:
retriever = SentenceTransformer("flax-sentence-embeddings/all_datasets_v3_mpnet-base")

In [8]:
retriever

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False, 'architecture': 'MPNetModel'})
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

## Obtain Raw Data

In [9]:
from datasets import load_dataset
import pandas as pd

In [10]:
df = pd.read_csv("./medium_articles_10k.csv")
df.shape

(10000, 7)

In [11]:
df = df.drop(["Unnamed: 0"], axis = 1)
df.dropna(inplace=True)
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci..."
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P..."
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We..."
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P..."
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology..."


In [12]:
df['text_extended'] = df['title'] + '.' + df['text'].str[:1000]

In [13]:
df.head()

Unnamed: 0,title,text,url,authors,timestamp,tags,text_extended
0,Mental Note Vol. 24,Photo by Josh Riemer on Unsplash\n\nMerry Chri...,https://medium.com/invisible-illness/mental-no...,['Ryan Fan'],2020-12-26 03:38:10.479000+00:00,"['Mental Health', 'Health', 'Psychology', 'Sci...",Mental Note Vol. 24.Photo by Josh Riemer on Un...
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['Mental Health', 'Coronavirus', 'Science', 'P...",Your Brain On Coronavirus.Your Brain On Corona...
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['Biotechnology', 'Neuroscience', 'Brain', 'We...",Mind Your Nose.Mind Your Nose\n\nHow smell tra...
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['Health', 'Neuroscience', 'Mental Health', 'P...",The 4 Purposes of Dreams.Passionate about the ...
4,Surviving a Rod Through the Head,"You’ve heard of him, haven’t you? Phineas Gage...",https://medium.com/live-your-life-on-purpose/s...,['Rishav Sinha'],2020-02-26 00:01:01.576000+00:00,"['Brain', 'Health', 'Development', 'Psychology...",Surviving a Rod Through the Head.You’ve heard ...


## Extract NER

In [14]:
df_batch = df['text_extended'].iloc[0:10]

In [15]:
df_batch

0    Mental Note Vol. 24.Photo by Josh Riemer on Un...
1    Your Brain On Coronavirus.Your Brain On Corona...
2    Mind Your Nose.Mind Your Nose\n\nHow smell tra...
3    The 4 Purposes of Dreams.Passionate about the ...
4    Surviving a Rod Through the Head.You’ve heard ...
5    Mentally, Young Adults Are Suffering Most From...
6    How to Turn Your Popular Blog Series Into a Be...
7    Dr Faisal Dar — Pioneer of Liver Transplantati...
8    Sunlight — The Natural Supplement For Our Ment...
9    Occam’s dice.Occam’s dice\n\nDistrusting biolo...
Name: text_extended, dtype: object

In [16]:
def extract_entities(list_of_text):
    entities = []
    for doc in list_of_text:
        entities.append([item['word'] for item in nlp(doc)])

    return entities

## Batch Vector embeddings

In [17]:
emb = retriever.encode(df_batch).tolist()

## Upsert Data into DB

In [22]:
from tqdm.auto import tqdm

batch_size = 64
for i in range(0, 2000, batch_size):
    i_end = min(i+batch_size, len(df))

    #batch of data
    df_batch = df.iloc[i: i_end]

    #Embeddings
    emb = retriever.encode(df_batch['text_extended'].tolist()).tolist()

    #NER Extraction
    entities = extract_entities(df_batch['text_extended'].tolist())
    
    #Remove duplicates for each entities list
    df_batch['ner'] = [list(set(entity)) for entity in entities]

    #Create metadata
    df_batch = df_batch.drop(['text'], axis = 1)
    meta_data = df_batch.to_dict(orient='records')

    #Create_indices
    ids = [str(id) for id in range(i, i_end)]

    #Upsert
    vectors_to_upsert = list(zip(ids, emb, meta_data))
    idx.upsert(vectors=vectors_to_upsert)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batch['ner'] = [list(set(entity)) for entity in entities]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batch['ner'] = [list(set(entity)) for entity in entities]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_batch['ner'] = [list(set(entity)) for entity in entities]
A value is trying to b

In [23]:
idx.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 2048}},
 'total_vector_count': 2048,
 'vector_type': 'dense'}

## Semantic Search - Querying

In [28]:
query = "We live on planet Earth"
#Embedded Query Vector
emb_qx = retriever.encode(query).tolist()

ne = extract_entities([query])[0]
ne

['Earth']

In [31]:
xc = idx.query(vector=emb_qx, top_k=2, include_metadata=True, filter={"ner": {"$in": ne}})


In [32]:
for result in xc['matches']:
    print(result['score'], " ", result['metadata']['ner'] )

0.322942764   ['Earth', 'Pekanbaru', 'Indonesia', 'Made', 'Weizmann Institute of Science', 'Israel', 'Nature', 'Human', 'Siak River', 'Barcroft Media / Getty Images']
0.30353868   ['Space', 'Earth', 'Great Barrier Reef', 'Kessler Syndrome']
