# Keyword Search - Sparse Vector

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# Sample documents: a four-sentence toy corpus that the query below is ranked against.
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [4]:
# Free-text search query; it is cleaned with the same preprocessing as the documents before vectorizing.
query = "keyword-based search"

# Documents Pre-Processing

In [5]:
import re

# Difference between the two expressions

Expression 1: r'[^\w\s]'

Expression 2: r'[^a-zA-Z0-9\s]'

At first glance, they seem very similar, and functionally, they are mostly equivalent in this context. However, there's a subtle difference in how they define "word characters."

Expression 1 (r'[^\w\s]')

Uses \w, a shorthand character class for "word characters."
For Python 3 str patterns \w is Unicode-aware by default (pass re.ASCII to restrict it to [a-zA-Z0-9_]), so it matches:
alphanumeric characters (a-z, A-Z, 0-9)
letters and digits from other scripts (e.g. é, ñ, 中)
underscore (_)
Expression 2 (r'[^a-zA-Z0-9\s]')

Explicitly lists the allowed characters: a-z, A-Z, 0-9, and whitespace (\s).
Does not include the underscore (_) or non-ASCII letters and digits, all of which \w matches.
Key Difference:

The main difference is that Expression 1 keeps underscores and non-ASCII letters and digits (because \w matches them), while Expression 2 removes them. So if your text contains underscores and you want them stripped along with other punctuation, use Expression 2; if your text contains accented or non-Latin characters that must survive, use Expression 1.

In most cases, the difference is negligible, and both expressions will effectively remove punctuation and special characters, leaving only letters, numbers, and spaces. However, if you need to be very specific about which characters are considered part of a "word," it's better to use the more explicit Expression 2.

I hope this clarifies the distinction between the two expressions. Let me know if you have any other questions.

In [7]:
def preprocess_text(text):
    """Normalize a string for keyword matching.

    The input is lowercased, then every character that is neither a word
    character (letter, digit, underscore) nor whitespace is deleted.
    Deleted characters are removed outright rather than replaced by a
    space, so a hyphenated term such as "keyword-based" collapses into the
    single token "keywordbased".

    Args:
        text: The string to clean.

    Returns:
        The lowercased string with punctuation and symbols removed.
    """
    lowered = text.lower()
    # ASCII-only alternative pattern: r'[^a-zA-Z0-9\s]' (would also strip "_").
    return re.sub(r'[^\w\s]', '', lowered)

# Clean the docs and query

In [10]:
# Run every sample document through preprocess_text, keeping the original order.
preprocess_documents = list(map(preprocess_text, documents))

In [11]:
preprocess_documents

['this is a list which containig sample documents',
 'keywords are important for keywordbased search',
 'document analysis involves extracting keywords',
 'keywordbased search relies on sparse embeddings']

In [12]:
# Clean the query with the same function used for the documents so its tokens are
# normalized the same way (e.g. "keyword-based" becomes "keywordbased").
preprocess_query = preprocess_text(query)

In [13]:
preprocess_query

'keywordbased search'

# Apply TF-IDF to both

In [14]:
# TF-IDF vectorizer with default settings. Note: despite its name, `sparse_vector`
# holds the vectorizer object itself, not a vector.
sparse_vector = TfidfVectorizer()

In [15]:
# Fit the vectorizer on the cleaned documents and return their TF-IDF matrix (one row per document).
X = sparse_vector.fit_transform(preprocess_documents)

In [16]:
# Vectorize the query with the already-fitted vectorizer; it is wrapped in a list because
# transform() expects an iterable of strings. The result has a single row.
y = sparse_vector.transform([preprocess_query])

In [18]:
X.toarray()

array([[0.        , 0.        , 0.37796447, 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.37796447],
       [0.        , 0.4533864 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
        0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
        0.        , 0.        , 0.35745504, 0.        , 0.        ,
        0.        ],
       [0.46516193, 0.        , 0.        , 0.46516193, 0.        ,
        0.        , 0.46516193, 0.        , 0.        , 0.46516193,
        0.        , 0.        , 0.36673901, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.43671931, 0.        , 0.        , 0.       

In [19]:
X.toarray()[0]

array([0.        , 0.        , 0.37796447, 0.        , 0.37796447,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.37796447, 0.        , 0.        , 0.37796447, 0.        ,
       0.        , 0.37796447, 0.        , 0.        , 0.37796447,
       0.37796447])

In [22]:
X.toarray()[1]

array([0.        , 0.4533864 , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.4533864 , 0.4533864 , 0.        ,
       0.        , 0.35745504, 0.35745504, 0.        , 0.        ,
       0.        , 0.        , 0.35745504, 0.        , 0.        ,
       0.        ])

In [21]:
y.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        ]])

# Calculate cosine similarity between each document vector and the query vector

In [23]:
# Cosine similarity of every document row in X against the single query row in y;
# the result is a column with one score per document.
similarity = cosine_similarity(X, y)

In [24]:
similarity

array([[0.        ],
       [0.50551777],
       [0.        ],
       [0.48693426]])

# Perform Ranking

In [29]:
# Row indices that would sort the scores in ascending order (least similar first).
np.argsort(similarity, axis=0) # sorts along first axis (down)

array([[0],
       [2],
       [3],
       [1]])

In [30]:
# Rank documents from most to least similar: sort ascending down the column,
# flip the order, then collapse the single-column result into a 1-D index array.
ranked_indices = np.flip(np.argsort(similarity, axis=0), axis=0).flatten()

In [31]:
ranked_indices

array([1, 3, 2, 0])

In [32]:
# Look up the original (un-preprocessed) document text for each ranked position.
ranked_documents = [documents[position] for position in ranked_indices]

In [33]:
ranked_documents

['Keywords are important for keyword-based search.',
 'Keyword-based search relies on sparse embeddings.',
 'Document analysis involves extracting keywords.',
 'This is a list which containig sample documents.']

In [37]:
query, ranked_documents

('keyword-based search',
 ['Keywords are important for keyword-based search.',
  'Keyword-based search relies on sparse embeddings.',
  'Document analysis involves extracting keywords.',
  'This is a list which containig sample documents.'])

# Vector Embeddings - Dense Vectors

Consider the example document embeddings and query embedding below.

In [38]:
# Hand-written 5-dimensional example vectors, one row per (hypothetical) document.
document_embeddings = np.array([
    [0.634, 0.234, 0.867, 0.042, 0.249],
    [0.123, 0.456, 0.789, 0.321, 0.654],
    [0.987, 0.654, 0.321, 0.123, 0.456]
])

In [39]:
# Example query vector, kept 2-D (one row, five columns) to match the layout of the document matrix.
query_embedding = np.array([[0.789, 0.321, 0.654, 0.987, 0.123]])

# Calculate cosine similarity between each document vector and the query vector

In [40]:
# Rebind X and y to the dense example embeddings. This overwrites the TF-IDF matrices
# that an earlier cell stored under the same names.
X = document_embeddings
y = query_embedding

In [41]:
# Same cosine-similarity call as in the sparse section, now on the dense vectors:
# one score per document row.
similarity = cosine_similarity(X, y)

In [42]:
similarity

array([[0.73558979],
       [0.67357898],
       [0.71517305]])

# Perform Ranking

In [43]:
# Order the dense-embedding scores from most to least similar: ascending argsort
# down the column, flipped, then collapsed into a 1-D index array.
ranked_indices = np.flip(np.argsort(similarity, axis=0), axis=0).flatten()

In [44]:
ranked_indices

array([0, 2, 1])

In [45]:
# Map ranked positions to document text.
# NOTE(review): the three example embeddings are hand-written rather than computed from
# `documents` (which has four entries), so this pairing is illustrative only and the
# fourth document can never appear in the result.
ranked_documents = [documents[position] for position in ranked_indices]

In [46]:
ranked_documents

['This is a list which containig sample documents.',
 'Document analysis involves extracting keywords.',
 'Keywords are important for keyword-based search.']

# Using a Hugging Face model

In [48]:
! pip install langchain-huggingface sentence_transformers tiktoken

Collecting langchain-huggingface
  Using cached langchain_huggingface-0.1.0-py3-none-any.whl.metadata (1.3 kB)
Collecting sentence_transformers
  Using cached sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-core<0.4,>=0.3.0 (from langchain-huggingface)
  Downloading langchain_core-0.3.6-py3-none-any.whl.metadata (6.3 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4,>=0.3.0->langchain-huggingface)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<0.2.0,>=0.1.125 (from langchain-core<0.4,>=0.3.0->langchain-huggingface)
  Downloading langsmith-0.1.129-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-core<0.4,>=0.3.0->langchain-huggingface)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpointer>=1.9 (from js

In [49]:
from langchain_huggingface import HuggingFaceEmbeddings

# Hugging Face Hub id of the sentence-transformers model used to embed text below.
model_name = "sentence-transformers/all-mpnet-base-v2"

In [52]:
import os
from google.colab import userdata
# Copy the Colab secret named 'HUGGINGFACEHUB_API_TOKEN' into the HF_TOKEN environment
# variable. The google.colab import above means this cell only runs inside Google Colab.
os.environ['HF_TOKEN'] = userdata.get('HUGGINGFACEHUB_API_TOKEN')

In [54]:
# Load the embedding model named by `model_name`; on a fresh runtime this downloads the
# model files (see the progress bars in the output).
embeddings = HuggingFaceEmbeddings(model_name=model_name)


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [55]:
# Same four sample sentences as defined at the top of the notebook, re-declared for this section.
documents = [
    "This is a list which containig sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings."
]

In [56]:
# Same query string as in the keyword-search section.
query = "keyword-based search"

In [57]:
# Embed the raw (un-preprocessed) documents; returns one list of floats per document.
# Rebinds X, replacing the example matrix from the previous section.
X = embeddings.embed_documents(documents)

In [58]:
# Embed the raw query; returns a single flat list of floats (not a one-row 2-D structure).
y = embeddings.embed_query(query)

In [60]:
X

[[0.01575315371155739,
  -0.03363017365336418,
  0.007080064620822668,
  0.016350440680980682,
  -0.017385924234986305,
  0.01144037302583456,
  0.055652860552072525,
  0.04796215146780014,
  0.003187198657542467,
  -0.026203736662864685,
  0.024824701249599457,
  0.024402664974331856,
  0.03565405309200287,
  -0.0042030420154333115,
  0.003182720858603716,
  -0.038477879017591476,
  0.048956695944070816,
  0.02268010750412941,
  -0.015957308933138847,
  -0.019225982949137688,
  -0.032097190618515015,
  0.0422229990363121,
  -0.03296012058854103,
  0.020663829520344734,
  0.049400221556425095,
  -0.000551833538338542,
  -0.027008483186364174,
  -0.026209093630313873,
  -0.02719646692276001,
  -0.07043047994375229,
  -0.004723244812339544,
  0.011530621908605099,
  -0.00484183244407177,
  -0.06819989532232285,
  1.4977732689658296e-06,
  -0.030878843739628792,
  -0.024762295186519623,
  0.0035085577983409166,
  -0.02073226310312748,
  0.020177170634269714,
  0.02575898915529251,
  0.032

In [61]:
y

[0.051760170608758926,
 0.001765791210345924,
 -0.03220276162028313,
 -0.019635701552033424,
 -0.06191030144691467,
 -0.044859983026981354,
 -0.008550071157515049,
 0.07152492552995682,
 0.0010812608525156975,
 -0.030546434223651886,
 0.014690467156469822,
 -0.005119119305163622,
 -0.020266039296984673,
 0.05538710206747055,
 -0.036796845495700836,
 -0.038602039217948914,
 0.0234171524643898,
 0.02621816098690033,
 0.0004886813694611192,
 0.015232095494866371,
 0.012602495960891247,
 0.01865200884640217,
 0.018239444121718407,
 -0.004772841464728117,
 0.04190709814429283,
 0.03202025964856148,
 -0.06375866383314133,
 0.005033495370298624,
 -0.003865255042910576,
 0.0016242575366050005,
 -0.035264596343040466,
 -0.002522079274058342,
 -0.0012175581650808454,
 0.015601771883666515,
 1.3049825611233246e-06,
 -0.039032891392707825,
 -0.056905318051576614,
 0.019361166283488274,
 -0.0012410000199452043,
 0.04560914263129234,
 0.03702187538146973,
 0.04778227210044861,
 0.03676939010620117,
