<a href="https://colab.research.google.com/github/Rodkymo/VideoService-1/blob/Faiss/faiss_similarity_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install faiss-gpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4
Collecting sentence-transformers
  Downloading sentence_transformers-2.4.0-py3-none-any.whl (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.5/149.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.4.0


In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

In [None]:
# Toy corpus: each entry pairs a sentence with its hand-assigned category.
_LABELLED_SENTENCES = (
    ('Where are your headquarters located?', 'location'),
    ('Throw my cellphone in the water', 'random'),
    ('Network Access Control?', 'networking'),
    ('Address', 'location'),
    ('What is the capital of France?', 'location'),
    ('How do I reset my password?', 'random'),
    ('What are the different types of network topologies?', 'networking'),
    ('How do I find the nearest post office?', 'location'),
    ('Is it going to rain tomorrow?', 'random'),
    ('How does a router work?', 'networking'),
)

# Rows are materialised as two-item lists: [text, category].
data = [[sentence, label] for sentence, label in _LABELLED_SENTENCES]

data

[['Where are your headquarters located?', 'location'],
 ['Throw my cellphone in the water', 'random'],
 ['Network Access Control?', 'networking'],
 ['Address', 'location'],
 ['What is the capital of France?', 'location'],
 ['How do I reset my password?', 'random'],
 ['What are the different types of network topologies?', 'networking'],
 ['How do I find the nearest post office?', 'location'],
 ['Is it going to rain tomorrow?', 'random'],
 ['How does a router work?', 'networking']]

In [None]:
# Tabulate the labelled sentences: one row per sample, with the sentence in
# `text` and its label in `category`.
df = pd.DataFrame(columns=['text', 'category'], data=data)

df

Unnamed: 0,text,category
0,Where are your headquarters located?,location
1,Throw my cellphone in the water,random
2,Network Access Control?,networking
3,Address,location
4,What is the capital of France?,location
5,How do I reset my password?,random
6,What are the different types of network topolo...,networking
7,How do I find the nearest post office?,location
8,Is it going to rain tomorrow?,random
9,How does a router work?,networking


In [None]:
# Embed every sentence with a pretrained SentenceTransformer model.
text = df['text']
encoder = SentenceTransformer("paraphrase-mpnet-base-v2")
# Pass a plain list rather than the Series: integer lookups on a Series are
# label-based, so encoding must not depend on df's index being 0..n-1.
vectors = encoder.encode(text.tolist())
vectors

array([[-0.00437621, -0.06387778, -0.11621017, ...,  0.09268748,
        -0.04782989, -0.05484067],
       [-0.01245356, -0.26505408, -0.04051869, ...,  0.14551333,
         0.13894194,  0.10772914],
       [-0.01477675,  0.05492527,  0.03377842, ...,  0.08851662,
        -0.1465777 , -0.02152413],
       ...,
       [-0.10135231,  0.04360763,  0.01513361, ...,  0.04636067,
        -0.16322236,  0.09010948],
       [-0.11150391, -0.1576652 , -0.07323896, ...,  0.03968367,
        -0.29139358,  0.07139608],
       [ 0.04221755, -0.11480591, -0.04553395, ...,  0.04525404,
        -0.04773113, -0.1421876 ]], dtype=float32)

In [None]:
# Build a flat (exhaustive) L2 index over the sentence embeddings.
# normalize_L2 rescales the rows of `vectors` in place to unit length; on
# unit vectors, ranking by L2 distance matches ranking by cosine similarity.
faiss.normalize_L2(vectors)

vector_dimension = vectors.shape[1]  # embedding width the index must be created with
index = faiss.IndexFlatL2(vector_dimension)
index.add(vectors)
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7e0c721437b0> >

In [None]:

# Embed the query and shape it the way the index expects it.
search_text = 'population power?'
query_embedding = encoder.encode(search_text)

# Wrap the single embedding in a one-row 2-D array (one row per query), then
# normalise it in place, the same treatment applied to the indexed vectors.
search_vector = np.array([query_embedding])
faiss.normalize_L2(search_vector)
search_vector


array([[-2.38286108e-02,  3.92745249e-02,  5.03219152e-03,
        -5.76866679e-02, -2.94243228e-02,  3.64530981e-02,
        -2.21687797e-02,  1.52550160e-03, -3.85873485e-03,
         4.51068059e-02,  3.12195141e-02, -2.77743954e-02,
        -1.66370645e-02, -2.24589147e-02,  5.27514750e-03,
        -4.37177345e-02,  8.56414251e-03,  1.20029459e-02,
        -4.07560989e-02,  2.98229046e-03, -9.52331871e-02,
        -6.10657968e-02,  2.73270626e-02,  1.79954153e-02,
         6.43825606e-02,  3.37163769e-02, -2.49297563e-02,
         1.45404190e-02, -1.65124554e-02,  1.29870803e-03,
         3.92481461e-02,  9.58162080e-03,  7.95231573e-03,
        -6.74953684e-03,  2.71345899e-02, -8.71761516e-03,
        -3.14925686e-02,  3.65946032e-02, -3.42525356e-02,
         1.27314432e-02,  3.30000035e-02,  3.77474315e-02,
        -3.72128524e-02,  3.36678326e-02, -3.38462405e-02,
        -4.73176478e-04,  3.03016752e-02, -5.08212075e-02,
         1.48880165e-02, -6.01863042e-02,  2.18051748e-0

In [None]:
# Rank the whole corpus against the query: requesting as many neighbours as
# the index holds returns every stored vector.
# `distances` and `ann` each hold one row per query; `ann` carries the ids of
# the matched vectors.
k = index.ntotal
distances, ann = index.search(search_vector, k)
k

10

In [None]:

# Tabulate the hits for the single query (row 0 of each result array):
# the distance of each match alongside the id FAISS returned for it.
results = pd.DataFrame(
    {
        'distances': distances[0],
        'ann': ann[0],
    }
)
results

Unnamed: 0,distances,ann
0,1.704991,0
1,1.731435,2
2,1.741611,4
3,1.762439,3
4,1.789176,6
5,1.853529,1
6,1.860535,8
7,1.909992,9
8,1.951715,7
9,2.066961,5


In [None]:
# Attach the original sentence and category to every hit by joining the
# returned ids (`ann`) against df's index.
merge = results.merge(df, left_on='ann', right_index=True)
merge

Unnamed: 0,distances,ann,text,category
0,1.704991,0,Where are your headquarters located?,location
1,1.731435,2,Network Access Control?,networking
2,1.741611,4,What is the capital of France?,location
3,1.762439,3,Address,location
4,1.789176,6,What are the different types of network topolo...,networking
5,1.853529,1,Throw my cellphone in the water,random
6,1.860535,8,Is it going to rain tomorrow?,random
7,1.909992,9,How does a router work?,networking
8,1.951715,7,How do I find the nearest post office?,location
9,2.066961,5,How do I reset my password?,random


In [None]:
# Predict the query's category as the label of its nearest neighbour.
labels = df['category']
# ann[0][0] is the id of the best match for the (only) query. FAISS ids are
# insertion positions, so look the label up positionally with .iloc rather
# than by index label.
category = labels.iloc[ann[0][0]]
category

'location'