In [2]:
text = """Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.[33] Python is dynamically type-checked and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming.

Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language. Python 3.0, released in 2008, was a major revision and not completely backward-compatible with earlier versions. Beginning with Python 3.5,[34] capabilities and keywords for typing were added to the language, allowing optional static typing.[35] Currently only versions in the 3.x series are supported.

Python has gained widespread use in the machine learning community.[36][37][38][39] It is widely taught as an introductory programming language.[40] Since 2003, Python has consistently ranked in the top ten of the most popular programming languages in the TIOBE Programming Community Index, which ranks based on searches in 24 platforms.[41]"""

In [3]:
import os
import requests
import json
import numpy as np

In [4]:
words = text.split()
len(words)



149

In [5]:
def chunk_text(text, chunk_size=50, overlap=10):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        end = min(start +chunk_size,len(words))
        #print(end)
        chunk = ' '.join(words[start:end])
        chunks.append(chunk)
        start += chunk_size - overlap
        #print(start)

    return chunks

In [6]:
chunks = chunk_text(text)
chunks

['Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.[33] Python is dynamically type-checked and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. Guido van Rossum began working on Python in the late 1980s as',
 'Rossum began working on Python in the late 1980s as a successor to the ABC programming language. Python 3.0, released in 2008, was a major revision and not completely backward-compatible with earlier versions. Beginning with Python 3.5,[34] capabilities and keywords for typing were added to the language, allowing optional static',
 'for typing were added to the language, allowing optional static typing.[35] Currently only versions in the 3.x series are supported. Python has gained widespread use in the machine learning community.[36][37][38][39] It is widely taught as an int

In [None]:
API_URL = ""
API_KEY = ""
MODEL_NAME = "text-embedding-3-small"

In [8]:
header = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}

In [10]:
all_embeddings= []
for i , chunk in enumerate(chunks):
    payload = {
        "model" : MODEL_NAME,
        "input" : chunk
    }
    response = requests.post(API_URL , headers=header, data=json.dumps(payload))
    data = response.json()
    embeddings = data['data'][0]['embedding']
    all_embeddings.append(embeddings)

In [13]:
all_embeddings

[[-0.009645044,
  0.010429922,
  0.054867707,
  -0.0014393819,
  0.028908793,
  -0.04955793,
  -0.046987325,
  0.030636577,
  0.0069848867,
  0.038790878,
  0.039675843,
  -0.00481462,
  -0.03830626,
  0.018647436,
  -0.010398315,
  -0.0012925465,
  -0.023704367,
  0.034703195,
  -0.0047382396,
  -0.026991373,
  0.023346167,
  0.02492646,
  -0.07256697,
  0.032680422,
  0.009334253,
  -0.02262977,
  0.010593218,
  0.06729933,
  -0.030868353,
  -0.02608534,
  -0.027159937,
  -0.024273273,
  -0.0076960176,
  0.048209414,
  -0.004545971,
  -0.07968882,
  0.037252728,
  -0.009060336,
  0.07732891,
  -0.01731999,
  0.0034898096,
  -0.038811952,
  -0.005196524,
  -0.024652543,
  -0.011947001,
  0.03917015,
  0.005012157,
  0.019827386,
  -0.010377245,
  0.010029581,
  -0.061778847,
  0.0034661053,
  0.02534787,
  0.020765025,
  -0.014696708,
  0.030151956,
  -0.02267191,
  0.016340211,
  -0.055499826,
  -0.0015144456,
  0.014054056,
  0.011420237,
  -0.036304556,
  0.03546173,
  -0.04677662,

In [14]:
len(all_embeddings)

4

In [15]:
type(all_embeddings)

list

In [17]:
embedding_array = np.array(all_embeddings, dtype= 'float32')

In [18]:
embedding_array

array([[-0.00964504,  0.01042992,  0.05486771, ..., -0.03522996,
        -0.01209449, -0.01288464],
       [-0.00463762,  0.00036974,  0.05396902, ..., -0.00385649,
        -0.00145984, -0.00640746],
       [-0.02087072,  0.00172635,  0.05309217, ..., -0.04098624,
         0.01367352, -0.00621888],
       [-0.00753574, -0.01355436,  0.04987649, ..., -0.0323577 ,
         0.0413275 ,  0.00493339]], shape=(4, 1536), dtype=float32)

In [23]:
import faiss
base_index = faiss.IndexFlatL2(1536)

In [24]:
base_index.add(embedding_array)

In [25]:
faiss.write_index(base_index, 'faiss_index.faiss')

In [29]:
query_text = "tell me about python"

In [34]:
def embedding_text(text):
    payload = {
        "model" : MODEL_NAME,
        "input" : chunk
    }
    response = requests.post(API_URL , headers=header, data=json.dumps(payload))
    data = response.json()
    embeddings = data['data'][0]['embedding']
    emb = np.array(embeddings,dtype="float32").reshape(1,-1)
    
    return emb

In [35]:
query_text_emb = embedding_text(query_text)

In [36]:
query_text_emb

array([[-0.00751584, -0.0135252 ,  0.04984717, ..., -0.03232312,
         0.04131776,  0.00490718]], shape=(1, 1536), dtype=float32)

In [37]:
base_index.search(query_text_emb,3)

(array([[9.5557402e-07, 7.0410144e-01, 9.2090791e-01]], dtype=float32),
 array([[3, 2, 0]]))

In [40]:
chunks[0]

'Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.[33] Python is dynamically type-checked and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. Guido van Rossum began working on Python in the late 1980s as'