## set environment variables

In [37]:
import os
from dotenv import load_dotenv
load_dotenv()

# Read the service credentials and the target index name from the process
# environment (load_dotenv() above has already merged in any .env file).
# A variable that is not set comes back as None.
OPENAI_API_KEY, PINECONE_API_KEY, INDEX_NAME = map(
    os.environ.get,
    ("OPENAI_API_KEY", "PINECONE_API_KEY", "INDEX_NAME"),
)

## libraries

In [38]:
from langchain_text_splitters import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

## load the text data

In [39]:
# Folder holding the violation descriptions, one plain-text file per violation.
# (Named texts_dir rather than dir so the builtin dir() is not shadowed.)
texts_dir = "./violations_description_arabic"

texts = []

# sorted() makes the load order (and therefore the chunk order) reproducible;
# os.listdir() returns entries in arbitrary order.
for file_name in sorted(os.listdir(texts_dir)):
    file_path = os.path.join(texts_dir, file_name)
    # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail on them.
    if not os.path.isfile(file_path):
        continue
    # The descriptions contain Arabic text, so decode explicitly instead of
    # relying on the platform default encoding (files are assumed to be UTF-8).
    with open(file_path, "r", encoding="utf-8") as text_file:
        texts.append(text_file.read())

In [40]:
# Sanity check: number of description files read into `texts`.
len(texts)

74

## splitting the texts into chunks

In [41]:
# Splitter settings, gathered in one place so they are easy to tune.
splitter_settings = {
    # Split only on blank lines, matched literally (not as a regex).
    "separator": "\n\n",
    "is_separator_regex": False,
    # Sizes are measured with len(), i.e. in characters.
    "length_function": len,
    "chunk_size": 30,
    "chunk_overlap": 3,
}

text_splitter = CharacterTextSplitter(**splitter_settings)

In [42]:
# Run the splitter over every loaded text; the result is a list of document
# objects (each exposes the chunk text as `page_content`).
text_chunks = text_splitter.create_documents(texts)
# Report how many chunks were produced so it can be compared with len(texts).
print(f"number of chunks: {len(text_chunks)}")

number of chunks: 74


In [43]:
# Eyeball the first chunk to confirm the text was read and split as expected.
print(text_chunks[0])

page_content='A car was detected with this license plate number Tr overtaking in طريق الملك فهد الفرعي road at 20:45:04 on 2024-09-23 at these coordinates: 24.764306, 46.646345'


### Embeddings and Storing in Pinecone

In [47]:
# Create the Pinecone index if it does not exist yet, then open a handle to it.
pc = Pinecone(api_key=PINECONE_API_KEY)

# list_indexes() yields index description objects, not plain strings, so a
# bare `name in pc.list_indexes()` check never matches and create_index()
# would be attempted (and rejected) on every re-run. Compare against names.
existing_index_names = pc.list_indexes().names()

if 'violations-data-10' not in existing_index_names:
    pc.create_index(
        name='violations-data-10',
        # 1536 is the vector size produced by text-embedding-ada-002, the
        # embedding model used when the chunks are stored.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# NOTE(review): the index created above is hard-coded, while this handle uses
# INDEX_NAME from the environment -- confirm both refer to the same index.
index = pc.Index(INDEX_NAME)


In [49]:
# Embedding client used to turn each chunk into a vector.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    api_key=OPENAI_API_KEY,
)

# Embed every chunk and store it in the Pinecone index. The call is left as
# the cell's last expression, so the resulting vector store is displayed.
# NOTE(review): re-running this cell presumably inserts the chunks again -- confirm.
PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name='violations-data-10',
)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x7fff1e995950>

In [50]:
# Leftover exploration: prints the signature/docstring of from_documents.
# Not needed for the pipeline; safe to remove from the final notebook.
help(PineconeVectorStore.from_documents)

Help on method from_documents in module langchain_core.vectorstores.base:

from_documents(documents: 'list[Document]', embedding: 'Embeddings', **kwargs: 'Any') -> 'VST' method of abc.ABCMeta instance
    Return VectorStore initialized from documents and embeddings.
    
    Args:
        documents: List of Documents to add to the vectorstore.
        embedding: Embedding function to use.
        kwargs: Additional keyword arguments.
    
    Returns:
        VectorStore: VectorStore initialized from documents and embeddings.



In [9]:
# Build a list of placeholder history lines and write them out, one per line.
# A list is required here: entries are appended in order, whereas a dict has
# no append() and joining a dict would only emit its keys.
history = [f"text {i}" for i in range(10)]

# Explicit encoding keeps the file identical across platforms.
with open("history.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(history))

In [20]:
# Start from a single user/response pair ...
history = {'user': 'qeury', 'response': 'answer'}

# ... then assign three successive turns to the same two keys. Every turn
# overwrites the previous one, so only the last pair is left in the dict.
for user_msg, reply in (
    ('hi 1', 'yes 1'),
    ('hi 2', 'yes 2'),
    ('hi 3', 'yes 3'),
):
    history['user'] = user_msg
    history['response'] = reply

In [25]:
# Leftover exploration: `history` is a dict, so this prints the generic
# help text for dict. Safe to remove from the final notebook.
help(history)

Help on dict object:

class dict(object)
 |  dict() -> new empty dictionary
 |  dict(mapping) -> new dictionary initialized from a mapping object's
 |      (key, value) pairs
 |  dict(iterable) -> new dictionary initialized as if via:
 |      d = {}
 |      for k, v in iterable:
 |          d[k] = v
 |  dict(**kwargs) -> new dictionary initialized with the name=value pairs
 |      in the keyword argument list.  For example:  dict(one=1, two=2)
 |  
 |  Built-in subclasses:
 |      StgDict
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, key, /)
 |      True if the dictionary has the specified key, else False.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>va

In [22]:
# Apply three turns with dict.update(). Each turn carries the same two keys,
# so every call replaces the values written by the one before it.
for turn in (
    {'user': 'hi 1', 'response': 'yes 1'},
    {'user': 'hi2', 'response': 'yes 2'},
    {'user': ' hi 3', 'response': 'yes 3'},
):
    history.update(turn)

In [23]:
# Show the dict after the update() calls: they all wrote the same two keys,
# so only the last user/response pair remains.
print(history)

{'user': ' hi 3', 'response': 'yes 3'}
