In [None]:
# !pip install hopsworks
# !pip install langchain
# !pip install tiktoken
# !pip install openai
# !pip install hsfs

In [55]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.document_loaders import TextLoader, DirectoryLoader
import os
import hopsworks
import pandas as pd

In [36]:
# Make the OpenAI API key available through the OPENAI_API_KEY environment variable.
# A key that is already exported in the environment wins; the placeholder below is
# only used when the variable is unset or empty, so re-running this cell never
# overwrites a real key with the placeholder.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "your_api_key_here"
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

### Loading the documents using DirectoryLoader

In [22]:
def load_documents(directory='Elements'):
    """Load every file found under ``directory`` as a document.

    Args:
        directory: Path handed to ``DirectoryLoader``. Defaults to
            ``'Elements'``, the folder this notebook was written against.

    Returns:
        Whatever ``DirectoryLoader(...).load()`` returns (a sized collection
        of documents; its length is printed as a quick sanity check).
    """
    # TextLoader is used for each file found by the directory loader.
    loader = DirectoryLoader(directory, loader_cls=TextLoader)
    docs = loader.load()
    print("Number of documents is ",len(docs))
    return docs

Number of documents is  3


### Creating chunks on the loaded documents

In [17]:
def create_chunks(content,chunk_size=250,chunk_overlap=10):
    """Split the loaded documents into smaller chunks.

    Args:
        content: The documents to split (passed to ``split_documents``).
        chunk_size: Maximum size of each chunk. The splitter is built with
            ``from_tiktoken_encoder``, so sizes are presumably counted in
            tiktoken tokens rather than characters — confirm against the
            LangChain docs.
        chunk_overlap: Amount of overlap between consecutive chunks, in the
            same unit as ``chunk_size``.

    Returns:
        The chunked documents produced by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(content)

### Create embeddings for the chunked data

In [52]:
def get_embedding(chunk_data, embedding_object):
    """Embed a single piece of text.

    Args:
        chunk_data: The text to embed.
        embedding_object: Any object exposing ``embed_query(text)``.

    Returns:
        The value returned by ``embedding_object.embed_query(chunk_data)``.
    """
    return embedding_object.embed_query(chunk_data)

### Driver 

In [None]:
# End-to-end pipeline: load the documents, chunk them, then embed every chunk.
docs = load_documents()

# Larger chunks than the create_chunks defaults are used for this run.
split_data = create_chunks(docs, chunk_size=750, chunk_overlap=50)

embedding_object = OpenAIEmbeddings(model="text-embedding-ada-002")

# get_embedding is invoked once per chunk, on the chunk's text only.
embedded_data = [
    get_embedding(piece.page_content, embedding_object)
    for piece in split_data
]

In [69]:
# Sanity check: show the length of the first embedding vector.
# The recorded output is 1536 for the "text-embedding-ada-002" model configured
# above; presumably other embedding models produce different lengths, so verify
# before relying on this number elsewhere.
len(embedded_data[0])

1536

### Logging to Hopsworks

In [79]:
# Log in to Hopsworks and obtain a handle to the project's feature store.
# NOTE(review): login() is called without arguments, so credentials presumably
# come from the environment or an interactive prompt — confirm for unattended runs.
hopsworks_project = hopsworks.login()
fs = hopsworks_project.get_feature_store()

Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/105630
Connected. Call `.close()` to terminate connection gracefully.


In [73]:
# Build the dataframe that will be uploaded to the feature group:
# one row per chunk, holding the chunk text and its embedding.
data = {
    "text": [chunk.page_content for chunk in split_data],
    "embeddings": embedded_data,
}
df = pd.DataFrame(data)

# Sequential integer key 1..N; used as the feature group's primary key.
df["p_key"] = list(range(1, len(df) + 1))

In [81]:
# Fetch the feature group if it already exists, otherwise create it.
# Name, version, description and primary-key column are supplied here;
# online_enabled=True is passed as well.
openai_embedding_fg = fs.get_or_create_feature_group(
    name="openai_embedding",
    version=1,
    description="Embedding data for elements",
    primary_key=["p_key"],
    online_enabled=True,
)

In [None]:
# Insert the dataframe into the feature group created above.
# The call's output includes links for following the insertion/backfill job,
# and the cell displays the value returned by insert().
openai_embedding_fg.insert(df)

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/105630/fs/105549/fg/109033


Uploading Dataframe: 0.00% |          | Rows 0/37 | Elapsed Time: 00:00 | Remaining Time: ?

Launching job: openai_embedding_1_offline_fg_backfill
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai/p/105630/jobs/named/openai_embedding_1_offline_fg_backfill/executions


(<hsfs.core.job.Job at 0x78dd0d6b8160>, None)

In [None]:
# Attach a human-readable description to every feature of the feature group.
# This is metadata only; it documents what each column holds.

feature_descriptions = [
    {"name": "text", "description": "The chunks for which the embeddings are created"},
    {"name": "embeddings", "description": "The embeddings for the chunks"},
    {"name": "p_key", "description": "This feature is used as a primary key"},
]

for feature in feature_descriptions:
    feature_name = feature["name"]
    feature_text = feature["description"]
    openai_embedding_fg.update_feature_description(feature_name, feature_text)

### Reading the stored embeddings

In [None]:
import hsfs

In [None]:
# Connect to the Hopsworks feature store through hsfs and get a handle to the
# feature group created earlier (no data is read in this cell).
# NOTE(review): `fs` is rebound here, replacing the handle obtained from
# hopsworks_project.get_feature_store() earlier in the notebook.
# NOTE(review): the feature store name 'practice_featurestore' is hard-coded —
# presumably it must match the logged-in project's store; confirm.
connection = hsfs.connection()
fs = connection.get_feature_store(name='practice_featurestore')
fg = fs.get_feature_group('openai_embedding', version=1)

In [None]:
# Read the feature group's contents back.
# NOTE(review): this rebinds `df`, which earlier held the locally built frame
# that was inserted; the result is presumably a pandas DataFrame — confirm.
df = fg.read()