## Creating an index and populating it with documents using Redis

A simple example of how to ingest PDF documents, and then web page content, into a Redis vector store.

Requirements:
- A Redis cluster
- A Redis database with at least 2GB of memory (to match the initial index cap)

In [None]:
# Install the notebook's dependencies. langchain, boto3 and botocore are
# imported directly by the cells below; sentence-transformers, redis and pypdf
# are presumably needed by the LangChain integrations used later
# (HuggingFaceEmbeddings, Redis, PyPDFDirectoryLoader) - confirm before pruning.
!pip install langchain boto3 botocore sentence-transformers redis pypdf

In [None]:
# The code below assumes that an S3 Data Connection to the bucket has been set up
# and that the standard environment variables are injected into this notebook.

import boto3
import os
import botocore

# Bucket name and key prefix, as injected into the environment by the
# S3 Data Connection.
AWS_S3_BUCKET = os.getenv('AWS_S3_BUCKET')
FOLDER_PREFIX = os.getenv('FOLDER_PREFIX')

# Stop right away when either variable is absent from the environment.
if any(setting is None for setting in (AWS_S3_BUCKET, FOLDER_PREFIX)):
    raise ValueError("One or more environment variables are not set")

# Echo both values back to back (no separator) as a quick sanity check.
print(f"{AWS_S3_BUCKET}{FOLDER_PREFIX}")

def download_folder_from_s3(bucket_name, folder_prefix, local_path):
    """Download every object stored under an S3 key prefix into a local folder.

    Objects are always downloaded; an existing local file with the same name
    is overwritten. Keys ending in ``/`` (directory placeholders) are skipped.
    A download that fails with ``botocore.exceptions.ClientError`` is reported
    with ``print`` and the remaining objects are still processed.

    Args:
        bucket_name: Name of the S3 bucket to read from.
        folder_prefix: Key prefix selecting the objects to download. It is
            removed from each key to build the path relative to ``local_path``.
        local_path: Local directory that receives the downloaded files.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_prefix):
        # 'Contents' may be missing from a page; treat that as "no objects".
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/'):  # Skip directories
                continue

            # When the prefix has no trailing slash, the remainder starts with
            # '/'. os.path.join() would then treat it as an absolute path and
            # silently drop local_path, so the leading slash must be removed.
            relative_key = key[len(folder_prefix):].lstrip('/')
            if not relative_key:
                # The prefix names this single object exactly; keep its file
                # name instead of producing a bare directory path.
                relative_key = os.path.basename(key)

            dest_file_path = os.path.join(local_path, relative_key)
            dest_dir = os.path.dirname(dest_file_path)
            # os.makedirs('') raises, so only create a directory when the
            # destination actually has one.
            if dest_dir:
                os.makedirs(dest_dir, exist_ok=True)

            try:
                s3.download_file(bucket_name, key, dest_file_path)
            except botocore.exceptions.ClientError as e:
                print(f"Error downloading {key}: {e}")
            else:
                print(f"Downloaded: {key} to {dest_file_path}")

# The local destination directory reuses the S3 key prefix as its path.
LOCAL_DIRECTORY = FOLDER_PREFIX

# Fetch everything stored under the prefix into that directory.
download_folder_from_s3(
    bucket_name=AWS_S3_BUCKET,
    folder_prefix=FOLDER_PREFIX,
    local_path=LOCAL_DIRECTORY,
)

### Base parameters, the Redis info

In [None]:
# Build the Redis connection URL from environment variables.
from urllib.parse import quote

db_pass = os.getenv("DB_PASS")
port = os.getenv("PORT")
service_name = os.getenv("SERVICE_NAME")
redis_namespace = os.getenv("REDIS_NAMESPACE")

# A single truthiness check covers both unset (None) and empty ("") values,
# and the error names the variables that need fixing.
required_settings = {
    "DB_PASS": db_pass,
    "PORT": port,
    "SERVICE_NAME": service_name,
    "REDIS_NAMESPACE": redis_namespace,
}
missing_settings = [name for name, value in required_settings.items() if not value]
if missing_settings:
    raise ValueError(
        "One or more environment variables are not set or empty: "
        + ", ".join(missing_settings)
    )

# Percent-encode the password so characters such as '@', ':', '/' or '%'
# cannot corrupt the URL structure.
encoded_pass = quote(db_pass, safe="")
redis_host = f"{service_name}.{redis_namespace}.svc.cluster.local"
redis_url = f"redis://default:{encoded_pass}@{redis_host}:{port}"

# Show where we are connecting without writing the password into the
# notebook output, which is saved and shared along with the notebook.
print(f"redis://default:***@{redis_host}:{port}")
index_name = "docs"

#### Imports

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.redis import Redis

## Initial index creation and document ingestion

#### Document loading from a folder containing PDFs

In [None]:
# Read the PDFs from the directory the S3 download cell wrote into. The
# previous hard-coded "rhoai-docs" only matched when FOLDER_PREFIX happened
# to be that exact folder name.
pdf_folder_path = LOCAL_DIRECTORY

loader = PyPDFDirectoryLoader(pdf_folder_path)
docs = loader.load()

# Surface an empty result here, where the cause (wrong or empty folder) is
# obvious, rather than during index creation.
if not docs:
    print(f"Warning: no PDF content was loaded from '{pdf_folder_path}'")

#### Split documents into chunks with some overlap

In [None]:
# Chunking settings for the PDF documents: chunk size 1024, overlap 40.
pdf_splitter_settings = {"chunk_size": 1024, "chunk_overlap": 40}

text_splitter = RecursiveCharacterTextSplitter(**pdf_splitter_settings)

# Break the loaded PDF documents into overlapping chunks.
all_splits = text_splitter.split_documents(docs)

#### Create the index and ingest the documents

In [None]:
# Embedding model with the library's default settings.
embeddings = HuggingFaceEmbeddings()

# Create the index named `index_name` on the Redis instance at `redis_url`
# and ingest the PDF chunks into it.
rds = Redis.from_documents(
    all_splits,
    embeddings,
    redis_url=redis_url,
    index_name=index_name,
)

#### Write the schema to a YAML file so the index can be reopened later

In [None]:
# Save the index schema to redis_schema.yaml; the same file is passed as
# `schema=` when the existing index is reopened further down.
rds.write_schema("redis_schema.yaml")

## Ingesting new documents

#### Example with Web pages

In [None]:
from langchain.document_loaders import WebBaseLoader

In [None]:
# Web pages whose content will be added to the existing index.
web_page_urls = [
    "https://ai-on-openshift.io/getting-started/openshift/",
    "https://ai-on-openshift.io/getting-started/opendatahub/",
    "https://ai-on-openshift.io/getting-started/openshift-data-science/",
    "https://ai-on-openshift.io/odh-rhods/configuration/",
    "https://ai-on-openshift.io/odh-rhods/custom-notebooks/",
    "https://ai-on-openshift.io/odh-rhods/nvidia-gpus/",
    "https://ai-on-openshift.io/odh-rhods/custom-runtime-triton/",
    "https://ai-on-openshift.io/odh-rhods/openshift-group-management/",
    "https://ai-on-openshift.io/tools-and-applications/minio/minio/",
]

# Note: this rebinds `loader`, which previously referred to the PDF loader.
loader = WebBaseLoader(web_page_urls)

In [None]:
# Run the web loader over the configured URLs; the loaded documents are
# kept in `data` for splitting in the next cell.
data = loader.load()

In [None]:
# Same chunking settings as for the PDFs: chunk size 1024, overlap 40.
web_splitter_settings = {"chunk_size": 1024, "chunk_overlap": 40}

text_splitter = RecursiveCharacterTextSplitter(**web_splitter_settings)

# Break the loaded web pages into overlapping chunks (replaces the PDF
# chunks previously held in `all_splits`).
all_splits = text_splitter.split_documents(data)

In [None]:
# Embedding model with the library's default settings, as used when the
# index was created.
embeddings = HuggingFaceEmbeddings()

# Reopen the index created earlier, using the schema file written by
# `write_schema`.
rds = Redis.from_existing_index(
    embeddings,
    redis_url=redis_url,
    index_name=index_name,
    schema="redis_schema.yaml",
)

In [None]:
# Add the web page chunks to the reopened index.
rds.add_documents(all_splits)