<a href="https://colab.research.google.com/github/SamuelDuong/JackBlog.github.io/blob/master/Implementing_a_Retrieval_Augmented_Generation_(RAG)_System_with_OpenAI's%C2%A0API_IMDB_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Implementing a Retrieval-Augmented Generation (RAG) System with OpenAI's API**

In [None]:
# Attach Google Drive to the Colab runtime so files stored there are reachable from this notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Install and import the libraries used in this implementation.

In [None]:
!pip install langchain
!pip install openai
!pip install tiktoken
!pip install faiss-gpu
!pip install langchain_experimental
!pip install "langchain[docarray]"



In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.indexes import VectorstoreIndexCreator
from langchain_experimental.agents.agent_toolkits.csv.base import create_csv_agent
from langchain.agents.agent_types import AgentType
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import FAISS
import tiktoken

In [None]:
import os
from getpass import getpass

# Prompt for the OpenAI API key without echoing it, so the secret does not
# end up in the notebook's saved cell output.
api_key = getpass("Please enter your OpenAI API key: ").strip()
if not api_key:
    # Fail here rather than exporting an empty key to the environment.
    raise ValueError("No OpenAI API key was entered.")

# Set the API key as an environment variable
os.environ["OPENAI_API_KEY"] = api_key

# Confirm that the variable was set (the key itself is never printed)
print("OPENAI_API_KEY has been set!")


Please enter your OpenAI API key: [REDACTED]
OPENAI_API_KEY has been set!


In [None]:
# Model name string; presumably the chat model intended for later cells — TODO confirm,
# it is not referenced by any of the cells shown in this notebook section.
llm_model = "gpt-3.5-turbo"

In [None]:
import logging
from typing import List, Optional

import pandas as pd

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.helpers import detect_file_encodings

logger = logging.getLogger(__name__)

class TextLoader(BaseLoader):
    """Load the text of a plain-text file or of one column of an Excel sheet.

    The whole file is returned as a single ``Document`` whose metadata holds
    the source path. This definition replaces the ``TextLoader`` name imported
    from ``langchain.document_loaders`` earlier in the notebook.

    Args:
        file_path: Path to the file to load.
        encoding: File encoding to use. If None, the file will be loaded with the default system encoding.
        autodetect_encoding: Whether to try to autodetect the file encoding if the specified encoding fails.
        sheet_name: Name of the Excel sheet to read data from. Applicable only if the file is in Excel format.
            When None, the first sheet of the workbook is read.
        text_column: Name of the column containing text data. Required (and applicable only) if the file
            is in Excel format.
    """

    def __init__(
        self,
        file_path: str,
        encoding: Optional[str] = None,
        autodetect_encoding: bool = False,
        sheet_name: Optional[str] = None,
        text_column: Optional[str] = None
    ):
        """Initialize with file path and optional parameters."""
        self.file_path = file_path
        self.encoding = encoding
        self.autodetect_encoding = autodetect_encoding
        self.sheet_name = sheet_name
        self.text_column = text_column

    def _is_excel(self) -> bool:
        """Return True when the path has an ``.xlsx`` extension (case-insensitive)."""
        return str(self.file_path).lower().endswith('.xlsx')

    def _read_excel_text(self) -> str:
        """Return the values of ``text_column`` joined with newlines."""
        if self.text_column is None:
            raise ValueError("text_column must be provided when loading an Excel file")
        # sheet_name=None would make pandas return a dict of every sheet, so
        # fall back to the first sheet (index 0) when no sheet was requested.
        sheet = 0 if self.sheet_name is None else self.sheet_name
        frame = pd.read_excel(self.file_path, sheet_name=sheet)
        return "\n".join(frame[self.text_column].astype(str))

    def load(self) -> List[Document]:
        """Load text data from the file.

        Returns:
            A one-element list holding a ``Document`` with the file's text and
            ``{"source": <file path>}`` as metadata.

        Raises:
            RuntimeError: If the file cannot be read or decoded, including the
                case where autodetection is enabled but none of the detected
                encodings can decode the file.
        """
        text = ""
        try:
            if self._is_excel():
                text = self._read_excel_text()
            else:
                with open(self.file_path, encoding=self.encoding) as f:
                    text = f.read()
        except UnicodeDecodeError as e:
            if self.autodetect_encoding and not self._is_excel():
                detected_encodings = detect_file_encodings(self.file_path)
                for encoding in detected_encodings:
                    logger.debug(f"Trying encoding: {encoding.encoding}")
                    try:
                        with open(self.file_path, encoding=encoding.encoding) as f:
                            text = f.read()
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    # No candidate encoding worked: fail loudly instead of
                    # returning a Document with empty content.
                    raise RuntimeError(f"Error loading {self.file_path}") from e
            else:
                raise RuntimeError(f"Error loading {self.file_path}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e

        metadata = {"source": str(self.file_path)}
        return [Document(page_content=text, metadata=metadata)]


# Document Loading & Transformers


In [None]:
# NOTE(review): `data` is first assigned in the next cell (loader.load() / split_documents),
# so on a fresh kernel this cell only works after that cell has been run.
data

[Document(page_content='Title: Red Light Special Genres: ["Adult"] Release Date: March 27, 2017 (United States) IMDb Rating: Unknown/10 from Unknown users Awards: Unknown Critics Reviews: Unknown Episode Count: Unknown User Reviews: Unknown total, Average Rating: Unknown Featured Review: Unknown Box Office Budget: Unknown ---  Title: Salongen med Margreth Olin og valpen Juno Genres: Unknown Release Date: May 21, 2017 (Norway) IMDb Rating: Unknown/10 from Unknown users Awards: Unknown Critics Reviews: Unknown Episode Count: Unknown User Reviews: Unknown total, Average Rating: Unknown Featured Review: Unknown Box Office Budget: Unknown ---  Title: Alle kan gjere litt ved Kristin L??d??en Hope Genres: Unknown Release Date: December 11, 2019 (Norway) IMDb Rating: Unknown/10 from Unknown users Awards: Unknown Critics Reviews: Unknown Episode Count: Unknown User Reviews: Unknown total, Average Rating: Unknown Featured Review: Unknown Box Office Budget: Unknown ---  Title: Episode 8: Steve Za

In [None]:

# Source file and the encoding used to read it.
file_path = "/datalab/imdb_info_cleaned.txt"
encoding = 'latin-1'

# Splitter settings: chunk size and the overlap between neighbouring chunks.
chunk_size = 1000
chunk_overlap = 200

# Read the file into Document objects.
loader = TextLoader(file_path=file_path, encoding=encoding)
raw_documents = loader.load()

# Split the loaded documents into overlapping chunks.
text_splitter = CharacterTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
data = text_splitter.split_documents(raw_documents)

# Embed the chunks into a FAISS vector store and expose it as a retriever.
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(data, embeddings)
retriever = db.as_retriever()


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
!pip install faiss-cpu
!pip install langchain
from langchain.embeddings import OpenAIEmbeddings



# **Similarity search by vector**

In [None]:
query = "When was the movie Magnets Genres released"
docs = await db.asimilarity_search(query)
print(docs[0].page_content)

In [None]:
query = "What is the imdb rating for movie title Abdullah"
docs = await db.asimilarity_search(query)
print(docs[0].page_content)

In [None]:
query = "What type of genre for John Wick "
docs = await db.asimilarity_search(query)
print(docs[0].page_content)


# **Similarity score threshold retrieval**

We can also configure the retriever with a similarity score threshold, so that it only returns documents whose score is above that threshold.

In [None]:
# Build a retriever that applies a similarity score threshold, so only chunks whose
# relevance score is above the threshold are returned. The default `retriever`
# created earlier is left untouched.
SCORE_THRESHOLD = 0.5
threshold_retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": SCORE_THRESHOLD},
)
docs = threshold_retriever.get_relevant_documents("what was the review writen about ketanji brown jackson")

