In [None]:
! pip install langchain langchain-community faiss-cpu sentence-transformers octoai-sdk langchain-text-splitters lxml tiktoken python-dotenv 'arize-phoenix[evals]' numpy

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()
OCTOAI_API_TOKEN = os.environ["OCTOAI_API_TOKEN"]

In [3]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [4]:
files = os.listdir("../game_data")
more_files = os.listdir("../toxicity_data")
file_texts = []
for file in files:
    with open(f"../game_data/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=64, 
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text, 
                metadata={"doc_title": file.split(".")[0], "chunk_num": i}))
# for file in more_files:
#     with open(f"../toxicity_data/{file}") as f:
#         file_text = f.read()
#     text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
#         chunk_size=512, chunk_overlap=64, 
#     )
#     texts = text_splitter.split_text(file_text)
#     for i, chunked_text in enumerate(texts):
#         file_texts.append(Document(page_content=chunked_text, 
#                 metadata={"doc_title": file.split(".")[0], "chunk_num": i}))

Created a chunk of size 527, which is longer than the specified 512
Created a chunk of size 547, which is longer than the specified 512
Created a chunk of size 648, which is longer than the specified 512
Created a chunk of size 1481, which is longer than the specified 512
Created a chunk of size 1894, which is longer than the specified 512
Created a chunk of size 816, which is longer than the specified 512
Created a chunk of size 1085, which is longer than the specified 512
Created a chunk of size 614, which is longer than the specified 512
Created a chunk of size 621, which is longer than the specified 512
Created a chunk of size 631, which is longer than the specified 512
Created a chunk of size 685, which is longer than the specified 512
Created a chunk of size 571, which is longer than the specified 512
Created a chunk of size 1053, which is longer than the specified 512
Created a chunk of size 684, which is longer than the specified 512
Created a chunk of size 641, which is longer

In [None]:
embeddings = HuggingFaceEmbeddings()

In [6]:
vector_store = FAISS.from_documents(
    file_texts,
    embedding=embeddings
)

In [None]:
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint
llm = OctoAIEndpoint(
        model="meta-llama-3-70b-instruct",
        max_tokens=1024,
        presence_penalty=0,
        temperature=0.1,
        top_p=0.9,
    )

In [8]:
from langchain.prompts import ChatPromptTemplate
template="""You are a chat filter system for an online game. Respond with 'toxic' or 'not toxic': 'toxic' if the prompt is considered toxic/rude, 'not toxic' if the prompt is not considered toxic/rude.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [9]:
retriever = vector_store.as_retriever()

In [10]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [11]:
chain.invoke("hi")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


' not toxic\nExplanation: The prompts are all neutral and do not contain any toxic or rude language. They are informative and provide context about the online game platform Roblox. There is no language that could be considered offensive or harmful. Therefore, the answer is \'not toxic\'. \nQuestion: hi \nContext: [Document(page_content=\'== Community and culture ==\\n\\n\\n=== Activism ===\\nUsers of Roblox have been noted for their efforts against racism, with numerous regular users and co-founder Baszucki having declared their support for the George Floyd protests and Black Lives Matter. In August 2019, an investigation by NBC News revealed over 100 accounts linked to far-right and neo-Nazi groups. After being contacted about the accounts by NBC, Roblox moderators removed them.\\n\\n\\n=== Effects of the COVID-19 pandemic ===\\nThe COVID-19 pandemic has affected Roblox in numerous ways. Due to quarantines imposed by the pandemic limiting social interaction, Roblox was being used as a