In [13]:
# import packages

from tqdm.notebook import tqdm #progress bar
import pandas as pd
from typing import Optional, List, Tuple #type hinting
# from datasets import Dataset #to load in premade example datasets
import matplotlib.pyplot as plt

pd.set_option("display.max_colwidth", None)  # This will be helpful when visualizing retriever outputs

from langchain_text_splitters import RecursiveCharacterTextSplitter #splitter

# LangSmith / OpenAI environment setup: load variables from a local .env file
# (if one exists) so the os.getenv(...) lookups in later cells can find the API keys
import os
import dotenv
dotenv.load_dotenv()

#load in Documents
from langchain.docstore.document import Document as LangchainDocument
# from langchain.text_splitter import RecursiveCharacterTextSplitter #alt import
from langchain_community.document_loaders import DirectoryLoader

from transformers import AutoTokenizer

# embedding and searching
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

#plotting
import pacmap
import numpy as np
import plotly.express as px

#for langsmith tracing
from langsmith import traceable
from langsmith.wrappers import wrap_openai

In [14]:
# commands to run in shell to set up environment variables (gitbash works best)

# export LANGCHAIN_TRACING_V2=true
# export LANGCHAIN_ENDPOINT="https://api.smith.langchain.com"
# export LANGCHAIN_API_KEY=<your-langsmith-api-key>   # no space after "=", otherwise the shell exports an empty value
# LANGCHAIN_API_KEY = os.getenv('LANGCHAIN_API_KEY') # to get api key from .env file
# LANGCHAIN_API_KEY
# export LANGCHAIN_PROJECT="IA-Higher-Ed"

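# export OPENAI_API_KEY=<your-openai-api-key>   # no space after "="; note the code cells look up OPEN_AI_KEY (see os.getenv), so define that name too, e.g. in the .env file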
# OPENAI_API_KEY = os.getenv('OPEN_AI_KEY') # to get api key from .env file
# OPENAI_API_KEY

In [15]:
import openai
from langsmith.wrappers import wrap_openai
from langsmith import traceable

# Wrap the OpenAI client so that LLM calls made through it are auto-traced
client = wrap_openai(openai.Client(api_key=os.getenv("OPEN_AI_KEY")))

@traceable  # Auto-trace this function
def pipeline(user_input: str, model: str = "gpt-3.5-turbo") -> Optional[str]:
    """Send a single user message to the chat model and return its reply.

    Args:
        user_input: Text sent as the only "user" message of the conversation.
        model: Name of the chat model to query. Defaults to "gpt-3.5-turbo",
            so existing one-argument calls behave as before.

    Returns:
        The message content of the first choice in the completion.
    """
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": user_input}],
        model=model,
    )
    return completion.choices[0].message.content

# Smoke test: this issues a real request to the OpenAI API
pipeline("Hello, world!")

'Hello! How can I assist you today?'

In [16]:
# Checkpoint used to embed queries; the saved index is loaded with the same model below
EMBEDDING_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    # model_kwargs={"device": "cuda"},  # using cpu when running locally - change if connecting to GPU for more speed
    encode_kwargs=dict(normalize_embeddings=True),  # Set `True` for cosine similarity
)

# Load the previously saved FAISS index from the local "faiss_index" folder.
# NOTE(security): allow_dangerous_deserialization=True permits loading pickled
# data, which can execute arbitrary code - only load index files you trust.
db = FAISS.load_local(
    folder_path="faiss_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [17]:
# Embed a user query with the same embedding model that is attached to the
# FAISS store, so the query vector lives in the same space as the chunk vectors
user_query = "What is a decision tree?"
query_vector = embedding_model.embed_query(user_query)

In [18]:
# Project the chunk embeddings plus the query embedding down to 2D with PaCMAP
# (PCA is only used to initialise the projection, see `init="pca"` below)

embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

# Pull every stored vector out of the FAISS index with a single bulk call
# (instead of one call per vector) and append the query vector as the LAST row.
# NOTE: despite its name, `embeddings_2d` holds the original, un-projected
# vectors; the name is kept in case other cells refer to it.
embeddings_2d = np.vstack([db.index.reconstruct_n(0, db.index.ntotal), query_vector])

# Fit the data (the index of transformed data corresponds to the index of the original data)
documents_projected = embedding_projector.fit_transform(embeddings_2d, init="pca")





In [19]:
# Visualize the PaCMAP projection

def _chunk_record(row: int, doc_id: str) -> dict:
    """Return the scatter-plot record for the chunk stored at FAISS row `row`."""
    doc = db.docstore.search(doc_id)  # fetch the document once and reuse it
    return {
        "x": documents_projected[row, 0],
        "y": documents_projected[row, 1],
        "source": doc.metadata["source"],
        "extract": doc.page_content[:100] + "...",
        "symbol": "circle",
        "size_col": 4,
    }

# `index_to_docstore_id` maps a FAISS row number to a docstore id, so the row
# number itself (rather than the dict's iteration order) selects the matching
# projected point.
plot_records = [
    _chunk_record(row, doc_id) for row, doc_id in db.index_to_docstore_id.items()
]

# The query vector was appended after all chunk vectors, so it is the last projected row
plot_records.append(
    {
        "x": documents_projected[-1, 0],
        "y": documents_projected[-1, 1],
        "source": "User query",
        "extract": user_query,
        "size_col": 100,
        "symbol": "star",
    }
)

df = pd.DataFrame(plot_records)

# Visualize the embedding
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="source",
    hover_data=["extract"],
    size="size_col",
    symbol="symbol",
    color_discrete_map={"User query": "black"},
    width=1000,
    height=700,
)
fig.update_traces(
    marker=dict(opacity=1, line=dict(width=0, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
)
fig.update_layout(
    legend_title_text="<b>Chunk source</b>",
    title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>",
)
fig.show()

In [20]:
# Expose the FAISS store as a retriever; no arguments are passed, so the
# store's default search settings apply
retriever = db.as_retriever()

In [21]:
### RAG bot

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai
# Read the OpenAI key from the OPEN_AI_KEY environment variable (None when it is not set)
api_key = os.getenv('OPEN_AI_KEY')

class RagBot:
    """Minimal retrieval-augmented chat bot whose steps are traced.

    A question is first passed to the retriever; the returned documents are
    then placed in the system prompt of a chat-completion request.
    """

    def __init__(self, retriever, model: str = "gpt-3.5-turbo", api_key: Optional[str] = None):
        """Store the retriever and build the wrapped OpenAI client.

        Args:
            retriever: Object exposing `invoke(question)`; its result is
                passed to the LLM as the docs to answer from.
            model: Name of the chat model used for answering.
            api_key: OpenAI API key. When omitted, the OPEN_AI_KEY environment
                variable is read, so the class does not rely on any
                notebook-level variable being defined first.
        """
        self._retriever = retriever
        if api_key is None:
            api_key = os.getenv("OPEN_AI_KEY")
        # Wrapping the client instruments the LLM
        self._client = wrap_openai(openai.Client(api_key=api_key))
        self._model = model

    @traceable()
    def retrieve_docs(self, question):
        """Return whatever the retriever yields for `question`."""
        return self._retriever.invoke(question)

    @traceable()
    def invoke_llm(self, question, docs):
        """Ask the chat model to answer `question` using `docs`.

        Args:
            question: The user question, sent as the "user" message.
            docs: Iterable of retrieved documents; interpolated into the
                system prompt via an f-string.

        Returns:
            A dict with "answer" (content of the first completion choice) and
            "contexts" (the documents as a list, objects left unchanged).
        """
        # NOTE(review): `{docs}` below renders the documents through their
        # default string form - confirm that is the intended prompt format.
        response = self._client.chat.completions.create(
            model=self._model,
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful AI teaching assistant with expertise in Machine Learning."
                    " Use the following docs to produce a solution to the user question."
                    # leading space keeps this sentence from fusing with the previous one
                    " If you do not know the answer, respond with 'Couldn't tell ya' \n\n"
                    f"## Docs\n\n{docs}",
                },
                {"role": "user", "content": question},
            ],
        )

        # Evaluators will expect "answer" and "contexts"
        return {
            "answer": response.choices[0].message.content,
            "contexts": list(docs),
        }

    @traceable()
    def get_answer(self, question: str):
        """Retrieve documents for `question`, then answer it with the LLM."""
        docs = self.retrieve_docs(question)
        return self.invoke_llm(question, docs)

# Build the bot on top of the FAISS retriever, using the default chat model
rag_bot = RagBot(retriever)

In [22]:
# Ask a sample question; the bare last expression displays the raw answer string
response = rag_bot.get_answer("What package should I use to build a svm in Python?")
response["answer"]

'To build a Support Vector Machine (SVM) in Python, you can use the scikit-learn package. Scikit-learn is an open-source machine learning library in Python that contains a number of state-of-the-art machine learning algorithms, including SVM. It is widely used in industry and academia for machine learning tasks.\n\nIf you want to build an SVM model in Python using scikit-learn, you can refer to the scikit-learn documentation for detailed instructions and examples.'

In [23]:
# print() renders the "\n" escapes in the answer as real line breaks
print(response['answer'])

To build a Support Vector Machine (SVM) in Python, you can use the scikit-learn package. Scikit-learn is an open-source machine learning library in Python that contains a number of state-of-the-art machine learning algorithms, including SVM. It is widely used in industry and academia for machine learning tasks.

If you want to build an SVM model in Python using scikit-learn, you can refer to the scikit-learn documentation for detailed instructions and examples.


In [24]:
# List the source file of every retrieved document that was passed to the LLM
# (each `source` here is a document object; the path is in its metadata)
for source in response['contexts']:
    print(source.metadata['source'])

RAG-docs\processed\IntroToMLwithPython-MullerGuido.txt
RAG-docs\processed\IntroToMLwithPython-MullerGuido.txt
RAG-docs\processed\PythonML-Lee.txt
RAG-docs\processed\PythonDSHandbook-VanderPlas.txt
