In [1]:
from llama_index.core import (
    SimpleDirectoryReader,
    load_index_from_storage,
    VectorStoreIndex,
    StorageContext,
)
from llama_index.vector_stores.faiss import FaissVectorStore
from IPython.display import Markdown, display

In [2]:
import os
from getpass import getpass

import openai

# Needed to synthesize responses later.
# Never commit a real key: reuse OPENAI_API_KEY when the environment already
# provides it and only prompt (without echo) when it is missing. Assigning a
# literal placeholder here would overwrite a valid key on every run.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [3]:
# load documents
import os
from pathlib import Path

# The PDF folder can be overridden with the RESEARCHLENS_PDF_DIR environment
# variable so the notebook runs on other machines; when the variable is unset
# the original local path is used, so existing behaviour is unchanged.
PDF_DIR = Path(
    os.environ.get(
        "RESEARCHLENS_PDF_DIR",
        "/home/surya/NEU/CS5100 FAI/Project/ResearchLens/experiments/RAG/pdfs",
    )
)
documents = SimpleDirectoryReader(str(PDF_DIR)).load_data()

In [4]:
from llama_index.core.node_parser import SentenceSplitter

# Chunking parameters passed to the sentence splitter, named so they are
# easy to find and tune.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 10

text_splitter = SentenceSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [5]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Alternative kept for reference: the library-default embedding model
# (noted by the original author as BAAI/bge-small-en).
# embed_model = HuggingFaceEmbedding()

# Active choice: the sentence-transformers MPNet model named below.
EMBED_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"
embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)

In [6]:
import os
from getpass import getpass

from llama_index.llms.cohere import Cohere

# Credentials must not live in the notebook source. Read the key from the
# COHERE_API_KEY environment variable and fall back to an interactive prompt
# (input is not echoed). Any key that was previously committed in this cell
# should be treated as compromised and rotated.
cohere_api_key = os.environ.get("COHERE_API_KEY") or getpass("Cohere API key: ")

cohere_model = Cohere(api_key=cohere_api_key)

In [7]:
from llama_index.core import Settings

# Register the notebook-wide defaults on the global Settings object:
# the two models first, then the chunking strategy.
Settings.embed_model = embed_model  # HuggingFace embedding model built above
Settings.llm = cohere_model  # Cohere LLM built above
Settings.text_splitter = text_splitter  # SentenceSplitter built above

Re-use the index that was persisted to `./storage` by the RAG_doc notebook.

In [8]:
# Directory that holds the previously persisted index.
PERSIST_DIR = "./storage"

# Rebuild the storage context from disk, then load the index from it.
storage_context = StorageContext.from_defaults(persist_dir=PERSIST_DIR)
index = load_index_from_storage(storage_context)

## Test chat

In [9]:
from llama_index.core.memory import ChatMemoryBuffer

# Chat memory buffer created with a 1500-token limit.
memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

# Build a chat engine in "context" mode on top of the loaded index.
# Adjacent string literals are concatenated with no separator, so every
# fragment must carry its own whitespace; the first fragment now ends with a
# space so the prompt no longer reads "research papers.Here are ...".
# NOTE(review): for chat_mode="context" the library documents
# `system_prompt` / `context_template`; confirm that `context_prompt` is
# actually honoured in this mode and not silently ignored.
chat_engine = index.as_chat_engine(
    chat_mode="context",
    memory=memory,
    llm=cohere_model,
    context_prompt=(
        "You are a chatbot, able to have normal interactions, as well as explaining research papers. "
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Use the previous chat history, or the context above, to interact and help the user."
    ),
    verbose=True,
)

In [10]:
# Build a chat engine in "condense_plus_context" mode. This rebinds
# `chat_engine`, so any engine previously stored under that name is replaced
# and later cells talk to this one.
# The prompt describes the actual corpus (research papers) instead of the
# tutorial's Paul Graham essay, and the sentence before "Here are ..." ends
# with a space because adjacent string literals are joined with no separator.
chat_engine = index.as_chat_engine(
    chat_mode="condense_plus_context",
    context_prompt=(
        "You are a chatbot, able to have normal interactions, as well as"
        " explaining research papers. "
        "Here are the relevant documents for the context:\n"
        "{context_str}"
        "\nInstruction: Based on the above documents, provide a detailed answer for the user question below."
    ),
)

In [11]:
# The user message: table rows pasted from a paper, followed by two
# questions about them. Surrounding whitespace is stripped before sending.
user_query = """
StarCoder [15]
 15.6B
 6.3
 4.1
 0.7
 4.7
CodeLlama-Instruct [27]
 13B
 33.3
 11.0
 1.4
 18.7
WizardCoder-Python-V1.0 [23]
 13B
 39.7
 15.1
 4.3
 23.6
DeepSeek-Coder-Instruct [8]
 6.7B
 49.4
 18.7
 3.6
 29.2
SFT on APPS+

How is the DeepSeek-Coder-Instruct model related to the StepCoder paper?
What was the training dataset that was used to train DeepSeek-Coder-Instruct model?            
""".strip()

# Send the message to the chat engine and keep the reply for inspection.
response = chat_engine.chat(user_query)

In [12]:
# Display the source nodes attached to the response.
response.source_nodes

[NodeWithScore(node=TextNode(id_='aba63e77-2649-48a0-becb-4c7a26182e5e', embedding=None, metadata={'page_label': '6', 'file_name': 'Stepcoder.pdf', 'file_path': '/home/surya/NEU/CS5100 FAI/Project/ResearchLens/experiments/RAG/pdfs/Stepcoder.pdf', 'file_type': 'application/pdf', 'file_size': 652507, 'creation_date': '2024-04-04', 'last_modified_date': '2024-04-04'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='a46cc157-0b06-4a22-870b-aacdb1b60185', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '6', 'file_name': 'Stepcoder.pdf', 'file_path': '/home/surya/NEU/CS5100 FAI/Project/ResearchLens/experiments/RAG/pdfs/Stepcoder.pdf', 'file_type': 'application/pdf', 'file_size': 652507, 'creation_da

In [13]:
# print(response.source_nodes[0].text)

In [14]:
# for chat in chat_engine.chat_history:
#     print(chat.role)
#     print(chat.content)
#     print()

In [15]:
# Print the chat engine's reply as plain text.
print(response)

DeepSeek-Coder-Instruct is one of the well-known base models for code completion, and it was utilized as a backbone in many reinforcement learning approaches for code completion, including StepCoder. 

The DeepSeek-Coder-Instruct model is a neural language model that has been fine-tuned on the APPS+ dataset, which is a dataset of 28 million functions and snippets of code. The model was fine-tuned using a supervised learning approach, where the model was trained to predict the next token in a sequence of tokens, with the goal of generating functional code. 

The StepCoder model is a reinforcement learning approach for code completion that utilized the DeepSeek-Coder-Instruct model as a backbone and was trained on the APPS+ dataset. The StepCoder model used a combination of reinforcement learning and supervised learning to produce more efficient and cleaner code outputs.
