In [19]:
import re
import unicodedata

def process_documents(documents):
    '''
    Clean every document's text and join the results into one string.

    Per document: private-use ("unknown") characters are dropped, every
    character other than ASCII letters, digits, whitespace, ',' and '.' is
    removed, whitespace runs are collapsed to a single space, and the text is
    lower-cased.

    @param documents: List of documents; each must expose `page_content`.
    @return: a long string. Cleaned text of all documents, separated by a
             single space. Documents that are empty after cleaning are skipped.
    '''
    cleaned_pages = []
    for doc in documents:
        text = doc.page_content
        # Remove unknown characters (Unicode category 'Co' = private use)
        text = ''.join(c for c in text if unicodedata.category(c) != 'Co')
        # Remove non-alphanumeric characters. This runs BEFORE whitespace is
        # collapsed so that deleting a symbol between two spaces ("a - b")
        # cannot leave a double space behind.
        text = re.sub(r'[^a-zA-Z0-9\s,.]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        # Convert to lowercase
        text = text.lower()
        if text:
            cleaned_pages.append(text)

    # Join with a space so the last word of one document is not fused with the
    # first word of the next one.
    return ' '.join(cleaned_pages)

In [20]:
from langchain.document_loaders import PyPDFLoader
# Reading pdfs
def read_pdf(file_path):
    '''
    Load a PDF through PyPDFLoader.

    @param file_path: path of the PDF file to read.
    @return: whatever PyPDFLoader(file_path).load_and_split() produces.
    '''
    return PyPDFLoader(file_path).load_and_split()

In [21]:
# Load the PDF and flatten its documents into one cleaned string.
documents = read_pdf('release engineering.pdf')
processed_text = process_documents(documents)
# Show how many documents the loader returned, then the cleaned text.
print(len(documents))
print(processed_text)

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)


3
1. summarize the 6 phases of release engineering discussed in the article and explain the role each phase plays in the overall release process. 12 sentences per phase is sufficient 1 integration branching and merging this phase involves merging code changes from individual development branches into the teams branch and eventually into the projects master branch, ensuring smooth collaboration and code synchronization. 2 continuous integration building and testing here, code changes are continuously integrated into the mainline codebase. this process involves automatically building and testing new commits or merges to identify any regressions promptly. 3 build system the build system encompasses the specifications used to generate project deliverables such as binaries or packages from the source code. its crucial for ensuring consistency and reliability in the build process. 4 infrastructureascode this phase involves defining and managing infrastructure using code, allowing for automat

### Perform Naive Chunking (RecursiveCharacterTextSplitter)

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
def recursive_character_text_splitter(text):
    '''
    Split text into overlapping chunks with RecursiveCharacterTextSplitter.

    The chunk size is 500 characters, or 800 when the text is longer than
    2000 characters; consecutive chunks overlap by 20 characters.

    @param text: a long string
    @return: a list of strings. Each string is a chunk of the text.
    '''
    chunk_size = 800 if len(text) > 2000 else 500

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,  # chunk size, measured with length_function
        chunk_overlap=20,  # overlap by 20 characters
        length_function=len,
        # Separators: blank line, newline, the position right after ". "
        # (end of a sentence), a single space, and the empty string.
        # The lookbehind is a raw string because "\." in a normal literal is
        # an invalid escape sequence and triggers a warning on modern Python.
        separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
        is_separator_regex=True,  # use regex for separators
    )
    return text_splitter.split_text(text)

In [23]:
naive_chunks = recursive_character_text_splitter(processed_text)
# Preview the first 15 chunks, each followed by an empty line.
for chunk in naive_chunks[:15]:
    print(chunk, end="\n\n")

1. summarize the 6 phases of release engineering discussed in the article and explain the role each phase plays in the overall release process. 12 sentences per phase is sufficient 1 integration branching and merging this phase involves merging code changes from individual development branches into the teams branch and eventually into the projects master branch, ensuring smooth collaboration and code synchronization. 2 continuous integration building and testing here, code changes are continuously integrated into the mainline codebase. this process involves automatically building and testing new commits or merges to identify any regressions promptly.

3 build system the build system encompasses the specifications used to generate project deliverables such as binaries or packages from the source code. its crucial for ensuring consistency and reliability in the build process. 4 infrastructureascode this phase involves defining and managing infrastructure using code, allowing for automate

### Instantiate Embedding Model

In [24]:
import cohere
def embed(docs, input_type='search_document'):
    '''
    Embed a list of texts with Cohere's embed-english-v3.0 model.

    @param docs: list of strings to embed.
    @param input_type: value forwarded to Cohere's `input_type` argument.
                       Defaults to 'search_document' (texts being indexed);
                       Cohere documents 'search_query' for query texts.
    @return: the `.embeddings` of the Cohere response (one embedding per
             input text), or [] when docs is empty.
    @raise RuntimeError: if the COHERE_API_KEY environment variable is not set.
    '''
    import os  # local import keeps this cell runnable on its own

    # Nothing to embed: skip the API call entirely.
    if len(docs) == 0:
        return []

    # The key is read from the environment; credentials must never be stored
    # in the notebook itself.
    api_key = os.environ.get("COHERE_API_KEY")
    if not api_key:
        raise RuntimeError(
            "COHERE_API_KEY environment variable is not set; "
            "export it before calling embed()."
        )

    co = cohere.Client(api_key=api_key)
    # Local alternatives tried previously: SentenceTransformer with
    # 'sentence-transformers/all-MiniLM-L6-v2' or
    # 'Snowflake/snowflake-arctic-embed-m' and model.encode(docs).
    return co.embed(
        texts=docs,
        model='embed-english-v3.0',
        input_type=input_type,
    ).embeddings

In [25]:
import numpy as np
def upSertEmbeds(processed_text, index, batch_size=100):
    '''
    Embed text chunks and upsert them into an index.

    Each chunk becomes one vector whose id is its position in the list (as a
    string), so upserting a new list into the same index overwrites the
    vectors stored under the same positions.

    @param processed_text: list of strings, one per chunk: [text1, text2, ...]
    @param index: Pinecone index (any object exposing `upsert(vectors)`).
    @param batch_size: maximum number of vectors sent per upsert call, so that
                       a long document is not sent as one oversized request.
    @raise ValueError: if batch_size is smaller than 1.
    '''
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    # Nothing to store: avoid both the embedding call and an empty upsert.
    if not processed_text:
        return

    embeds = embed(processed_text)
    vectors = [
        {'id': str(i), 'values': values, 'metadata': {'text': text}}
        for i, (values, text) in enumerate(zip(embeds, processed_text))
    ]

    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors[start:start + batch_size])

In [26]:
import os

from pinecone import Pinecone as PineconeClient

# The Pinecone key is read from the environment instead of being stored in the
# notebook; a missing PINECONE_API_KEY raises KeyError on this line.
pc = PineconeClient(api_key=os.environ["PINECONE_API_KEY"])
naive_index = pc.Index(name="test2") # you have to change the index when you run the code again
upSertEmbeds(naive_chunks, naive_index)

### Setup the API Key for LLM

In [27]:
# Alternative backend: langchain_cohere.ChatCohere can be assigned to `client`
# instead. Supply its key from an environment variable -- do not paste
# credentials into the notebook.

from langchain_community.llms import Ollama

# LLM handed to get_response and to the question / ground-truth chains.
client = Ollama(model="llama2")



### Perform Semantic Chunking

In [28]:
from langchain.text_splitter import SpacyTextSplitter

def spacy_text_splitter(text):
    '''
    Chunk text with SpacyTextSplitter using the en_core_web_sm pipeline.

    @param text: a long string
    @return: a list of strings. Each string is a chunk of the text.
    '''
    # Texts longer than 2000 characters get the larger chunk size.
    target_size = 800 if len(text) > 2000 else 500

    splitter = SpacyTextSplitter(
        pipeline="en_core_web_sm",
        chunk_size=target_size,
        chunk_overlap=0,
    )
    return splitter.split_text(text)

In [29]:
advanced_chunks = spacy_text_splitter(processed_text)
# Preview the first 15 chunks, each followed by an empty line.
for chunk in advanced_chunks[:15]:
    print(chunk, end="\n\n")

1. summarize the 6 phases of release engineering discussed in the article and explain the role each phase plays in the overall release process.

12 sentences per phase is sufficient 1 integration branching and merging this phase involves merging code changes from individual development branches into the teams branch and eventually into the projects master branch, ensuring smooth collaboration and code synchronization.

2 continuous integration building and testing here, code changes are continuously integrated into the mainline codebase.

this process involves automatically building and testing new commits or merges to identify any regressions promptly.

3 build system the build system encompasses the specifications used to generate project deliverables such as binaries or packages from the source code.

its crucial for ensuring consistency and reliability in the build process.

4 infrastructureascode this phase involves defining and managing infrastructure using code, allowing for aut



In [30]:
# Store the spaCy-split chunks in a separate index from the naive chunks.
advanced_index = pc.Index(name="test3") # you have to change the index when you run the code again
upSertEmbeds(advanced_chunks, advanced_index)

In [31]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

def get_response(query, model, top_k_val, index):
    '''
    Answer a question from the chunks retrieved for it (RAG).

    The query is embedded, the top_k_val nearest chunks are fetched from the
    index, their texts are concatenated into a context, and the model is
    asked to answer using only that context.

    @param query: a string. The question to ask the model.
    @param model: the LLM placed in the chain after the prompt.
    @param top_k_val: an int. The number of chunks to retrieve.
    @param index: Pinecone index to query; each match is expected to carry
                  its chunk under metadata['text'].
    @return: a string. The response from the model.
    '''
    # embed() returns one embedding per input text; take the single entry so
    # the index receives a flat vector instead of a one-element batch.
    query_vector = embed([query])[0]

    top_k_chunks = index.query(
        vector=query_vector,
        top_k=top_k_val,
        include_values=False,
        include_metadata=True,
    )

    retrieved_chunks = [
        match['metadata'].get('text', 'Default text')
        for match in top_k_chunks['matches']
    ]

    # Every chunk is followed by a single space.
    context = ''.join(chunk + ' ' for chunk in retrieved_chunks)

    # RAG prompt
    template = """
                Answer the question based only on the following context:
                {context}
                Question: {question}
                """

    prompt = ChatPromptTemplate.from_template(template)

    # RAG: the context is fixed per call, the question is passed through.
    chain = (
        RunnableParallel(
            {"context": lambda _: context, "question": RunnablePassthrough()})
        | prompt
        | model
        | StrOutputParser()
    )

    return chain.invoke(query)

In [32]:
# Try the RAG chain once: retrieve 5 chunks from advanced_index and answer.
query = "what is release engineering?"
response = get_response(query, client, 5, advanced_index)
print(response)

Release engineering is a field of software engineering that focuses on the processes and practices involved in building, testing, and deploying software releases. It ensures that the software can be delivered to customers or end-users reliably and efficiently. Release engineering encompasses various activities such as integration, branching, merging, continuous integration, testing, and deployment. The overall goal is to facilitate smooth and frequent software releases, improve software quality, and enhance the efficiency of the software development and delivery process. Modern release engineering practices involve adopting tools and methodologies that improve collaboration, automation, and quality assurance throughout the software release lifecycle.


### Create the Following Datasets

Questions — synthetically generated (groq-mixtral-8x7b-32768)

Contexts — created above(Synthetic data chunks)

Ground Truths — synthetically generated (groq-mixtral-8x7b-32768)

Answers — generated from our Semantic RAG Chain

### Ragas Assessment for naive Chunker

In [33]:
# Accumulators for the synthetic evaluation set.
questions = []
ground_truths_semantic = []
contexts = []
answers = []

# Prompt that asks the LLM to write a question answerable from one context.
question_prompt = """\
You are a teacher preparing a test. Please create a question that can be answered by referencing the following context.

Context:
{context}
"""

# The name is rebound from the raw string to the compiled prompt template.
question_prompt = ChatPromptTemplate.from_template(question_prompt)

# Prompt that asks the LLM to answer a question from the given context only.
ground_truth_prompt = """\
Use the following context and question to answer this question using *only* the provided context.

Question:
{question}

Context:
{context}
"""

# Same rebinding as above: string -> prompt template.
ground_truth_prompt = ChatPromptTemplate.from_template(ground_truth_prompt)

# Both chains are prompt -> client -> plain string output.
question_chain = (question_prompt
        | client
        | StrOutputParser()
    )
ground_truth_chain = ground_truth_prompt | client | StrOutputParser()

In [34]:
# Build the evaluation samples from the first five naive chunks: a generated
# question, the source chunk as context, a ground truth answered from that
# chunk, and the RAG chain's answer retrieved from naive_index.
for chunk in naive_chunks[0:5]:
    questions.append(question_chain.invoke({"context": chunk}))
    # Each contexts entry is a one-element list holding the source chunk.
    contexts.append([chunk])
    # Format the prompt with the chunk text itself; passing contexts[-1] (a
    # list) would insert its Python repr, brackets and quotes included.
    ground_truths_semantic.append(
        ground_truth_chain.invoke({"question": questions[-1], "context": chunk})
    )
    answers.append(get_response(questions[-1], client, 5, naive_index))

### Format the content generated into HuggingFace Dataset Format

In [35]:
from datasets import Dataset

# One record per generated sample, keyed by the dataset's column names.
qagc_list = []
for question, answer, context, ground_truth in zip(
    questions, answers, contexts, ground_truths_semantic
):
    qagc_list.append(
        dict(
            question=question,
            answer=answer,
            contexts=context,
            ground_truth=ground_truth,
        )
    )

naive_eval_dataset = Dataset.from_list(qagc_list)
naive_eval_dataset

Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 5
})

### Implement Ragas metrics and evaluate our created dataset.

In [36]:
import os

from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

# The Cohere key is read from the environment instead of being stored in the
# notebook; a missing COHERE_API_KEY raises KeyError on this line.
co = cohere.Client(api_key=os.environ["COHERE_API_KEY"])

# NOTE(review): `co` is a raw cohere.Client, not an embeddings wrapper.
# Confirm that ragas' evaluate() accepts it for `embeddings`; the recorded run
# logged executor exceptions and produced NaN scores for several rows.
# raise_exceptions=False keeps the run going when a metric fails.
naive_result = evaluate(
    naive_eval_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm=client,
    embeddings=co,
    raise_exceptions=False
)

Evaluating:  10%|█         | 2/20 [00:02<00:18,  1.03s/it]Runner in Executor raised an exception
Traceback (most recent call last):
  File "/Users/chenzhenxu/CodeProjects/CS217-final-project/venv/lib/python3.9/site-packages/ragas/executor.py", line 79, in _aresults
    r = await future
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/asyncio/tasks.py", line 614, in _wait_for_one
    return f.result()  # May raise f.exception().
  File "/Users/chenzhenxu/CodeProjects/CS217-final-project/venv/lib/python3.9/site-packages/ragas/executor.py", line 38, in sema_coro
    return await coro
  File "/Users/chenzhenxu/CodeProjects/CS217-final-project/venv/lib/python3.9/site-packages/ragas/executor.py", line 112, in wrapped_callable_async
    return counter, await callable(*args, **kwargs)
  File "/Users/chenzhenxu/CodeProjects/CS217-final-project/venv/lib/python3.9/site-packages/ragas/metrics/base.py", line 116, in ascore
    raise e
  Fil

In [37]:
# Convert the evaluation result to a DataFrame; the bare name on the last line
# renders it as the cell output.
naive_results_df = naive_result.to_pandas()
naive_results_df

Unnamed: 0,question,answer,contexts,ground_truth,context_precision,faithfulness,answer_relevancy,context_recall
0,Sure! Here's a question that can be used to as...,## Question: Explain the first two phases of r...,[1. summarize the 6 phases of release engineer...,"The first phase of release engineering, Integr...",,,,
1,Here's a question that can be formulated based...,"The correct answer is: The ""Infrastructure as ...",[3 build system the build system encompasses t...,The answer is: 4. Infrastructure as Code. This...,,,,
2,Here is a question that can be answered using ...,One prominent challenge in the release phase o...,"[6 release finally, the release phase involves...",One challenge in the release phase of Continuo...,,,,
3,Here is a potential question for a test based ...,Here is a response to the question based on th...,"[in this case, some models are built to predic...",Here is the answer crafted from the provided c...,1.0,1.0,1.0,1.0
4,Here is a potential question for a test based ...,Test Question: What statistical methods can be...,"[however, this strategy needs a statistical or...",Test Question: What statistical methods can be...,1.0,1.0,1.0,1.0


### Ragas Assessment Comparison for advanced Chunker

In [38]:
import tqdm
# Start from empty accumulators so the advanced-chunker samples are not mixed
# with the naive-chunker ones collected earlier.
questions = []
ground_truths_semantic = []
contexts = []
answers = []
for chunk in tqdm.tqdm(advanced_chunks[0:5]):
    questions.append(question_chain.invoke({"context": chunk}))
    # Each contexts entry is a one-element list holding the source chunk.
    contexts.append([chunk])
    # Format the prompt with the chunk text itself; passing contexts[-1] (a
    # list) would insert its Python repr, brackets and quotes included.
    ground_truths_semantic.append(
        ground_truth_chain.invoke({"question": questions[-1], "context": chunk})
    )
    # Use questions[-1], the question just generated. The singular name
    # `question` is a leftover loop variable from the dataset-building cell,
    # and indexing it with [-1] yields only its last character.
    answers.append(get_response(questions[-1], client, 5, advanced_index))

  0%|          | 0/5 [00:00<?, ?it/s]


TooManyRequestsError: status_code: 429, body: {'message': "You are using a Trial key, which is limited to 1000 API calls / month. You can continue to use the Trial key for free or upgrade to a Production key with higher rate limits at 'https://dashboard.cohere.com/api-keys'. Contact us on 'https://discord.gg/XW44jPfYJu' or email us at support@cohere.com with any questions"}

In [None]:
# One record per generated sample, keyed by the dataset's column names.
qagc_list = []
for question, answer, context, ground_truth in zip(
    questions, answers, contexts, ground_truths_semantic
):
    qagc_list.append(
        dict(
            question=question,
            answer=answer,
            contexts=context,
            ground_truth=ground_truth,
        )
    )

advanced_eval_dataset = Dataset.from_list(qagc_list)
advanced_eval_dataset

: 

In [None]:
# Score the advanced-chunker dataset with the same four metrics, LLM and
# embeddings object used for the naive-chunker evaluation.
# raise_exceptions=False keeps the run going when a metric fails.
advanced_result = evaluate(
    advanced_eval_dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
    llm=client, 
    embeddings=co,
    raise_exceptions=False
)

: 

In [None]:
# Convert the evaluation result to a DataFrame; the bare name on the last line
# renders it as the cell output.
advanced_results_df = advanced_result.to_pandas()
advanced_results_df

: 