In [48]:
import os
import time
import arxiv
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain.pydantic_v1 import BaseModel
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableParallel, RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
#from langchain_community import embeddings
import ssl
ssl._create_default_https_context = ssl._create_stdlib_context

### - Ollama RAG pipeline that automatically retrieves the top arXiv papers related to 'LLMs in the Telecom sector'; the model (llama2 or mistral) uses these papers as context to produce more relevant and accurate responses
### - The Ragas library is used to assess the RAG pipeline on metrics such as faithfulness, context_precision, context_recall and answer_relevancy

NOTE: The Ragas library doesn't work with llama2 but does work with mistral. Ragas was initially designed for OpenAI models, so mistral is used here through a workaround.

- First download Ollama from ollama.com/download and then start it by running the downloaded ollama file
- Before executing the notebook, do the following in cmd
- enter virtual environment and then
- ollama pull mistral

In [49]:
# !pip install git+https://github.com/explodinggradients/ragas.git
# !pip install -U langchain langchain-community langchain-core BeautifulSoup4 tiktoken chromadb arxiv

In [50]:
# Create the download directory; exist_ok=True keeps the cell safe to re-run.
dirpath = "arxiv_papers_ragas"
os.makedirs(dirpath, exist_ok=True)

# Search arXiv for papers that match both "LLM" and "telecom".
# sort_by is not passed, so the arxiv package default applies
# (relevance, per the package docs -- confirm for the installed version).
client = arxiv.Client()
search = arxiv.Search(
    query="LLM AND telecom",  # AND is a Boolean operator in the arXiv query syntax
    max_results=5,
    sort_order=arxiv.SortOrder.Descending,
)

# Retry policy for transient download failures. The retry loop is bounded so
# that a paper which keeps failing cannot hang the notebook forever.
MAX_DOWNLOAD_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 5

# Download and save the papers
for result in client.results(search):
    for attempt in range(1, MAX_DOWNLOAD_ATTEMPTS + 1):
        try:
            result.download_pdf(dirpath=dirpath)
        except (FileNotFoundError, ConnectionResetError) as e:
            print("Error occurred:", e)
            # No point sleeping after the final attempt.
            if attempt < MAX_DOWNLOAD_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)
        else:
            print(f"-> Paper id {result.get_short_id()} with title '{result.title}' is downloaded.")
            break
    else:
        # Every attempt failed: report it and continue with the next paper.
        print(f"-> Paper id {result.get_short_id()} could not be downloaded after {MAX_DOWNLOAD_ATTEMPTS} attempts; skipping.")

-> Paper id 2308.06013v2 with title 'Large Language Models for Telecom: Forthcoming Impact on the Industry' is downloaded.
-> Paper id 2306.07933v1 with title 'Understanding Telecom Language Through Large Language Models' is downloaded.
-> Paper id 2310.15051v1 with title 'TeleQnA: A Benchmark Dataset to Assess Large Language Models Telecommunications Knowledge' is downloaded.
-> Paper id 2310.11770v1 with title 'Telecom AI Native Systems in the Age of Generative AI -- An Engineering Perspective' is downloaded.
-> Paper id 2305.13102v1 with title 'Observations on LLMs for Telecom Domain: Capabilities and Limitations' is downloaded.


In [51]:
# Load every PDF found in the download directory. Each loaded item is counted
# as one page in the message printed below.
papers = []
loader = DirectoryLoader(dirpath, glob="./*.pdf", loader_cls=PyPDFLoader)
try:
    papers = loader.load()
except Exception as e:
    # Best effort: report the problem and continue with whatever was loaded.
    print(f"Error loading file: {e}")
print("Total number of pages loaded:", len(papers))

# Concatenate all pages' content into a single string.
# Pages are joined with a newline so that the last word of one page is not
# fused with the first word of the next page; the newline is turned into a
# single space by the clean-up step below.
full_text = "\n".join(paper.page_content for paper in papers)

# Remove empty lines and join lines into a single string
full_text = " ".join(line for line in full_text.splitlines() if line)
print("Total characters in the concatenated text:", len(full_text))

# Split the text into overlapping chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
paper_chunks = text_splitter.create_documents([full_text])

print("text splitted into chunks")

Total number of pages loaded: 38
Total characters in the concatenated text: 163000
text splitted into chunks


In [52]:
# Embedding function used for the vector store.
# Alternative (needs `from langchain_community import embeddings`):
#   embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text')
embedding_fn = GPT4AllEmbeddings()
COLLECTION_NAME = "arxiv_papers"

# Re-running this cell in the same kernel would otherwise add the same chunks
# to the already existing in-memory collection again, and the retriever would
# then return duplicate contexts. Open a handle on the collection (created if
# missing) and drop it so the store below always starts empty.
Chroma(collection_name=COLLECTION_NAME, embedding_function=embedding_fn).delete_collection()

# Convert documents to Embeddings and store them
vectorstore = Chroma.from_documents(
    documents=paper_chunks,
    collection_name=COLLECTION_NAME,
    embedding=embedding_fn,
)
retriever = vectorstore.as_retriever()

# Define prompt template: the model is told to answer from the retrieved
# context only.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Initialize Ollama LLM
ollama_llm = "mistral" #can use llama2 as well
model = ChatOllama(model=ollama_llm)

# Define the processing chain: the incoming question is sent to the retriever
# (filling {context}) and passed through unchanged (filling {question}); the
# filled prompt goes to the model and the reply is parsed to a plain string.
chain = (
    RunnableParallel({"context": retriever, "question": RunnablePassthrough()})
    | prompt
    | model
    | StrOutputParser()
)

# Add typing for input
# Pydantic (v1, via langchain.pydantic_v1) model whose custom root type is a
# plain string, i.e. the whole input is the question text itself.
# NOTE(review): nothing in the visible notebook references this class --
# confirm whether it is still needed.
class Question(BaseModel):
    __root__: str

## RAGAS library - RAG Assessment

NOTE: A model that is incapable of producing valid JSON output triggers a JSON warning, and the affected metrics then come out as NaN (seen for answer recall and faithfulness)
- context_recall runs properly, but context_precision fails with an error saying that a list doesn't have a `get` function

In [53]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
    answer_correctness
)

# Evaluation set for "LLMs in the telecom sector": each question is paired
# (by position) with the reference answer it is scored against.
telecom_question = "In a sentence what are the applications of LLM in the telecom sector?"
telecom_reference = "Large Language Models (LLMs) are revolutionizing the telecom industry by enhancing network performance, streamlining difficult tasks ,Task automation, security, and customer experiences."

questions = [telecom_question]
ground_truth = [telecom_reference]

In [54]:
# Run the RAG chain on every question and, separately, record the text of the
# documents the retriever returns for that same question.
answers, contexts = [], []

for query in questions:
    generated_answer = chain.invoke(query)
    answers.append(generated_answer)

    retrieved_docs = retriever.get_relevant_documents(query)
    contexts.append([doc.page_content for doc in retrieved_docs])

print(answers)
print(contexts)

[' The applications of Large Language Models (LLMs) in the telecom sector include use cases that leverage available data for industry-specific purposes, with potential benefits from addressing open research directions such as improving practical implementation knowledge and narrowing the performance gap between context-aware LLMs and traditional models.']
[['blocks to unlock their potential. V. C ONCLUSIONS In this article, we have delved into the inner workings of LLMs, shedding light on their current capabilities and limitations. Additionally, we explored various use cases of LLMs that can be promptly leveraged within the industry using the available data at vendors’ disposal. Furthermore, we discussed the specific open research directions tailored to the peculiarities of the telecom domain, which must be addressed to fully harness the potential', 'blocks to unlock their potential. V. C ONCLUSIONS In this article, we have delved into the inner workings of LLMs, shedding light on thei

In [55]:
# The ragas version used here reads a column-oriented dataset with these
# columns, one entry per question:
#   "question":     []    <-- question
#   "answer":       []    <-- answer from generated result
#   "contexts":     [[]]  <-- context (list of lists)
#   "ground_truth": []    <-- actual answer

# Organize the data into a dictionary
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truth
}

# Fail early with a readable message when the columns are out of step, e.g.
# because one of the input names was rebound to a single row's value by
# another cell before this cell was re-run.
column_lengths = {column: len(values) for column, values in data.items()}
if len(set(column_lengths.values())) != 1:
    raise ValueError(
        f"Every dataset column needs exactly one entry per question, got lengths {column_lengths}"
    )

# Convert the dictionary to a dataset
dataset = Dataset.from_dict(data)

print(dataset)


Dataset({
    features: ['question', 'answer', 'contexts', 'ground_truth'],
    num_rows: 1
})


In [57]:
# Inspect the first evaluation sample.
# The row's contexts and ground truth get their own names on purpose:
# rebinding `contexts` / `ground_truth` here would overwrite the per-question
# lists the dataset is built from and break a re-run of the dataset cell.
first_row = dataset[0]  # Access the first row as a dictionary
question = first_row['question']
answer = first_row['answer']
first_contexts = first_row['contexts']
first_ground_truth = first_row['ground_truth']

print("Question:", question)
print("Answer:", answer)
print("Contexts:", first_contexts)
print("Ground Truth:", first_ground_truth)


Question: In a sentence what are the applications of LLM in the telecom sector?
Answer:  The applications of Large Language Models (LLMs) in the telecom sector include use cases that leverage available data for industry-specific purposes, with potential benefits from addressing open research directions such as improving practical implementation knowledge and narrowing the performance gap between context-aware LLMs and traditional models.
Contexts: ['LLMs are being used to improve network performance in the telecom sector and Telecom companies are exploring how LLMs can automate tasks like network monitoring.']
Ground Truth: Large Language Models (LLMs) are revolutionizing the telecom industry by enhancing network performance, streamlining difficult tasks ,Task automation, security, and customer experiences.


In [None]:
# Template for running ragas without an OpenAI API key: hand evaluate() any
# LangChain LLM/chat-model instance and any LangChain Embeddings instance.
#
#   results = evaluate(metrics=[...], llm=langchain_llm, embeddings=langchain_embeddings)
#
from langchain_community.embeddings import OllamaEmbeddings
embeddings_ollama = OllamaEmbeddings(model="mistral")

from ragas import evaluate

# Score the dataset with the local chat model as judge and the Ollama
# embeddings defined above.
result = evaluate(
    dataset=dataset,
    metrics=[
        context_recall,
        faithfulness,
        context_precision,
        answer_relevancy
    ],
    llm=model,
    embeddings=embeddings_ollama,
    # Keep going when a metric fails; see the NaN note in the RAGAS section.
    raise_exceptions=False
)

#result

In [59]:
import pandas as pd

# Lift the column-width limit so long questions, answers and contexts are
# shown in full instead of being truncated.
pd.options.display.max_colwidth = None

# One row per evaluated question, with a column for every computed metric.
df = result.to_pandas()
df.head()

Unnamed: 0,question,answer,contexts,ground_truth,context_recall,faithfulness,context_precision,answer_relevancy
0,In a sentence what are the applications of LLM in the telecom sector?,"The applications of Large Language Models (LLMs) in the telecom sector include use cases that leverage available data for industry-specific purposes, with potential benefits from addressing open research directions such as improving practical implementation knowledge and narrowing the performance gap between context-aware LLMs and traditional models.",[LLMs are being used to improve network performance in the telecom sector and Telecom companies are exploring how LLMs can automate tasks like network monitoring.],"Large Language Models (LLMs) are revolutionizing the telecom industry by enhancing network performance, streamlining difficult tasks ,Task automation, security, and customer experiences.",0.666667,0.75,1.0,0.670276
