In [8]:
# !pip install black
# !pip install tiktoken
# !pip install pypdf
# !pip install chromadb
# !pip install langchain
# !pip install -U langchain-community

In [1]:
import os
import openai
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv

from glob import glob
from os.path import join

from tqdm import tqdm

from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

from langchain.llms import OpenAI

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

from langchain.memory import ConversationBufferMemory



In [2]:
# Load the variables of the local .env file (located by find_dotenv) into the
# process environment, then hand the OpenAI key to the client library.
_dotenv_path = find_dotenv()
load_dotenv(_dotenv_path)

openai.api_key = os.environ['OPENAI_API_KEY']

# Creating ChromaDB

In [7]:
import pickle

# Export the loaded documents to docs.pkl so later sessions can skip PDF parsing.
#
# This cell sits above the cell that defines `docs`. On a fresh kernel the
# original code opened docs.pkl in "wb" mode (truncating it) and only then
# failed with NameError, wiping the existing cache. Check for `docs` first so
# the file is only ever opened when there is something to write.
if "docs" in globals():
    with open("docs.pkl", "wb") as file:
        pickle.dump(docs, file)
else:
    print("`docs` is not defined yet; leaving docs.pkl untouched.")

In [3]:
import pickle

# Directory holding the source PDFs. Set the PAPER_RAG_INPUTS environment
# variable to point elsewhere; the default is the original hard-coded location.
files_dir = os.environ.get(
    'PAPER_RAG_INPUTS',
    '/home/patrick/Documents/Github/paper-rag/inputs',
)

# Pickled list of documents produced by a previous run of this cell.
docs_cache = "docs.pkl"

if os.path.exists(docs_cache):
    # Fast path: reuse the cached documents instead of re-parsing every PDF.
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # docs.pkl that you generated yourself.
    with open(docs_cache, "rb") as file:
        docs = pickle.load(file)
else:
    # No cache yet: list every PDF in the input directory and extract its
    # text with PyPDFLoader.
    pdf_files = sorted(glob(join(files_dir, '*.pdf')))
    if not pdf_files:
        raise FileNotFoundError(
            f"No {docs_cache} cache and no PDF files found in {files_dir!r}"
        )

    docs = []
    for arquivo in tqdm(pdf_files):
        loader = PyPDFLoader(arquivo)
        docs.extend(loader.load())

    # Persist the result so the next run takes the fast path above.
    with open(docs_cache, "wb") as file:
        pickle.dump(docs, file)


In [4]:
# Chunking: split the text of the loaded documents into smaller pieces.

# Sizes are measured with the splitter's length function, which counts
# characters by default (not tokens).
CHUNK_SIZE = 1500    # maximum chunk length
CHUNK_OVERLAP = 150  # overlap between consecutive chunks, to avoid cutting a topic abruptly

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

splits = text_splitter.split_documents(docs)

In [5]:
# Build the Chroma vector store, or reopen it if it was already persisted.
#
# Calling Chroma.from_documents on every run re-embeds all chunks (paid API
# calls) and appends them again to the persisted collection, so retrieval
# starts returning duplicates. To force a rebuild, e.g. after changing the
# documents or the chunking, delete `persist_directory` and re-run this cell.

embedding = OpenAIEmbeddings()
persist_directory = '../chroma_full/'

if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    # A store already exists on disk: open it without embedding anything.
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding,
    )
else:
    # First run: embed every chunk and persist the resulting collection.
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=persist_directory
    )

  warn_deprecated(


# Model

In [6]:
# Model used, both by the document compressor and by the QA chain.
llm = OpenAI(
    temperature=0,
)

  warn_deprecated(


# Template

In [7]:
# Prompt used by the QA chain; {context} and {question} are the template's
# two placeholders.
template = (
    "Use the following context to answer the question at the end. "
    "If you can't make a answer with context, just say that you don't know, "
    "don't try to make up an answer.\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Compression retrieval method configuration

In [8]:
# Compressor: uses an LLMChain to extract from each document only the
# statements that are relevant to the query.
compressor = LLMChainExtractor.from_llm(llm)

# Base retriever over the vector store, using MMR retrieval to ensure
# variability in the answer.
mmr_retriever = vectordb.as_retriever(search_type="mmr")

# Wrap the vector store retriever with the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=mmr_retriever,
    base_compressor=compressor,
)

# Memory

Ref: https://github.com/langchain-ai/langchain/issues/2256

In [9]:
# Conversation buffer keyed as "chat_history".
# NOTE(review): output_key presumably tells the memory which chain output to
# store (see the issue linked above) - confirm.
# The chain built below does not use it yet: its `memory=memory` argument is
# commented out.
memory = ConversationBufferMemory(
    output_key='result',
    memory_key="chat_history",
    return_messages=True,
)

# Chain assembling

In [10]:
# Assemble the retrieval QA chain: the compression retriever supplies the
# context, the custom prompt formats it, and the source documents are
# returned with every answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=compression_retriever,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    return_source_documents=True,
    # memory=memory  (conversation memory is currently disabled)
)

In [11]:
# Ask the chain a question, print the answer, then list the file name and
# page of each source document that backed it.
question = "What is 5G?"
result = qa_chain({"query": question})

answer_text = result["result"]
print(answer_text)
print('\n')

for source in result['source_documents']:
    file_name = source.metadata['source'].split('/')[-1]
    print('Source: ', file_name)
    page_number = source.metadata['page'] + 1  # stored page + 1 is shown
    print('Page: ', page_number)
    print('\n')

  warn_deprecated(


 5G is the upcoming fifth generation of cellular technology that is expected to be rolled out by 2020. It will be a significant change in mobile networking, offering support for a wider range of services and improved quality of service. 


Source:  Backhauling 5G Small Cells - A Radio Resource Management Perspective.pdf
Page:  0


Source:  Deep learning in mobile and wireless networking - A survey.pdf
Page:  53


Source:  A Self-Adaptive Deep Learning-Based System for Anomaly Detection in 5G Networks.pdf
Page:  0


Source:  Application of Machine Learning in Wireless Networks - Key Techniques and Open Issues.pdf
Page:  3




In [13]:
# Second query: same flow as before - answer first, then one
# "Source / Page" entry per supporting document.
question = "What are the challenges in 5G implementation?"
result = qa_chain({"query": question})

print(result["result"])
print('\n')

source_documents = result['source_documents']
for source in source_documents:
    metadata = source.metadata
    print('Source: ', metadata['source'].split('/')[-1])
    print('Page: ', metadata['page'] + 1)
    print('\n')

 Some of the challenges in 5G implementation include the need for low latency, increased connection density, and the ability to support a wide range of use cases and applications. There is also a need for improved resource management, networking, mobility management, and localization algorithms to meet the performance requirements of 5G. Additionally, the explosive growth of data and the integration of AI into the network present privacy and security risks that must be addressed. Finally, there is a need to properly address these challenges in order to maximize the network's performance and ensure high returns on investment for mobile network operators.


Source:  Backhauling 5G Small Cells - A Radio Resource Management Perspective.pdf
Page:  1


Source:  Towards Supporting Intelligence in 5G-6G Core.pdf
Page:  5


Source:  Application of Machine Learning in Wireless Networks - Key Techniques and Open Issues.pdf
Page:  1


Source:  Big data-driven optimization for mobile networks towar

In [14]:
# Reference abstract: the chain is asked to find a similar one in the indexed papers.
abstract = """Modeling and simulation of a cellular network typically assumes that the target area is divided into regular hexagonal cells and mobile stations (MSs) are uniformly scattered in each cell. This implies a statistically uniform distribution of traffic load over space, but in reality the spatial traffic distribution is highly non-uniform across different cells, which calls for actual spatial traffic models. In this article, we first present the analysis of traffic measurements collected from commercial cellular networks in China, and demonstrate that the spatial distribution of the traffic density (the traffic load per unit area) can be approximated by the log-normal or Weibull distribution depending on time and space. Then we propose a spatial traffic model which generates large-scale spatial traffic variations by a sum of sinusoids that captures the characteristics of log-normally distributed and spatially correlated cellular traffic. The proposed model can be directly used to generate realistic spatial traffic patterns for cellular network simulations, such as performance evaluations of network planning and load balancing."""

# The instruction and the abstract are sent to the chain as a single query.
question = "Find a abstract similar to the mentioned below: " + abstract
result = qa_chain({"query": question})

print(result["result"])
print('\n')

# For this query the retrieved text itself is also shown, so the match can be
# compared against the reference abstract.
for source in result['source_documents']:
    metadata = source.metadata
    print('Source: ', metadata['source'].split('/')[-1])
    print('Page: ', metadata['page'] + 1)
    print('Content: ', source.page_content)
    print('\n')

 "This study examines the spatial distribution of traffic load in cellular networks and proposes a model that captures the non-uniformity of this distribution. By analyzing traffic measurements from commercial networks, the authors demonstrate that the log-normal or Weibull distribution can be used to approximate the spatial traffic density. The proposed model, which incorporates sinusoidal variations, can be applied in cellular network simulations to generate realistic traffic patterns for performance evaluations."


Source:  Spatial modeling of the traffic density in cellular networks.pdf
Page:  1
Content:  - Modeling and simulation of a cellular network typically assumes that the target area is divided into regular hexagonal cells and mobile stations (MSs) are uniformly scattered in each cell.
- This implies a statistically uniform distribution of traffic load over space, but in reality the spatial traffic distribution is highly non-uniform across different cells, which calls for ac