In [1]:
import os
import openai
import sys

from dotenv import load_dotenv, find_dotenv

from glob import glob
from os.path import join

from tqdm import tqdm

from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

from langchain.llms import OpenAI

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, ConversationalRetrievalChain

from langchain.memory import ConversationBufferMemory



In [2]:
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.environ['OPENAI_API_KEY']

# Creating ChromaDB

In [3]:
files_dir = 'data'

# Use a função glob para listar todos os arquivos PDF no diretório
pdf_files = glob(f'{files_dir}/*.pdf')

docs=[]

for arquivo in tqdm(pdf_files):
    print(arquivo)

    loader = PyPDFLoader(arquivo)
    docs.extend(loader.load())

  0%|          | 0/34 [00:00<?, ?it/s]

data\35-GHz Barium Hexaferrite or PDMS Composite-Based Millimeter-Wave Circulators for 5G Applications.pdf


  3%|▎         | 1/34 [00:01<00:33,  1.03s/it]

data\A comprehensive survey on machine learning for networking - Evolution, applications and research opportunities.pdf


  6%|▌         | 2/34 [00:06<01:56,  3.63s/it]

data\A gated dilated causal convolution based encoder-decoder for network traffic forecasting.pdf


  9%|▉         | 3/34 [00:07<01:13,  2.37s/it]

data\A network traffic forecasting method based on SA optimized ARIMA-BP neural network.pdf


 12%|█▏        | 4/34 [00:11<01:30,  3.01s/it]

data\A Self-Adaptive Deep Learning-Based System for Anomaly Detection in 5G Networks.pdf


 15%|█▍        | 5/34 [00:14<01:23,  2.89s/it]

data\A Survey on Big Data for Network Traffic Monitoring and Analysis.pdf


 18%|█▊        | 6/34 [00:16<01:17,  2.78s/it]

data\Analyzing and modeling spatio-temporal dependence of cellular traffic at city scale.pdf


 21%|██        | 7/34 [00:21<01:37,  3.62s/it]

data\Application of Machine Learning in Wireless Networks - Key Techniques and Open Issues.pdf


 24%|██▎       | 8/34 [00:25<01:36,  3.71s/it]

data\Backhauling 5G Small Cells - A Radio Resource Management Perspective.pdf


 26%|██▋       | 9/34 [00:26<01:07,  2.70s/it]

data\Beyond Moran’s I - Testing for spatial dependence based on the spatial autoregressive model.pdf


 29%|██▉       | 10/34 [00:31<01:21,  3.38s/it]

data\Big data-driven optimization for mobile networks toward 5G.pdf


 32%|███▏      | 11/34 [00:32<01:04,  2.81s/it]

data\Characterizing the spatio-temporal inhomogeneity of mobile traffic in large-scale cellular data networks.pdf


 38%|███▊      | 13/34 [00:33<00:33,  1.60s/it]

data\Clustering to Enhance Network Traffic Forecasting.pdf
data\Deep learning in mobile and wireless networking - A survey.pdf


 41%|████      | 14/34 [00:39<00:59,  2.99s/it]

data\Deploying Virtual Network Functions With Non-Uniform Models in Tree-Structured Networks.pdf


 44%|████▍     | 15/34 [00:43<01:01,  3.22s/it]

data\Design of mm-Wave Slow-Wave-Coupled Coplanar Waveguides.pdf


 47%|████▋     | 16/34 [00:49<01:09,  3.84s/it]

data\Exploring Network-Wide Flow Data With Flowyager.pdf


 50%|█████     | 17/34 [01:30<04:19, 15.29s/it]

data\Generative-Adversarial-Network-Based wireless channel modeling - Challenges and opportunities.pdf


 53%|█████▎    | 18/34 [01:31<02:52, 10.76s/it]

data\Hierarchical, virtualised and distributed intelligence 5G architecture for low-latency and secure applications.pdf


 56%|█████▌    | 19/34 [01:31<01:55,  7.71s/it]

data\Improving traffic forecasting for 5G core network scalability - A machine learning approach.pdf


 59%|█████▉    | 20/34 [01:33<01:20,  5.77s/it]

data\Long-term mobile traffic forecasting using deep spatio-temporal neural networks.pdf


 62%|██████▏   | 21/34 [01:44<01:38,  7.59s/it]

data\Massive MIMO CSI Feedback Based on Generative Adversarial Network.pdf


 65%|██████▍   | 22/34 [01:45<01:05,  5.46s/it]

data\Orchestrating Virtualized Network Functions.pdf


 68%|██████▊   | 23/34 [01:49<00:55,  5.02s/it]

data\Representational power of Restricted Boltzmann Machines and Deep Belief Networks.pdf


 71%|███████   | 24/34 [01:49<00:36,  3.66s/it]

data\Resource Allocation in NFV- A Comprehensive Survey.pdf


 74%|███████▎  | 25/34 [01:55<00:38,  4.31s/it]

data\Resource Sharing Efficiency in Network_Slicing.pdf


 76%|███████▋  | 26/34 [02:06<00:49,  6.17s/it]

data\Spatial modeling of the traffic density in cellular networks.pdf


 82%|████████▏ | 28/34 [02:06<00:18,  3.14s/it]

data\Spatial Traffic Distribution In Cellular Networks.pdf
data\Spatio-temporal analysis and prediction of cellular traffic in metropolis.pdf


 85%|████████▌ | 29/34 [02:08<00:14,  2.80s/it]

data\Spatiotemporal modeling and prediction in cellular networks - A big data enabled deep learning approach.pdf


 88%|████████▊ | 30/34 [02:09<00:08,  2.12s/it]

data\TANGO - Traffic-Aware Network Planning and Green Operation.pdf


 91%|█████████ | 31/34 [02:09<00:04,  1.62s/it]

data\Time4 - Time for SDN.pdf


 94%|█████████▍| 32/34 [02:13<00:04,  2.30s/it]

data\Towards Supporting Intelligence in 5G-6G Core.pdf


 97%|█████████▋| 33/34 [02:14<00:01,  1.76s/it]

data\Understanding Mobile Traffic Patterns of Large Scale Cellular Towers in Urban Environment.pdf


100%|██████████| 34/34 [02:18<00:00,  4.09s/it]


In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,
    chunk_overlap = 150
)

splits = text_splitter.split_documents(docs)

In [5]:
#!rm -Rf /Users/patrick/Documents/Pessoal/paper-rag/chroma/

embedding = OpenAIEmbeddings()
persist_directory = 'chroma_full'

vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

  warn_deprecated(


# Model

In [6]:
llm = OpenAI(temperature=0)  # Model used

  warn_deprecated(


# Template

In [7]:
# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you can't make a answer with context, just say that you don't know, don't try to make up an answer. Give the document name from where the information is extracted
{context}
Question: {question}
Helpful Answer:"""

QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Compression retrieval method configuration

In [8]:
# Wrap our vectorstore
compressor = LLMChainExtractor.from_llm(llm)  # uses an LLMChain to extract from each document only the statements that are relevant to the query.

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever(search_type = "mmr")  # Retriever method using MMR retrieval to ensure variability in the answer
)

# Chain assembling

In [9]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=compression_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [10]:
question = "What is Nnwdaf_AnalyticsSubscriptionservice?"
result = qa_chain({"query": question})

print(result["result"])
print('\n')

for source in result['source_documents']:
    print ('Source: ', source.metadata['source'].split('/')[-1])
    print('Page: ', source.metadata['page'])
    print('\n')

  warn_deprecated(


 The Nnwdaf_AnalyticsSubscriptionservice is a service provided by the NWDAF (Network Data Analytics Function) that notifies the NF (Network Function) consumer instance of all analytics subscribed to the specific NWDAF service. This information is extracted from the document "NWDAF Services and Interfaces" which describes the different services provided by the NWDAF.


Source:  data\Towards Supporting Intelligence in 5G-6G Core.pdf
Page:  1


Source:  data\Resource Sharing Efficiency in Network_Slicing.pdf
Page:  5


Source:  data\Improving traffic forecasting for 5G core network scalability - A machine learning approach.pdf
Page:  5


