In [2]:
# !pip install redis

In [3]:
import langchain
from langchain.document_loaders import TextLoader, PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain.vectorstores import Redis
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI

import warnings
warnings.filterwarnings("ignore")

# Document Loading

In [4]:
# As of now I'm considering only one document for the RAG, so I'll be going with PyPDFLoader. 
# I'll try use directory loader in the backend if required

# loading a document using PyPDFLoader

data_path = r"C:\Users\KeesaramRupesh\Downloads\NBEATS paper.pdf"
data_loader = PyPDFLoader(data_path)

data = data_loader.load()

In [5]:
# len of the document

print(f"Total Pages Available :{len(data)}")

Total Pages Available :31


In [6]:
for page in data:
    print(page)

page_content='Published as a conference paper at ICLR 2020
N-BEATS: N EURAL BASIS EXPANSION ANALYSIS FOR
INTERPRETABLE TIME SERIES FORECASTING
Boris N. Oreshkin
Element AI
boris.oreshkin@gmail.comDmitri Carpov
Element AI
dmitri.carpov@elementai.com
Nicolas Chapados
Element AI
chapados@elementai.comYoshua Bengio
Mila
yoshua.bengio@mila.quebec
ABSTRACT
We focus on solving the univariate times series point forecasting problem using
deep learning. We propose a deep neural architecture based on backward and
forward residual links and a very deep stack of fully-connected layers. The ar-
chitecture has a number of desirable properties, being interpretable, applicable
without modiﬁcation to a wide array of target domains, and fast to train. We test
the proposed architecture on several well-known datasets, including M3, M4 and
TOURISM competition datasets containing time series from diverse domains. We
demonstrate state-of-the-art performance for two conﬁgurations of N-BEATS for
all the dataset

In [7]:
first_page = data[1] 

first_page.page_content

'Published as a conference paper at ICLR 2020\ninject a suitable inductive bias in the model to make its internal operations more interpretable, in the\nsense of extracting some explainable driving factors combining to produce a given forecast?\n1.1 S UMMARY OF CONTRIBUTIONS\nDeep Neural Architecture: To the best of our knowledge, this is the ﬁrst work to empirically\ndemonstrate that pure DL using no time-series speciﬁc components outperforms well-established\nstatistical approaches on M3, M4 and TOURISM datasets (on M4, by 11% over statistical benchmark,\nby 7% over the best statistical entry, and by 3% over the M4 competition winner). In our view, this\nprovides a long-missing proof of concept for the use of pure ML in TS forecasting and strengthens\nmotivation to continue advancing the research in this area.\nInterpretable DL for Time Series: In addition to accuracy beneﬁts, we also show that it is fea-\nsible to design an architecture with interpretable outputs that can be used by

# Cleaning The Text

In [8]:
import re
from langchain_core.documents import Document
from functools import reduce

In [9]:
def clean_text(text):
    """
    Cleans the input text by:
    - Removing special characters that are not relevant to English text.
    - Replacing consecutive periods, commas, exclamation marks, or question marks with a single one.
    - Collapsing multiple spaces into a single space.
    - Removing leading and trailing whitespace.
    """
    text = re.sub(r"[^\w\s\.\,\?\!\-']", "", text)
    
    text = re.sub(r"(\.{2,})", ".", text)  
    text = re.sub(r"(\,{2,})", ",", text)  
    text = re.sub(r"(\!{2,})", "!", text)  
    text = re.sub(r"(\?{2,})", "?", text)  
    
    text = re.sub(r"\s+", " ", text)
    
    return text.strip()


In [10]:
cleaned_docs = []

In [11]:
for page in data:
    cleaned_page =Document(page_content=clean_text(page.page_content),metadata=page.metadata)
    cleaned_docs.append(cleaned_page)

In [12]:

# ans = len(reduce(lambda x, y: x + y.page_content, data, ""))
# ans


In [13]:
print(f"Total Character Before Cleaning {len(reduce(lambda x, y: x + y.page_content, data, ''))}")
print(f"Total Character After Cleaning {len(reduce(lambda x, y: x + y.page_content, cleaned_docs, ''))}")

Total Character Before Cleaning 91599
Total Character After Cleaning 89917


* There are most advanced type of cleanings also we can perform using pdfplumber, pdfminer,pymupdf and etc. Due to time constraints I'm sticking to basic ones

# Splitting The Document Into Chunks

* Let's go with SemanticChunker instead of CharacterTextSplitter or Recursive text splitters and etc
* SemanticChunker Requires Embedding models to work - it works by analyzing the relevance of a particular sentence with the following sentences, until the provided threshold exceeds, it keeps on appending those sentences into a single chunk. 

In [14]:
from dotenv import load_dotenv

In [15]:
load_dotenv("devenv.env")

True

In [16]:
def semantic_text_splitter(text_splitter,documents):
    """
    Splits the input text into semantically meaningful chunks using LangChain's SemanticChunker.
    """
    semantic_chunks = text_splitter.split_documents(documents)
    return semantic_chunks

In [17]:
text_splitter = SemanticChunker(OpenAIEmbeddings())

In [18]:
%%time
semantic_chunks = semantic_text_splitter(text_splitter,cleaned_docs)

CPU times: total: 2.98 s
Wall time: 45.2 s


In [19]:
len(semantic_chunks)

81

In [20]:
for chunk in semantic_chunks[:5]:
    print(chunk.page_content,end="\n\n\n")

Published as a conference paper at ICLR 2020 N-BEATS N EURAL BASIS EXPANSION ANALYSIS FOR INTERPRETABLE TIME SERIES FORECASTING Boris N. Oreshkin Element AI boris.oreshkingmail.comDmitri Carpov Element AI dmitri.carpovelementai.com Nicolas Chapados Element AI chapadoselementai.comYoshua Bengio Mila yoshua.bengiomila.quebec ABSTRACT We focus on solving the univariate times series point forecasting problem using deep learning. We propose a deep neural architecture based on backward and forward residual links and a very deep stack of fully-connected layers.


The ar- chitecture has a number of desirable properties, being interpretable, applicable without modiﬁcation to a wide array of target domains, and fast to train. We test the proposed architecture on several well-known datasets, including M3, M4 and TOURISM competition datasets containing time series from diverse domains. We demonstrate state-of-the-art performance for two conﬁgurations of N-BEATS for all the datasets, improving fore

In [38]:
def setup_redis_vector_store(split_docs, redis_url="redis://localhost:6379", index_name="langchain-index"):
    """
    Sets up the Redis vector store using LangChain's Redis integration.
    """
    embeddings = OpenAIEmbeddings()
    vector_store = Redis.from_documents(
        split_docs,
        embedding=embeddings,
        redis_url=redis_url,
        index_name=index_name,
    )
    return vector_store

In [68]:
import os

REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379")
print(f"Connecting to Redis at: {REDIS_URL}")

Connecting to Redis at: redis://localhost:6379


In [69]:
import redis

redis_client = redis.from_url(REDIS_URL)
redis_client.ping()

True

In [83]:
%%time
redis_store = setup_redis_vector_store(semantic_chunks)

CPU times: total: 1.5 s
Wall time: 4.11 s


## Deleting Index From the Vector Store

In [80]:
redis_client = redis.from_url(REDIS_URL)

In [99]:
index_name = "doc_index"
index_keys = redis_client.keys(f"{index_name}:*")

In [100]:
index_keys[1:3]

[b'doc_index:8ff42cf01dde4fe3aa0ee0f3d35d07af',
 b'doc_index:c5f970d0b01b4554bae36f83a9e84d90']

In [87]:
# We can uncomment this when we want to delete

# if index_keys:
#     redis_client.delete(*index_keys)
#     print(f"Index '{index_name}' and all associated keys deleted successfully.")
# else:
#     print(f"No keys found for index '{index_name}'.")

In [101]:

def retrieve_from_vector_store(query, vector_store, top_k=5):
    """
    Retrieves the most similar documents from the Redis vector store for a given query.
    """
    
    # Retrieve the top_k most similar documents
    results = vector_store.similarity_search_with_score(query, k=top_k)
    
    return results

In [102]:
# Example usage
query = "What is the time series approach that is proposed ?"
top_k_results = retrieve_from_vector_store(query, redis_store, top_k=3)

In [103]:
# Display results
for result in top_k_results:
    print(f"Metadata : {result[0].metadata} \n Similarity Score : {result[-1]}",end="\n\n\n")

Metadata : {'id': 'doc:langchain-index:f056571c056f49889292947975e39d00', 'source': 'C:\\Users\\KeesaramRupesh\\Downloads\\NBEATS paper.pdf', 'page': '0'} 
 Similarity Score : 0.1718


Metadata : {'id': 'doc:langchain-index:9552189be45e489e9d14bc2ead15a646', 'source': 'C:\\Users\\KeesaramRupesh\\Downloads\\NBEATS paper.pdf', 'page': '1'} 
 Similarity Score : 0.1725


Metadata : {'id': 'doc:langchain-index:f137d5bc06cf4feeb5b6eb001bb5978c', 'source': 'C:\\Users\\KeesaramRupesh\\Downloads\\NBEATS paper.pdf', 'page': '3'} 
 Similarity Score : 0.1733




In [96]:
# Display results
for result in top_k_results:
    print(result[0].page_content,end="\n\n\n")

Published as a conference paper at ICLR 2020 N-BEATS N EURAL BASIS EXPANSION ANALYSIS FOR INTERPRETABLE TIME SERIES FORECASTING Boris N. Oreshkin Element AI boris.oreshkingmail.comDmitri Carpov Element AI dmitri.carpovelementai.com Nicolas Chapados Element AI chapadoselementai.comYoshua Bengio Mila yoshua.bengiomila.quebec ABSTRACT We focus on solving the univariate times series point forecasting problem using deep learning. We propose a deep neural architecture based on backward and forward residual links and a very deep stack of fully-connected layers.


Published as a conference paper at ICLR 2020 inject a suitable inductive bias in the model to make its internal operations more interpretable, in the sense of extracting some explainable driving factors combining to produce a given forecast? 1.1 S UMMARY OF CONTRIBUTIONS Deep Neural Architecture To the best of our knowledge, this is the ﬁrst work to empirically demonstrate that pure DL using no time-series speciﬁc components outperfo