In [9]:
# Loading the packages
from langchain.vectorstores import Qdrant
from langchain_openai import OpenAIEmbeddings
from qdrant_client import QdrantClient
import os
from qdrant_client.http import models
#from langchain.document_loaders import PyPDFLoader


In [2]:
# Read the API credentials and the Qdrant endpoint from environment variables.
import os
from dotenv import find_dotenv, load_dotenv

# Locate a .env file (if any) and load its entries into the process environment.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Indexing os.environ (rather than .get) raises KeyError right here if a key is missing.
env = os.environ
openai_api_key = env["OPENAI_API_KEY"]
qdrant_api_key = env["QDRANT_API_KEY"]
qdrant_url = env["QDRANT_URL"]


In [3]:
# Create the Qdrant client from the endpoint URL and API key loaded from the environment.
qdrant_client = QdrantClient(api_key=qdrant_api_key, url=qdrant_url)


In [7]:
# In Qdrant we first create a cluster and then create collections inside it;
# a collection plays roughly the role of a table in a vector database.
QDRANT_COLLECTION_NAME = "my-collection-Semantic-1"

# Size of the vectors stored in the collection. It has to match the dimensionality
# of the embeddings added later (assumed to be 1536 for the default
# OpenAIEmbeddings model -- TODO confirm if the embedding model is changed).
EMBEDDING_DIM = 1536

# Creating a collection that already exists is rejected by Qdrant, which would make
# this cell fail on every re-run. Only create the collection when it is missing.
existing_collections = {c.name for c in qdrant_client.get_collections().collections}
if QDRANT_COLLECTION_NAME not in existing_collections:
    qdrant_client.create_collection(
        collection_name=QDRANT_COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIM,
            distance=models.Distance.COSINE,
        ),
    )


True

In [64]:
import PyPDF2

# Read every page of the PDF, echo it, and collect the text for chunking.
page_texts = []
with open('Introduction_to_transformers_NLP.pdf', 'rb') as file:
    pdf_reader = PyPDF2.PdfReader(file)
    for page_number, page in enumerate(pdf_reader.pages):
        # Extract once per page and reuse the result for both printing and
        # accumulation; text extraction is the expensive step of this loop.
        page_text = page.extract_text()
        print(f"Page {page_number + 1}:")
        print(page_text)
        page_texts.append(page_text)

# Full document text: each page's text followed by a single space.
# `x` is the name the later cells (length check, chunking) read from.
x = "".join(text + " " for text in page_texts)


Page 1:
Introduction to Transformers
Introduction to Transformers: an NLP Perspective
Tong Xiao xiaotong@mail.neu.edu.cn
NLP Lab., Northeastern University, Shenyang, China
NiuTrans Research, Shenyang, China
Jingbo Zhu zhujingbo@mail.neu.edu.cn
NLP Lab., Northeastern University, Shenyang, China
NiuTrans Research, Shenyang, China
Abstract
Transformers have dominated empirical machine learning models of natural language pro-
cessing. In this paper, we introduce basic concepts of Transformers and present key tech-
niques that form the recent advances of these models. This includes a description of the
standard Transformer architecture, a series of model refinements, and common applica-
tions. Given that Transformers and related deep learning techniques might be evolving in
ways we have never seen, we cannot dive into all the model details or cover all the tech-
nical areas. Instead, we focus on just those concepts that are helpful for gaining a good
understanding of Transformers and their 

The character text splitter and the recursive character text splitter only take the physical position of the words into account. They implicitly assume that a paragraph contains related information, but that is not always the case. Chunk size is also an annoying hyperparameter when doing RAG, and it feels naive to tune it to a single global constant. Here we use embedding-based (semantic) chunking instead.


In [66]:
# Total number of characters in the text extracted from the PDF.
len(x)


301533

In [67]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# The semantic chunker is driven by an embedding model; use the OpenAI
# embeddings client with its default settings.
splitter_embeddings = OpenAIEmbeddings()
text_splitter = SemanticChunker(splitter_embeddings)


In [68]:
# Split the full PDF text into chunk documents with the semantic splitter.
source_texts = [x]
docs = text_splitter.create_documents(source_texts)

# Sanity check: show the text of the first chunk.
first_chunk = docs[0]
print(first_chunk.page_content)


Introduction to Transformers
Introduction to Transformers: an NLP Perspective
Tong Xiao xiaotong@mail.neu.edu.cn
NLP Lab., Northeastern University, Shenyang, China
NiuTrans Research, Shenyang, China
Jingbo Zhu zhujingbo@mail.neu.edu.cn
NLP Lab., Northeastern University, Shenyang, China
NiuTrans Research, Shenyang, China
Abstract
Transformers have dominated empirical machine learning models of natural language pro-
cessing. In this paper, we introduce basic concepts of Transformers and present key tech-
niques that form the recent advances of these models. This includes a description of the
standard Transformer architecture, a series of model refinements, and common applica-
tions. Given that Transformers and related deep learning techniques might be evolving in
ways we have never seen, we cannot dive into all the model details or cover all the tech-
nical areas. Instead, we focus on just those concepts that are helpful for gaining a good
understanding of Transformers and their variants

In [72]:
# Wrap the existing Qdrant collection in a LangChain vector store.
# The embeddings client is handed to the store together with the collection name.
embeddings = OpenAIEmbeddings()

vector_store = Qdrant(
    embeddings=embeddings,
    collection_name=QDRANT_COLLECTION_NAME,
    client=qdrant_client,
)


In [74]:
# Step 4: Add the chunk documents to the Qdrant vector store.
# The call returns a list of ID strings, shown as this cell's output.
inserted_ids = vector_store.add_documents(docs)
inserted_ids


['0242b708c3cb4856be71f50fa4c6f9f2',
 '90398af5f0a846c08b879afd164a88a7',
 'cf39d4e3e75b4e488267d90774393aba',
 '6cdd16849db841098d73b4976399e00a',
 '668c9026e8da49b29b118063145e06d1',
 '91200b5c8966450ea9ee01586379a0b0',
 '9c4eab3bed7d4ce58ec00b4aece2ffd5',
 '256d3dd781fb464c8639615643c528ef',
 '47504352330b435b92862c52177c3454',
 '97c782bc22ff4e48ac773760ae89e574',
 '2268295eb12d4cfc8c4bc6f093d296b5',
 '7bdd08edf3a343fe8e733ae7f9f4af29',
 'c5561038f1a242c58bf8c4c3765bc1db',
 'ca958c625d46402f96092623c9a0b48a',
 '0c4c60407c5f4255be91435a3d4d968d',
 '451cafe41031457fa08a9eeea1e7a032',
 '0385b02d29064691ae0995b74671d57f',
 '45948be3ec004bd1b28df7f6b0ae14c6',
 '80eb72daa3ac41b9a8cdf453e2df5630',
 '08a382d40e6c4873b8daebc020384eab',
 'd529456548b94b04b2b80aea888b1522',
 '183058fc07a949f8b49b583fa8eb96b5',
 '937f2c8774dc4f309b5baa7cd6a85a7d',
 'd83d6272f9f74674981ad0d0e567f74f',
 'c3400e60b25447a3afb5079292d838fe',
 '44a3899ab1504bb7b074383f5986c419',
 '36447ae1419c4b46b104a44f2e4fd440',
 