### Using LangChain

In [1]:
# !pip install langchain sentence-transformers langchain-community pypdf chromadb -q

In [2]:
# !pip install faiss-cpu -q

In [3]:
from platform import python_version
python_version()

'3.11.0'

In [4]:
import glob
# from google.colab import drive
# drive.mount('/content/drive')

In [8]:
# pdf_files = glob.glob('/content/drive/MyDrive/Colab Notebooks/data/*')
pdf_files = glob.glob('data/*')
pdf_files[0]

'data\\FinancialAccounting_par1.pdf'

In [6]:
import glob
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
# # List of PDF file paths
# pdf_files = [
#     'path/to/file1.pdf',
#     'path/to/file2.pdf',
#     'path/to/file3.pdf',
#     # Add more PDF file paths here
# ]

# Load and process the PDF files
docs = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(data)
    docs.extend(texts)

# Now you can work with the loaded documents
for doc in docs:
    print(doc.page_content)
    break

Volume 1


In [10]:
len(docs)

7495

In [11]:
docs[:10]

[Document(page_content='Volume 1', metadata={'source': 'data\\FinancialAccounting_par1.pdf', 'page': 0}),
 Document(page_content='Principles of Accounting, Volume 1: Financial Accounting       SENIOR CONTRIBUTING AUTHORS MITCHELL FRANKLIN, LE MOYNE COLLEGE (FINANCIAL ACCOUNTING) PATTY GRAYBEAL, UNIVERSITY OF MICHIGAN-DEARBORN (MANAGERIAL ACCOUNTING) DIXON COOPER, OUACHITA BAPTIST UNIVERSITY', metadata={'source': 'data\\FinancialAccounting_par1.pdf', 'page': 2}),
 Document(page_content='OpenStax  \nRice University \n6100 Main Street MS -375 \nHouston, Texas 77005 \nTo learn more about OpenStax, visit https ://openstax.org.  \nIndividual print copies and bulk orders can be purchased through our website.  \n©2019 Rice University.  Textbook content produced by Ope nStax is licensed under a Creative Commons \nAttribution Non -Commercial ShareAlike 4.0 International License  (CC BY -NC-SA 4.0) . Under this license, any user \nof this textbook or the textbook contents herein can share, remix,

In [12]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False)

In [14]:
from langchain.vectorstores import Chroma, FAISS

In [15]:
vectordb = FAISS.from_documents(docs,embeddings)
vectordb.save_local('vectordb')

In [18]:
from transformers import GPT2Tokenizer, GPT2Model, pipeline
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')

In [17]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

In [20]:
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
local_llm = HuggingFacePipeline(pipeline=pipe)
local_llm

The model 'GPT2Model' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'Musicg

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x000002CC06616090>)

In [25]:
!pip install langchain==0.0.326

Collecting langchain==0.0.326
  Downloading langchain-0.0.326-py3-none-any.whl.metadata (16 kB)
Collecting anyio<4.0 (from langchain==0.0.326)
  Downloading anyio-3.7.1-py3-none-any.whl.metadata (4.7 kB)
Downloading langchain-0.0.326-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.9 MB 3.5 MB/s eta 0:00:01
   -------------------------------------- - 1.8/1.9 MB 14.7 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 12.4 MB/s eta 0:00:00
Downloading anyio-3.7.1-py3-none-any.whl (80 kB)
   ---------------------------------------- 0.0/80.9 kB ? eta -:--:--
   ---------------------------------------- 80.9/80.9 kB 4.4 MB/s eta 0:00:00
Installing collected packages: anyio, langchain
  Attempting uninstall: anyio
    Found existing installation: anyio 4.3.0
    Uninstalling anyio-4.3.0:
      Successfully uni

In [26]:
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA

In [28]:
from langchain.chains import LLMChain

In [27]:
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k":2})
# create a chain to answer questions 
qa = RetrievalQA.from_chain_type(
    llm=local_llm, chain_type="stuff", retriever=retriever, return_source_documents=True)

ValidationError: 1 validation error for LLMChain
llm
  Can't instantiate abstract class BaseLanguageModel with abstract methods agenerate_prompt, apredict, apredict_messages, generate_prompt, invoke, predict, predict_messages (type=type_error)

In [None]:
query = "what is the total number of AI publications?"
result = qa({"query": query})

In [None]:
break

In [None]:
vectordb = Chroma.from_documents(docs,embeddings,persist_directory="./chroma_db")

In [None]:
retriever = vectordb.as_retriever()

In [None]:
docs[0:10]

In [None]:
from transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)


In [None]:
len(docs)

In [None]:
docs[-40:]