In [53]:
import os

# imports for openai
import openai
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain import PromptTemplate

# imports for opensource llms
from langchain_community.llms import HuggingFaceEndpoint
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.llms import ctransformers

# Imports for API-key access: find_dotenv() locates a .env file and
# load_dotenv() loads its entries into the process environment, which is
# where the OPENAI_API_KEY lookup further down reads the key from.
# NOTE(review): the Hugging Face token used by the endpoint cell is presumably
# supplied the same way — confirm against the .env file.
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [59]:
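# Install the notebook's dependencies once. Use %pip (not !pip) so the packages
# are installed into the environment of the running kernel:
# %pip install python-dotenv faiss-cpu langchain langchain-community langchain-openai tiktoken openai PyPDF2 transformers huggingface_hub ctransformers llama-cpp-python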

In [33]:
# Hand the key read from the environment to the openai module.
# os.getenv returns None when OPENAI_API_KEY is unset, so a missing key is not
# reported here. NOTE(review): presumably it only surfaces later, when the
# embeddings are first requested — confirm.
openai.api_key = os.getenv("OPENAI_API_KEY")

## Collecting the text

In [34]:
def get_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source; this
            notebook passes a file path.

    Returns:
        str: The texts of all pages concatenated in page order, with no
        separator inserted between pages. A page whose ``extract_text()``
        yields ``None`` or an empty string contributes nothing, and a
        document without pages gives ``''``.
    """
    pdf_doc = PyPDF2.PdfReader(pdf_file)

    # Iterate the pages directly instead of enumerating and re-indexing them,
    # and build the result with a single join rather than repeated `+=`.
    # `or ''` keeps a page without extractable text from breaking the join.
    return ''.join(page.extract_text() or '' for page in pdf_doc.pages)
    

In [35]:
# Load the full text of the source document. The path is relative, so the PDF
# is looked up relative to the kernel's current working directory.
text = get_text_from_pdf('machine_learning.pdf')

## Splitting the text

In [36]:
def get_text_chunks(text, chunk_size=2000, chunk_overlap=100):
    """Split ``text`` into overlapping chunks for embedding.

    Args:
        text: The string to split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 2000, the value this notebook
            previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 100, the value this notebook
            previously hard-coded.

    Returns:
        The result of ``split_text(text)`` on the configured splitter
        (indexed like a list of strings elsewhere in this notebook).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [37]:
# Split the extracted PDF text into the chunks that are embedded and indexed below.
chunked_text = get_text_chunks(text)

In [38]:
# Spot-check a single chunk. NOTE(review): index 7 looks like an arbitrary
# sample; it fails if the document yields fewer than 8 chunks.
chunked_text[7]

'insurance customers are likely to file their claims.  \nUnsupervised Learning  \nThis method of ML finds its application in areas were data has no historical labels. Here, the system will not be provided with the \n"right answer" and the algorithm should identify what is being shown. The main aim here is to analyze the data and identify a  pattern and \nstructure within the available data set. Transactional data serves as a good source of data set for unsupervised learning.  \nFor instance, this type of learning identifies customer segments with similar attributes and then lets the business to treat them \nsimilarly in marketing campaigns. Similarly, it can also identify attributes that differentiate custome r segments from one another. Either \nways, it is about identifying a similar structure in the available data set. Besides, these algorithms can also identify outl iers in the available \ndata sets.  \nSome of the widely used techniques of unsupervised learning ar e - \n\uf0b7 k-m

## Vector database

In [39]:
# Embedding model used to vectorise the chunks, created with default settings.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment — confirm.
openai_embedding = OpenAIEmbeddings()

In [40]:
# Build a FAISS vector store from the text chunks, using openai_embedding to
# embed them.
vectorstore = FAISS.from_texts(texts=chunked_text, embedding=openai_embedding)

In [41]:
# Expose the store as a retriever. search_kwargs {"k": 1} is passed through;
# NOTE(review): presumably this limits each query to the single closest chunk,
# so answers only ever see one chunk of context — consider a larger k.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

## Query

In [61]:
# model_name = "OpenSourceModels/llama-2-7b-chat.Q5_K_M.gguf"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


# model_name = ctransformers(model='OpenSourceModels/llama-2-7b-chat.Q5_K_M.gguf')

# from langchain_community.llms import LlamaCpp
# llm = LlamaCpp(
#     model_path="OpenSourceModels/llama-2-7b-chat.Q5_K_M.gguf",
#     f16_kv=True,  # MUST be set to True; otherwise you will run into problems after a couple of calls
#     verbose=True,
# )

In [65]:
# Alternative backend, kept for reference:
# llm = OpenAI()

# LLM used by the chains below: the Mistral-7B-Instruct-v0.2 repository,
# accessed through HuggingFaceEndpoint with a sampling temperature of 0.6.
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.6)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\sande\.cache\huggingface\token
Login successful


In [66]:
# Single-turn question answering: combine the LLM with the retriever.
# NOTE(review): chain_type='stuff' presumably places the retrieved text
# directly into one prompt — confirm against the LangChain docs.
chain = RetrievalQA.from_chain_type(llm=llm,chain_type='stuff',retriever=retriever)

In [67]:
# Ask one question through the RetrievalQA chain. This chain takes the
# question under the 'query' key; the answer is read from result['result'] below.
question = "what is the difference between machine learning and deep learning"
result = chain.invoke({'query':question})

In [68]:
# Show only the generated answer text from the chain's result mapping.
print(result['result'])

 Machine learning is a subset of artificial intelligence that uses algorithms and statistical models to enable computers to learn from data without being explicitly programmed. Deep learning is a subset of machine learning that uses artificial neural networks with multiple layers to model high-level concepts. In other words, deep learning is a more complex form of machine learning that can automatically learn and extract features from data, while machine learning requires features to be manually engineered.


## Memory

In [69]:
from langchain.memory import ConversationBufferWindowMemory
from langchain.chains import ConversationalRetrievalChain

In [70]:
# Conversation memory. The history is exposed under "chat_history" as message
# objects (return_messages=True), and the chain output stored back into memory
# is the one under "answer".
# NOTE(review): k=5 presumably bounds the window to the last 5 exchanges — confirm.
memory = ConversationBufferWindowMemory(
    memory_key="chat_history", output_key="answer", return_messages=True, k=5
)

# Rebinds `chain`: from here on it is the conversational, memory-backed chain
# rather than the single-turn RetrievalQA chain created earlier.
chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

In [71]:
# First turn of the conversation; the conversational chain takes the question
# under the "question" key.
# NOTE(review): this looks like a deliberately off-topic question to see how
# the chain responds when the PDF has no relevant content.
question = "where is nepal located"
chain.invoke({"question":question})

{'question': 'where is nepal located',
 'chat_history': [],
 'answer': ' Nepal is a country located in South Asia, between China and India. It is not mentioned in the context, but the context does discuss several companies that are using machine learning technology in various ways to improve their businesses.'}

In [72]:
# Second turn: an on-topic question. The previous exchange is supplied to the
# chain through the memory's chat history.
question = "what is supervised learning"
chain.invoke({"question":question})

{'question': 'what is supervised learning',
 'chat_history': [HumanMessage(content='where is nepal located'),
  AIMessage(content=' Nepal is a country located in South Asia, between China and India. It is not mentioned in the context, but the context does discuss several companies that are using machine learning technology in various ways to improve their businesses.')],
 'answer': '  Supervised learning is a type of machine learning method where the algorithm is trained using labeled examples. This means that the algorithm is provided with input data along with the correct output, allowing it to learn the relationship between the input and output. Supervised learning is commonly used in scenarios where historical data is used to predict events that are likely to occur in the future. Some common supervised learning techniques include regression, classification, gradient boosting, and prediction.'}

In [73]:
# Follow-up turn: "it" is not defined in the question itself, so a sensible
# answer depends on the chat history carried by the memory.
question = "why is it important"
chain.invoke({"question":question})

{'question': 'why is it important',
 'chat_history': [HumanMessage(content='where is nepal located'),
  AIMessage(content=' Nepal is a country located in South Asia, between China and India. It is not mentioned in the context, but the context does discuss several companies that are using machine learning technology in various ways to improve their businesses.'),
  HumanMessage(content='what is supervised learning'),
  AIMessage(content='  Supervised learning is a type of machine learning method where the algorithm is trained using labeled examples. This means that the algorithm is provided with input data along with the correct output, allowing it to learn the relationship between the input and output. Supervised learning is commonly used in scenarios where historical data is used to predict events that are likely to occur in the future. Some common supervised learning techniques include regression, classification, gradient boosting, and prediction.')],
 'answer': ' Supervised learning

In [None]:
# Another follow-up that relies on the chat history to resolve "it".
question = "how is it different from unsupervised learning"
chain.invoke({"question":question})

In [None]:
# Small two-entry example dictionary.
# NOTE(review): appears to be scratch code unrelated to the QA pipeline above.
my_dict = dict(
    value1='this is the value 1',
    value2='this is the value 2',
)

In [None]:
# Print each value stored in my_dict on its own line, in insertion order.
for i in my_dict.values():
    print(i)