In [None]:
!pip install chromadb==0.4.18 langchain==0.0.349 openai==1.3.8 tiktoken==0.5.2 jq youtube-transcript-api pytube unstructured pypdfium2

In [None]:
#9.3 RAG using LangChain

import os

# Read the OpenAI key from the environment so it never has to be pasted into
# (and saved with) the notebook. Falls back to the original empty placeholder
# when OPENAI_API_KEY is not set.
api_key = os.environ.get("OPENAI_API_KEY", "")

import uuid

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import JSONLoader,YoutubeLoader,PyPDFium2Loader,TextLoader
from langchain.document_loaders.csv_loader import CSVLoader

In [None]:
#Text File
# Load the plain-text file and split it into chunks of at most ~1000 characters.
loader = TextLoader("dummy_data.txt")
data = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

# Give every index its own collection: without collection_name, each
# Chroma.from_documents call in this kernel writes to the same default
# in-memory collection, so the retriever would also return chunks indexed
# by other cells (or by earlier runs of this cell).
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    collection_name=f"text-{uuid.uuid4().hex}",
)

# "stuff" puts all retrieved chunks into a single prompt for the LLM.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

In [None]:
# Send the question to the `qa` RetrievalQA chain currently bound in the kernel;
# the value returned by run() is displayed as the cell output.
qa.run('Summarize this document')

' This document describes the events of a battle between two opposing forces, one of which is ultimately defeated. The story begins with the start of a conspiracy-filled reverie and ends with the arrival of formidable reinforcements that help to pull back the losing side from the brink of defeat.'

In [None]:
#JSON File
# jq_schema '.[].name' selects the `name` field of every element of the
# top-level JSON array; each selected value becomes one document.
loader = JSONLoader(
    file_path='One Piece json.json',
    text_content=False,
    jq_schema='.[].name')

data = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

# Use a dedicated collection: without collection_name, every
# Chroma.from_documents call in this kernel shares one default in-memory
# collection, so answers would mix in chunks indexed by other cells.
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    collection_name=f"json-{uuid.uuid4().hex}",
)

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())
qa.run('What are the most repeated word in names?')

' The most repeated word in the names is "O."'

In [None]:
from langchain.document_loaders import WebBaseLoader

#Web page
loader = WebBaseLoader('https://medium.com/data-science-in-your-pocket/best-prompt-engineering-hacks-to-know-549aaf57e55b')
data = loader.load()

# NOTE(review): the "stuff" chain concatenates all retrieved chunks into one
# prompt; with chunks of up to ~6000 characters this may exceed the model's
# context window - confirm before querying this chain.
text_splitter = CharacterTextSplitter(chunk_size=6000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

# Use a dedicated collection: without collection_name, every
# Chroma.from_documents call in this kernel shares one default in-memory
# collection, so answers would mix in chunks indexed by other cells.
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    collection_name=f"web-{uuid.uuid4().hex}",
)

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())



In [None]:
# Youtube Loader

# Request the transcript in English or Indonesian (in that order of
# preference) and ask for an English translation of it.
loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=D0S2YOVyFUE",
    add_video_info=True,
    language=["en", "id"],
    translation="en",
)
data = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)

embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

# Use a dedicated collection: without collection_name, every
# Chroma.from_documents call in this kernel shares one default in-memory
# collection, so answers about the video would mix in chunks indexed by
# other cells.
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    collection_name=f"youtube-{uuid.uuid4().hex}",
)

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

In [None]:
# Send the question to the `qa` RetrievalQA chain currently bound in the kernel;
# the value returned by run() is displayed as the cell output.
qa.run('What this video about?')

'\nThis video appears to be about a conspiracy-filled reverie, with Pudding showing determination, Hawkings the Magician appearing, and a crisis involving Gear Four and Unstoppable Donuts.'

In [None]:
#PDF File

loader = PyPDFium2Loader("complain.pdf")
data = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

# Use a dedicated collection: without collection_name, every
# Chroma.from_documents call in this kernel shares one default in-memory
# collection, so answers about the PDF would mix in chunks indexed by
# other cells.
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    collection_name=f"pdf-{uuid.uuid4().hex}",
)

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

In [None]:
# Send the question to the `qa` RetrievalQA chain currently bound in the kernel;
# the value returned by run() is displayed as the cell output.
qa.run('explain the attached file')

' The attached file is Evidence202308120816104812734.jpeg - a digital image file that is part of a cyber crime incident report. The report includes details about a fake/impersonating profile on Facebook, and the file is likely a screenshot of the profile.'

In [None]:
#CSV file

loader = CSVLoader(file_path='ONE PIECE.csv')
data = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

# Use a dedicated collection: without collection_name, every
# Chroma.from_documents call in this kernel shares one default in-memory
# collection, so answers about the CSV would mix in chunks indexed by
# other cells.
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    collection_name=f"csv-{uuid.uuid4().hex}",
)

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

In [None]:
# Send the question to the `qa` RetrievalQA chain currently bound in the kernel;
# the value returned by run() is displayed as the cell output.
qa.run('Which episode appears the most interesting?')

' Episode 898, The Headliner! Hawkings the Magician Appears!, with an average rating of 8.4 and a trend of 3.'

In [None]:
#9.4 Multi-document RAG

from langchain.agents.agent_types import AgentType
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.agents import AgentExecutor, initialize_agent
from langchain.memory import ConversationBufferMemory
from langchain.tools import BaseTool, StructuredTool, tool

import os

# Read the OpenAI key from the environment so it never has to be pasted into
# (and saved with) the notebook. Falls back to the original empty placeholder
# when OPENAI_API_KEY is not set.
api_key = os.environ.get('OPENAI_API_KEY', '')

# Single LLM instance shared by both retrieval chains and the agent below.
llm = OpenAI(openai_api_key=api_key)

def retriever_qa_creation(file_name, chunk_size=100, chunk_overlap=0):
    """Build a RetrievalQA chain over a single text file.

    The file is loaded with TextLoader, split with CharacterTextSplitter,
    embedded with OpenAIEmbeddings and indexed in its own Chroma collection.

    Args:
        file_name: Path of the text file to index.
        chunk_size: Maximum chunk size passed to CharacterTextSplitter
            (defaults to the original hard-coded value of 100).
        chunk_overlap: Overlap between consecutive chunks (default 0).

    Returns:
        A RetrievalQA "stuff" chain that uses the module-level `llm` and
        retrieves only from the chunks of `file_name`.
    """
    import uuid

    documents = TextLoader(file_name).load()
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = splitter.split_documents(documents)
    embeddings = OpenAIEmbeddings(openai_api_key=api_key)

    # A unique collection per call keeps each file's index isolated. Without
    # collection_name every call writes to the same default in-memory
    # collection, so the chains for different files would all retrieve from
    # the union of everything indexed so far.
    db = Chroma.from_documents(
        chunks,
        embeddings,
        collection_name=f"rag-{uuid.uuid4().hex}",
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=db.as_retriever(),
    )

# One retrieval chain per source file, built in order; each backs one of the
# agent tools defined below.
retriever_qa1, retriever_qa2 = (
    retriever_qa_creation(source_file)
    for source_file in ('sample1.txt', 'sample2.txt')
)

@tool
def medium_tips(query: str)->str:
    """search to extract tips and tricks to write blogs on Medium"""
    # The docstring above is kept verbatim: it is the description attached to
    # this tool, so rewording it could change when the agent selects the tool.
    answer = retriever_qa1.run(query)
    return answer
@tool
def blog_tips(query: str)->str:
    """explains the pros and cons of writing blogs as a data scientist"""
    # The docstring above is kept verbatim: it is the description attached to
    # this tool, so rewording it could change when the agent selects the tool.
    answer = retriever_qa2.run(query)
    return answer

# Tools the agent can choose between, one per indexed document.
tools = [medium_tips, blog_tips]

# Conversation history is stored under the "chat_history" key and passed to
# the agent through `memory`.
memory = ConversationBufferMemory(memory_key="chat_history")
agent_chain = initialize_agent(
    tools,
    llm,
    agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
)

# Ask the two demo questions in order, printing each answer.
for question in ('Why should Data Scientists blog?', 'How to get started on Medium?'):
    print(agent_chain.run({'input': question}))


In [None]:
#9.5 Recommendation System using RAG

import numpy as np
import pandas as pd

num_users = 1000
num_items = 20
SEED = 42  # fixed seed so the synthetic data (and the CSV written below) is reproducible

user_ids = np.arange(1, num_users + 1)
item_ids = np.arange(1, num_items + 1)

# Draw num_users * 10 random (user, item) pairs from a seeded generator.
rng = np.random.default_rng(SEED)
data = {
    'user_id': rng.choice(user_ids, size=num_users * 10),
    'item_id': rng.choice(item_ids, size=num_users * 10),
}

# Long-format interactions: one row per distinct (user_id, item_id) pair.
interactions = pd.DataFrame(data).drop_duplicates()

# Display the first few rows of the generated data
print(interactions.head())

# One row per user, holding the list of items that user interacted with.
df = interactions.groupby(['user_id'])['item_id'].agg(list).reset_index()


def _to_item_vector(seen_items):
    """Return a num_items-long list: position i holds i+1 if seen, else 0."""
    seen = set(seen_items)
    return [item if item in seen else 0 for item in range(1, num_items + 1)]


# Use num_items (not a literal 20) so the vector length follows the config above.
df['item_id'] = df['item_id'].apply(_to_item_vector)

df.to_csv('dummy_data.csv', index=False)

from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader

import uuid

# Each CSV row (one user with its item vector) is loaded as one document.
loader = CSVLoader(file_path="dummy_data.csv")
data = loader.load()

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(data)
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
llm = OpenAI(openai_api_key=api_key)

# Use a dedicated collection: without collection_name, every
# Chroma.from_documents call in this kernel shares one default in-memory
# collection, so the recommender would also retrieve chunks indexed by
# other sections (or by earlier runs of this cell).
docsearch = Chroma.from_documents(
    texts,
    embeddings,
    collection_name=f"recsys-{uuid.uuid4().hex}",
)

qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever())

# The answer is split on '.' so it is displayed as a list of sentences.
qa.run('Suggest 2 articles to user-id 78 using given data which it has not seen.\
 Follow this approach 1: Find similar Users and 2: sugest new articles from similar users.\
  Also give a reason for suggestion').split('.')



In [None]:
#9.6 Vector Databases
!pip install sentence_transformers

import chromadb
from chromadb.utils import embedding_functions

chroma_client = chromadb.Client()

sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")

collection = chroma_client.create_collection(name="dummy_data")

with open('dummy_data.txt', 'r') as file:
    data = file.read().replace('\n','.').split('.')

collection.add(
  documents = data,
  embeddings = sentence_transformer_ef(data),
  ids = ['id'+str(x) for x in range(len(data))]
)

results = collection.query(
    query_texts=["Where did Alexandar go?"],
    n_results=5
)