# Language Model Pipeline for Transcript Processing

This pipeline loads and processes a collection of transcripts, splits them into chunks, and embeds them using OpenAI embeddings. The embeddings are then stored in a Pinecone vector store for efficient querying and retrieval.

# 1. Install Dependencies
Install the necessary libraries and packages for the pipeline.

In [None]:
# Notebook shell commands: install the third-party packages that the cells
# below import. `-q` runs pip in quiet mode to keep the cell output short.
!pip install langchain -q
!pip install openai -q
# NOTE(review): unstructured, detectron2 and poppler-utils are presumably
# needed by DirectoryLoader to parse the PDF sources - confirm.
!pip install unstructured -q
!pip install unstructured[local-inference] -q
# detectron2 is installed from the v0.6 tag of its GitHub repository.
!pip install detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2 -q
!pip install poppler-utils -q
!pip install tiktoken -q
!pip install pinecone-client -q
# No `-q` here, so pip's full output is shown for this install.
!pip install google-cloud-secret-manager


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.5/46.5 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.5/471.5 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... 

# 2. Import Modules
Import the required modules and classes for the pipeline.

In [None]:
import os
import openai
import pinecone
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA
from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive


  from tqdm.autonotebook import tqdm



Mount your Google Drive. The mount point `/content/gdrive` is the root of your Google Drive and contains the `MyDrive` and `Shareddrives` directories.

In [None]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


This function loads a secret from Google Cloud Secret Manager.

In [None]:
def load_secrets(secrets_name, project_id):
    """Fetch the latest version of a secret from Google Cloud Secret Manager.

    Args:
        secrets_name: Name of the secret inside the project.
        project_id: Google Cloud project that holds the secret.

    Returns:
        The secret payload decoded as a UTF-8 string.
    """
    # Authenticate the Colab user before the client is created.
    auth.authenticate_user()
    secret_client = secretmanager.SecretManagerServiceClient()

    # The "latest" alias selects the most recent version of the secret.
    version_path = f"projects/{project_id}/secrets/{secrets_name}/versions/latest"
    version = secret_client.access_secret_version(request={"name": version_path})
    return version.payload.data.decode('UTF-8')


Load each secret individually

In [None]:
# Google Cloud project whose Secret Manager holds the keys used by this notebook.
project_id = 'botchagalupep1'
# OpenAI key: kept in a variable and also exported through the environment.
openai_api_key = load_secrets("openai_api_key",project_id)
os.environ['OPENAI_API_KEY'] = openai_api_key
# Pinecone key, passed to pinecone.init() further down.
pinecone_api_key = load_secrets("pinecone_api_key",project_id)
# Disabled lookups: the Pinecone environment and index name are written as
# literals in the Pinecone initialization cell instead.
#pinecone_environment = load_secrets("pinecone_environment",project_id)
#pinecone_index_name = load_secrets("pinecone_index",project_id)
# LangSmith key, exported as LANGCHAIN_API_KEY by the tracing cell.
langsmith_api_key = load_secrets("langsmith_api_key",project_id)

This is the code to enable Langsmith (Langchain tracing)

In [None]:
# Connect this notebook to LangSmith (LangChain tracing) through environment
# variables.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# The key is the value loaded from Secret Manager into langsmith_api_key.
os.environ['LANGCHAIN_API_KEY'] = langsmith_api_key

os.environ['LANGCHAIN_PROJECT'] = 'GAI-Demo-Agents_Combined_with_vector_stores_as_tools'

# Print the settings to verify them. The API key line stays disabled so the
# secret does not end up in the saved notebook output.
print(os.environ.get('LANGCHAIN_TRACING_V2'))
print(os.environ.get('LANGCHAIN_ENDPOINT'))
#print(os.environ.get('LANGCHAIN_API_KEY'))  # Enable only for local debugging; never share the output.
print(os.environ.get('LANGCHAIN_PROJECT'))

true
https://api.smith.langchain.com
GAI-Demo-Agents_Combined_with_vector_stores_as_tools


# 3. Set API Keys and Initialize Embeddings
Set your OpenAI API key and initialize the OpenAI embeddings.

In [None]:
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# 4. Define Functions for Loading and Processing

Define a helper function to load and process documents from a given directory using the DirectoryLoader class.

In [None]:
def load_and_process_docs(directory):
    """Load every document found under *directory* and return them as a list.

    Each loaded document is echoed to stdout together with its position, which
    makes it easy to see what the loader picked up.

    Args:
        directory: Path handed to ``DirectoryLoader``.

    Returns:
        A new list holding the loaded documents in loader order.
    """
    loaded = DirectoryLoader(directory).load()

    collected = []
    for position, doc in enumerate(loaded):
        # The whole document object is interpolated, so the output can be long.
        print(f"Document {position}: Successfully loaded and processed {doc}")
        collected.append(doc)
    return collected

# 5. Define Function for Splitting Documents
Define a function to split documents into chunks using the RecursiveCharacterTextSplitter.

In [None]:
def split_docs(documents, chunk_size=1000, chunk_overlap=200):
    """Split *documents* into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: ``chunk_size`` forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        The chunks produced by the splitter's ``split_documents``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(documents)

This function is for debugging chunk sizes.

In [None]:
def show_chunks(documents, chunk_size=1000, chunk_overlap=200):
    """Debug helper: split *documents* and print the first five chunks.

    The input is always split again with the given settings, so passing
    chunks that were already split re-splits those chunks.

    Args:
        documents: Documents to split.
        chunk_size: ``chunk_size`` forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        None. The chunks are only printed.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = splitter.split_documents(documents)

    # Slicing caps the preview at five chunks (fewer if the split is shorter).
    for number, piece in enumerate(pieces[:5], start=1):
        print(f"Chunk {number}:")
        print(piece)


Load the transcript

In [None]:
# Folder on the mounted Google Drive that holds the source files.
directory = "/content/gdrive/MyDrive/GAI/transcripts"
# Load everything the loader finds there (each document is printed as it loads).
documents = load_and_process_docs(directory)
# Split with split_docs' defaults (chunk_size=1000, chunk_overlap=200).
docs = split_docs(documents)
print(f"{len(docs)} chunks successfully loaded and processed")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


 delete 788 chunks successfully loaded and processed


Split and dump the first chunks using the function defaults

In [None]:
show_chunks(docs)

Chunk 1:
page_content='Advanced Praise\n\n“A thoroughly entertaining and educational look at Deming, a man whose insights are fundamental to modern software development. This book includes delightful stories of those around Deming who influenced his work and helped create the foundation for agile and DevOps."\n\n—Jim Whitehurst, Senior Advisor at IBM\n\n“John Willis has looked deep into Dr. Deming’s history and teachings and profoundly explains his profound knowledge. Read this book and you will understand like never before the wisdom and sound advice of Dr. Deming.”\n\n—Jeffrey K. Liker, author of The Toyota Way' metadata={'source': '/content/gdrive/MyDrive/GAI/transcripts/DEM_Interior.6.24.23.pdf'}
Chunk 2:
page_content="—Jeffrey K. Liker, author of The Toyota Way\n\n“John Willis has crafted a delightful and insightful book that brings to life the personal story of W. Edwards Deming and how he created a tremendous impact in the world through his theory of profound knowledge. In this 

In [None]:
show_chunks(docs, chunk_size=500, chunk_overlap=20)

Chunk 1:
page_content='Advanced Praise\n\n“A thoroughly entertaining and educational look at Deming, a man whose insights are fundamental to modern software development. This book includes delightful stories of those around Deming who influenced his work and helped create the foundation for agile and DevOps."\n\n—Jim Whitehurst, Senior Advisor at IBM' metadata={'source': '/content/gdrive/MyDrive/GAI/transcripts/DEM_Interior.6.24.23.pdf'}
Chunk 2:
page_content='“John Willis has looked deep into Dr. Deming’s history and teachings and profoundly explains his profound knowledge. Read this book and you will understand like never before the wisdom and sound advice of Dr. Deming.”\n\n—Jeffrey K. Liker, author of The Toyota Way' metadata={'source': '/content/gdrive/MyDrive/GAI/transcripts/DEM_Interior.6.24.23.pdf'}
Chunk 3:
page_content='—Jeffrey K. Liker, author of The Toyota Way' metadata={'source': '/content/gdrive/MyDrive/GAI/transcripts/DEM_Interior.6.24.23.pdf'}
Chunk 4:
page_content="“J

In [None]:
show_chunks(docs, chunk_size=250, chunk_overlap=20)

Chunk 1:
page_content='Advanced Praise' metadata={'source': '/content/gdrive/MyDrive/GAI/transcripts/DEM_Interior.6.24.23.pdf'}
Chunk 2:
page_content='“A thoroughly entertaining and educational look at Deming, a man whose insights are fundamental to modern software development. This book includes delightful stories of those around Deming who influenced his work and helped create the foundation for' metadata={'source': '/content/gdrive/MyDrive/GAI/transcripts/DEM_Interior.6.24.23.pdf'}
Chunk 3:
page_content='the foundation for agile and DevOps."' metadata={'source': '/content/gdrive/MyDrive/GAI/transcripts/DEM_Interior.6.24.23.pdf'}
Chunk 4:
page_content='—Jim Whitehurst, Senior Advisor at IBM' metadata={'source': '/content/gdrive/MyDrive/GAI/transcripts/DEM_Interior.6.24.23.pdf'}
Chunk 5:
page_content='“John Willis has looked deep into Dr. Deming’s history and teachings and profoundly explains his profound knowledge. Read this book and you will understand like never before the wisdom a

# 6. Load and Process Documents

Load and process the documents from the transcripts directory (`/content/gdrive/MyDrive/GAI/transcripts`), then split them into chunks. This is done by the "Load the transcript" cell above.

# 7. Initialize Pinecone and Create Index
Initialize Pinecone with your API key and environment, then create an index using the Pinecone.from_documents() method.

In [None]:
# Name of the Pinecone index used by this notebook. It is defined once, before
# its first use, so the calls below cannot drift apart.
index_name = "gai-test1"

pinecone.init(
    api_key=pinecone_api_key,  # find at app.pinecone.io
    environment="asia-southeast1-gcp-free"  # next to api key in console
)

print(index_name)

# `index` is the LangChain vector store built from the chunks; qa() uses it as
# its default `index` argument. The earlier raw `pinecone.Index(...)` binding
# was overwritten here before anything read it, so it has been dropped.
# NOTE(review): re-running this cell presumably embeds and uploads the chunks
# again - confirm whether duplicate vectors in the index are acceptable.
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)

# Second handle on the same index, opened without uploading documents.
# The (misspelled) name is kept because other cells may refer to it.
pinecone_vectore_store = Pinecone.from_existing_index(index_name, embeddings)

gai-test1


# 8. Define the Question-Answering Function
Define a function to perform question-answering tasks using the Pinecone index created in the previous steps. The function takes a query, a chain type, and a value for k (the number of similar chunks to consider) as input and returns the best answer along with the relevant source documents.

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

def qa(query, chain_type, k, index=index, openai_api_key=openai.api_key):
    """Answer *query* with a RetrievalQA chain backed by the vector store.

    Both defaults are evaluated once, when this ``def`` statement runs: the
    function keeps the ``index`` object and the ``openai.api_key`` value that
    existed at that moment, even if those names are rebound later.

    Args:
        query: Question to ask.
        chain_type: Chain type forwarded to ``RetrievalQA.from_chain_type``
            (the notebook uses "stuff", "refine" and "map_reduce").
        k: Number of similar chunks the retriever is asked for.
        index: Vector store exposing ``as_retriever``.
        openai_api_key: Key passed to the ``OpenAI`` LLM wrapper.

    Returns:
        The chain output; its ``'result'`` entry (the answer) is also printed.
        Source documents are requested via ``return_source_documents=True``.
    """
    # Similarity search limited to the k closest chunks.
    chunk_retriever = index.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    llm = OpenAI(openai_api_key=openai_api_key)
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type=chain_type,
        retriever=chunk_retriever,
        return_source_documents=True,
    )

    answer = chain({"query": query})
    print(answer['result'])
    return answer


# 9. Example Usage of the Question-Answering Function

Below is an example of how to use the qa() function to answer a question about HRMC (Deming's Journey to Profound Knowledge) using the different chain types.

In [None]:
result = qa("Summarize the book", "stuff", 5)

 The book is a modern retelling of the bestselling business book The Goal, a novel about supply-chain management in a manufacturing setting. The Phoenix Project provides a modern lens on the topic, focusing on software development and delivery. It includes DevOps in the subtitle. After the book is published, the authors wrote The DevOps Handbook to provide a prescriptive solution to readers. The two books have a combined sales of almost a million copies.


In [None]:
result = qa("Summarize the book", "refine", 5)



The Phoenix Project is a modern retelling of the bestselling business book The Goal, focusing on software development and delivery. The book follows the story of W. Edwards Deming, the author of Deming's masterwork, as he shared it with the world at the age of ninety-three just before his death. Imagine publishing your magnum opus at that age, just before your death - this gives you a clue as to the kind of man Deming was. Along with The DevOps Handbook, which was written by Gene Kim, Patrick Debois, Jez Humble and the author, the books have sold almost a million copies combined. This modern retelling of The Goal is essential reading for anyone who wants to understand the importance of quality and uniformity in the world of software development and delivery.


In [None]:
result = qa("Summarize the book", "map_reduce", 5)

 This book is a portrait of W. Edwards Deming and John Willis, two soft-spoken experts in this century's biggest advancements. It captures the essence of how to improve most any business, with a focus on quality and understanding of the people responsible for that quality. It also gives a portrait of the hard-working, humble man from Wyoming who helped revolutionize manufacturing in Japan and around the world.


## 10. Language Model Pipeline for Querying Pinecone Datastore
This pipeline leverages a Pinecone vector store which contains embedded documents (transcripts, in this case) for efficient querying and retrieval. The main function, `qa_from_vector_db`, retrieves relevant documents in response to a query and uses a question-answering chain to produce an answer from these documents.

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

def qa_from_vector_db(query, chain_type, k, openai_api_key=openai.api_key):
    """Answer *query* from the existing Pinecone index named ``index_name``.

    The vector store is opened with ``Pinecone.from_existing_index`` on every
    call, using the module-level ``index_name`` and ``embeddings``.

    Args:
        query: Question to ask.
        chain_type: Chain type forwarded to ``RetrievalQA.from_chain_type``
            (the notebook uses "stuff" and "refine").
        k: Number of similar chunks the retriever is asked for.
        openai_api_key: Key passed to the ``OpenAI`` LLM wrapper. The default
            is the value of ``openai.api_key`` at the time this ``def`` ran.

    Returns:
        The chain output. The query and the ``'result'`` entry are printed;
        source documents are requested via ``return_source_documents=True``.
    """
    pinecone_vector_store = Pinecone.from_existing_index(index_name, embeddings)

    # Honor the caller's k. Previously as_retriever() was called without
    # arguments, so k was silently ignored and the retriever's own default
    # number of chunks was used instead.
    retriever = pinecone_vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": k},
    )

    # Create a chain to answer questions
    qa_chain = RetrievalQA.from_chain_type(
        llm=OpenAI(openai_api_key=openai_api_key),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
    )

    result = qa_chain({"query": query})
    print(query)
    print(result['result'])
    return result

In [None]:
qa_from_vector_db("what were the top 5 things about the book","refine",6)

what were the top 5 things about the book


1. It is a biography of W. Edwards Deming, a man who published his magnum opus at the age of 93.
2. It tells the story behind Deming's magnum opus and provides insight into the power of his work, the concept of quality and uniformity as foundations of commerce, prosperity, and peace.
3. It explores the life of an extraordinary man and the impact his work had on the world, including his ideas, quotations, and paraphrases attributed to different thinkers and industry leaders.
4. It was published when Deming was 93 years old, just before his death.
5. It was written by acclaimed author Michael Lewis, who is known for his biographies, and is available in eBook, Web PDF, and Audio formats.


{'query': 'what were the top 5 things about the book',
 'result': "\n\n1. It is a biography of W. Edwards Deming, a man who published his magnum opus at the age of 93.\n2. It tells the story behind Deming's magnum opus and provides insight into the power of his work, the concept of quality and uniformity as foundations of commerce, prosperity, and peace.\n3. It explores the life of an extraordinary man and the impact his work had on the world, including his ideas, quotations, and paraphrases attributed to different thinkers and industry leaders.\n4. It was published when Deming was 93 years old, just before his death.\n5. It was written by acclaimed author Michael Lewis, who is known for his biographies, and is available in eBook, Web PDF, and Audio formats.",
 'source_documents': [Document(page_content='One of my favorite authors is Michael Lewis. When reading Moneyball, for example, you think you’re reading a book about baseball statistics, but by the time you finish, you find that y

In [None]:
qa_from_vector_db("Tell me more about the book","stuff",6)

Tell me more about the book
 The book is The DevOps Handbook, published in 2016. It was written by Gene Kim, Patrick Debois, Jez Humble, and George Spafford. The book is about software development and delivery, and includes DevOps in the subtitle. It provides a prescriptive solution after reading it. It has sold almost a million copies. The ISBNs are 9781950508839, 9781950508846, 9781950508853, and 9781950508860.


{'query': 'Tell me more about the book',
 'result': ' The book is The DevOps Handbook, published in 2016. It was written by Gene Kim, Patrick Debois, Jez Humble, and George Spafford. The book is about software development and delivery, and includes DevOps in the subtitle. It provides a prescriptive solution after reading it. It has sold almost a million copies. The ISBNs are 9781950508839, 9781950508846, 9781950508853, and 9781950508860.',
 'source_documents': [Document(page_content='One of my favorite authors is Michael Lewis. When reading Moneyball, for example, you think you’re reading a book about baseball statistics, but by the time you finish, you find that you’ve read a biography of Billy Beane. Similarly, while this book may look like a biography of Deming, it’s the story behind the story of his masterwork, which he shared with the world when he was nine- ty-three years old. Imagine publishing your magnum opus at that age, just before your death. That gives you a clue as to the