# PPT Generator

In [22]:
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import textwrap
from langchain_community.document_loaders import Docx2txtLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
import google.generativeai as genai
import glob
from docx import Document

In [2]:
load_dotenv()

True

In [3]:
google_api_key = os.getenv("GOOGLE_API_KEY")

In [4]:
genai.configure(api_key=google_api_key)

In [5]:
# model = genai.GenerativeModel('gemini-pro')

In [6]:
from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

#### Functions For Document DOCX


Go through the below functions

In [None]:

def extract_text_from_each_docx(file):
    document  = Document(file)
    # content = []
    content = [para.text.strip() for para in document.paragraphs if para.text.strip()]

    # for para in document.paragraphs:
    #     content.append(para.text)
    return content



In [35]:
#Accumlated Texts From Each Documents
def acc_texts_from_docs(folder_path):
    doc_files = glob.glob(os.path.join(folder_path,"*.docx"))
    all_text = ""
    for file in doc_files:
        text = extract_text_from_each_docx(file)
        all_text += '\n'.join(text) + '\n'
    return all_text

In [32]:
def get_doc_chunks(text):
    text_splitter =  RecursiveCharacterTextSplitter(chunk_size =10000,chunk_overlap = 1000)
    chunks = text_splitter.split_text(text)
    return chunks
    
def save_embeddings_to_local(chunks):

    # Replace below with AzureOpenAI Embeddings
    embedding = GoogleGenerativeAIEmbeddings(
                    model = 'models/embedding-001', 
                    task_type='retrieval_document'
                                                )
    db = Chroma.from_texts(chunks,embedding= embedding,persist_directory="./Data/tempchroma_db")
    # FAISS
    # vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    # vector_store.save_local("temp_docs_embed")

In [49]:
#MAIN FUNCTION

def docx_rag(folder_path):
    all_text = acc_texts_from_docs(folder_path)
    chunks = get_doc_chunks(all_text)
    save_embeddings_to_local(chunks)

def get_similar_docs_from_query(query,embedding_function):
    db = Chroma(persist_directory="./Data/tempchroma_db", embedding_function=embedding_function)
    docs = db.similarity_search(query)
    return docs

In [41]:
docx_rag(folder)

In [54]:
embedding = GoogleGenerativeAIEmbeddings(
                    model = 'models/embedding-001', 
                    task_type='retrieval_document'
                                                )
query = "Major Actions Performed By Spotify in 2022"

sdocs = get_similar_docs_from_query(query=query,embedding_function=embedding)

In [None]:
def get_chain():
    prompt_template = """
    Act as if you are the Content Gatherer for making a powerpoint presentation who extracts the data in brief from the given context below according to the query provided. Present this information in the form of points. Do not include any other information which does not contains similar kind of data like the query.Ensure that each slide contains concise and relevant information to support the main points. If you do not find the information, please don't return anything.
    Context : \n {context} \n
    Query : \n {query} \n
    Similar Content :
    
    """
    model = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0.3)
    prompt = PromptTemplate(template=prompt_template,input_variables=["context","query"])
    chain = load_qa_chain(model,chain_type="stuff",prompt = prompt)
    return chain
    
    

In [53]:
chain  = get_chain()
response = chain(
    {"input_documents":sdocs,"query":query},
    return_only_outputs = True
)
to_markdown(response['output_text'])


> - PUMA provided safe accommodation and welcomed many PUMA employees and their families in Germany and Poland during the war in Ukraine.
> - PUMA was named a Top Employer 2022 in several regions of the world.
> - PUMA launched its PUMA Shopping App in India, the United States, the UK and Japan.
> - PUMA unveiled its largest Web3 collaboration to date with 10KTF and launched “PUMA and the Land of Games” on the online gaming platform Roblox.
> - PUMA announced a long-term partnership with British-Nigerian rapper, and record producer Skepta.
> - PUMA invited industry peers, activists, NGOs, experts, ambassadors, and consumers to London for its sustainability event Conference of the People.

In [55]:
chain  = get_chain()
response = chain(
    {"input_documents":sdocs,"query":query},
    return_only_outputs = True
)
to_markdown(response['output_text'])

> - Spotify published its long-standing Platform Rules to ensure a safe and enjoyable experience for users.
> - Spotify established the Spotify Safety Advisory Council (SAC) to inform and evolve policies and products that keep users safe while respecting creator expression.
> - Spotify acquired Kinzen, a global leader in protecting online communities, to improve its ability to detect and address abusive content and misinformation.
> - Spotify rolled out a content advisory label on podcasts that discuss COVID-19 to connect listeners with more information.
> - Spotify published an overview of its Algorithmic Impact Assessment (AIA) process, which has been used to assess over 100 internal systems.
> - Spotify reinforced its commitment to ethical business practices through its Code of Conduct and Ethics (Code), which prohibits bribery and corruption, conflicts of interest, discrimination, harassment, and retaliation, misuse of data, insider trading, and supplier misconduct.
> - Spotify implemented a Global Policy Review, requiring employees to annually review and acknowledge their compliance with the Code and key global policies.
> - Spotify established the Spotify Ethics Line to allow employees to raise compliance concerns comfortably and without fear of reprisal.

# Below Code Only For Testing

In [14]:
# def list_docx_files(folder_path):
#     doc_files = glob.glob(os.path.join(folder_path,"*.docx"))
#     return doc_files

In [15]:
# files = list_docx_files(folder)

In [12]:
def generate_similar_docs_from_docx(query,folder_path):
    model = 'models/embedding-001'
    list_docs = []
    doc_files = glob.glob(os.path.join(folder_path,"*.docx"))
    for file in doc_files:
        loader  = Docx2txtLoader(file)
        data = loader.load()
        text_splitter =  RecursiveCharacterTextSplitter(chunk_size =10000,chunk_overlap = 1000)
        chunks = text_splitter.split_text(data[0].page_content)
        embedding = GoogleGenerativeAIEmbeddings(
                    model = 'models/embedding-001', 
                    task_type='retrieval_document'
                                                )
        db = Chroma.from_texts(chunks,embedding= embedding,persist_directory="./Data/tempchroma_db")
        docs = db.similarity_search(query)
        list_docs.append({'File Name':file,'SimilarDocs':docs})
    return pd.DataFrame(list_docs)

In [46]:
def get_chain():
    prompt_template = """
    Act as if you are the Content Gatherer for making a powerpoint presentation who extracts the data in brief from the given context below according to the query provided. Present this information in the form of points. Do not include any other information which does not contains similar kind of data like the query.Ensure that each slide contains concise and relevant information to support the main points. If you do not find the information, please don't return anything.
    Context : \n {context} \n
    Query : \n {query} \n
    Similar Content :
    
    """
    model = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0.3)
    prompt = PromptTemplate(template=prompt_template,input_variables=["context","query"])
    chain = load_qa_chain(model,chain_type="stuff",prompt = prompt)
    return chain
    
    

In [17]:
query = "Major Efforts Taken By PUMA in 2022"
folder = "./Data/"
df = generate_similar_docs_from_docx  (query,folder)

In [22]:
#Sample to check with the first DOCX Document to get the similar content in the form of bullet points 
chain  = get_chain()
response = chain(
    {"input_documents":df.SimilarDocs[0],"query":query},
    return_only_outputs = True
)
to_markdown(response['output_text'])


  warn_deprecated(


> - PUMA provided safe accommodation and welcomed many PUMA employees and their families in Germany and Poland during the war in Ukraine.
> - PUMA was named a Top Employer 2022 in several regions of the world for its efforts to provide an attractive workplace.
> - PUMA's brand heat was boosted by the great performances of its athletes, including Shericka Jackson, Armand "Mondo" Duplantis, and Pedro Pichardo.
> - PUMA launched its largest Web3 collaboration to date with 10KTF and launched "PUMA and the Land of Games" on the online gaming platform Roblox.
> - PUMA launched its PUMA Shopping App in India, the United States, the UK, and Japan.
> - PUMA announced a long-term partnership with British-Nigerian rapper and record producer Skepta.
> - PUMA invited industry peers, activists, NGOs, experts, ambassadors, and consumers to London for its sustainability event Conference of the People.

In [38]:
def extract_text_from_each_docx(file):
    document  = Document(file)
    # content = []
    content = [para.text.strip() for para in document.paragraphs if para.text.strip()]

    # for para in document.paragraphs:
    #     content.append(para.text)
    return content

In [39]:
def acc_texts_from_docs(folder_path):
    doc_files = glob.glob(os.path.join(folder_path,"*.docx"))
    all_text = ""
    for file in doc_files:
        text = extract_text_from_each_docx(file)
        all_text += '\n'.join(text) + '\n'
    return all_text
        
    

In [None]:
def get_doc_chunks(text):
    text_splitter =  RecursiveCharacterTextSplitter(chunk_size =10000,chunk_overlap = 1000)
    chunks = text_splitter.split_text(text)
    return chunks
    
def save_embeddings_to_local(chunks):

    # Replace below with AzureOpenAI Embeddings
    embedding = GoogleGenerativeAIEmbeddings(
                    model = 'models/embedding-001', 
                    task_type='retrieval_document'
                                                )
    db = Chroma.from_texts(chunks,embedding= embedding,persist_directory="./Data/tempchroma_db")
    # FAISS
    # vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    # vector_store.save_local("temp_docs_embed")

    
    

In [40]:
acc_texts_from_docs(folder)

"OVERVIEW 2022\nNotes relating to forward-looking statements\nThis document contains statements about the future business development and strategic direction of the Company. The forward-looking statements are based on management's current expectations and assumptions. They are subject to certain risks and fluctuations as described in other publications, in particular in the risk and opportunities management section of the combined management report. If these expectations and assumptions do not apply or if unforeseen risks arise, the actual course of business may differ significantly from the expected developments. We therefore assume no liability for the accuracy of these forecasts.\nThese sections contain content or cross-references not required by law, which were not audited by the auditor, but were merely read critically. In the case of cross- references, the information to which the cross- references refer was also not audited.\nHUBERT HINTERSEHER\nCHIEF FINANCIAL OFFICER PUMA\nIn 

# Below Is For My Testing Purposes 


Dont refer this part below

In [10]:
loader  = Docx2txtLoader(files[0])
data = loader.load()

text_splitter =  RecursiveCharacterTextSplitter(chunk_size =10000,chunk_overlap = 1000)
chunks = text_splitter.split_text(data[0].page_content)

model = 'models/embedding-001'

embedding = GoogleGenerativeAIEmbeddings(
    model = model, 
    task_type='retrieval_document'
)
db = Chroma.from_texts(chunks,embedding= embedding,persist_directory="./Data/tempchroma_db")


NameError: name 'files' is not defined

In [51]:
# Work With Query Part

docs = db.similarity_search(query)




'- PUMA was named a Top Employer 2022 in several regions of the world, including Europe and Asia/Pacific.\n- PUMA was listed by Forbes among the World’s Best Employers and by the Financial Times as a Leader in Diversity for Europe.\n- PUMA was named “Company of the Year” at the German Diversity Awards and in Mexico PUMA was among the best places to work for women in Expansión’s ranking “Súper Empresas Para Mujeres”.\n- PUMA launched its PUMA Shopping App in India, the United States, the UK and Japan.\n- PUMA announced a long-term partnership with British-Nigerian rapper, and record producer Skepta, who will design product and be a part of global marketing campaigns.\n- PUMA invited industry peers, activists, NGOs, experts, ambassadors, and consumers to London for its sustainability event Conference of the People, an open conversation about sustainability, which put the concerns of Gen Z at the center of the debate.\n- PUMA was honored when industry publication Business of Fashion named

In [52]:
to_markdown(response['output_text'])

> - PUMA was named a Top Employer 2022 in several regions of the world, including Europe and Asia/Pacific.
> - PUMA was listed by Forbes among the World’s Best Employers and by the Financial Times as a Leader in Diversity for Europe.
> - PUMA was named “Company of the Year” at the German Diversity Awards and in Mexico PUMA was among the best places to work for women in Expansión’s ranking “Súper Empresas Para Mujeres”.
> - PUMA launched its PUMA Shopping App in India, the United States, the UK and Japan.
> - PUMA announced a long-term partnership with British-Nigerian rapper, and record producer Skepta, who will design product and be a part of global marketing campaigns.
> - PUMA invited industry peers, activists, NGOs, experts, ambassadors, and consumers to London for its sustainability event Conference of the People, an open conversation about sustainability, which put the concerns of Gen Z at the center of the debate.
> - PUMA was honored when industry publication Business of Fashion named PUMA the most sustainable brand in a ranking of the 30 largest companies in the business and when PUMA received the Footwear News Sustainability Leadership Award.

In [11]:
def extract_content_from_files(folder_path):
    frame = []
    doc_files = glob.glob(os.path.join(folder_path,"*.docx"))
    for file in doc_files:
        loader  = Docx2txtLoader(file)
        data = loader.load()

        text_splitter =  RecursiveCharacterTextSplitter(chunk_size =10000,chunk_overlap = 1000)
        chunks = text_splitter.split_text(data[0].page_content)
        
        # content = ""
        # for para in document.paragraphs:
        #     content += para.text + "\n"
        embeddings = process_in_batches(content,1000) 
        frame.append({'File Name':file,'Content':content,'Embeddings':embeddings})
    return pd.DataFrame(frame)

In [14]:
def create_embeddings(text):
    model = 'models/embedding-001'
    return genai.embed_content(
        model = model,
        content=text,
        task_type='retrieval_document',
    )["embedding"]
    
    

In [15]:
#process in batches

def process_in_batches(chunks, batch_size):
    start_index = 0
    embeddings = []
    while start_index < len(chunks):
        end_index = start_index + batch_size
        if start_index == 0:
            data = chunks[start_index:end_index]
            embeddings.append(create_embeddings(data))
        else:
            if end_index > len(chunks):
                data = chunks[start_index:len(chunks)]
            else:
                data = chunks[start_index:end_index]
            embeddings.append(create_embeddings(data))
        start_index = end_index
        return embeddings


In [16]:
data =extract_content_from_files(folder)

<module 'langchain_community.vectorstores.chroma' from 'C:\\Users\\imbit\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\langchain_community\\vectorstores\\chroma.py'>

Asking the Query and Checking the similarities

"OVERVIEW 2022\n\nNotes relating to forward-looking statements\n\nThis document contains statements about the future business development and strategic direction of the Company. The forward-looking statements are based on management's current expectations and assumptions. They are subject to certain risks and fluctuations as described in other publications, in particular in the risk and opportunities management section of the combined management report. If these expectations and assumptions do not apply or if unforeseen risks arise, the actual course of business may differ significantly from the expected developments. We therefore assume no liability for the accuracy of these forecasts.\n\n\n\n\n\n\n\nThese sections contain content or cross-references not required by law, which were not audited by the auditor, but were merely read critically. In the case of cross- references, the information to which the cross- references refer was also not audited.\n\n\n\n\n\n\n\n\n\n➔HUBERT HINTERSEH

In [19]:
def get_similar_answer(query,dataframe):
    query_embedding = genai.embed_content(
        model = model,
        content=query,
        task_type='retrieval_query',
    )
    # Using dot product of stacked/accu,lated dataframe embeddings and query embedding
    dot_products = np.dot(np.stack(dataframe['Embeddings']),query_embedding["embedding"])
    idxs = np.argmax(dot_products)
    
    
    

In [20]:
 dot_products = np.dot(np.stack(data['Embeddings']),request["embedding"])

In [21]:
dot_products

array([[0.66765165]])