In [None]:
# Install every dependency exactly once (the previous cell installed several of
# them two or three times). %pip is used instead of !pip so the packages land in
# the environment of the running kernel. sentence-transformers keeps its 2.2.2 pin.
%pip install streamlit pyngrok PyPDF2 easyocr Pillow python-dotenv huggingface_hub
%pip install langchain langchain_core langchain_community langchain_groq
%pip install torch torchaudio transformers faiss-cpu InstructorEmbedding sentence-transformers==2.2.2


Collecting streamlit
  Downloading streamlit-1.37.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain
  Downloading langchain-0.2.12-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.2.11-py3-none-any.whl.metadata (2.7 kB)
Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting langchain_groq
  Downloading langchain_groq-0.1.9-py3-none-any.whl.metadata (2.9 kB)
Collecting InstructorEmbedding
  Downloading InstructorEmbedding-1.0.1-py2.py3-none-any.whl.metadata (20 kB)
Collecting tenacity<9,>=8.1.0 (from streamlit)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Down

In [None]:
!pip install PyPDF2 easyocr



In [None]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceInstructEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.memory import ConversationBufferMemory
from transformers import pipeline
from langchain.chains import ConversationalRetrievalChain
from langchain_groq import ChatGroq
import easyocr
from PIL import Image
import torchaudio
import numpy as np

# GROQ API key: taken from the GROQ_API_KEY environment variable so the real
# secret never has to be saved inside the notebook. The original placeholder
# string is kept as the fallback when the variable is not set.
groq_api_key = os.environ.get("GROQ_API_KEY", "your key")

# Extraction of text from PDFs
def get_pdf_text(pdf_paths):
    """Return the concatenated text of every page of every PDF in `pdf_paths`.

    Pages whose extraction yields nothing contribute an empty string. If a file
    raises while being read, the error is printed and processing moves on to the
    next file; pages already extracted from the failing file are kept.
    """
    page_texts = []
    for pdf_path in pdf_paths:
        try:
            for page in PdfReader(pdf_path).pages:
                page_texts.append(page.extract_text() or "")
        except Exception as e:
            print(f"Error reading PDF {pdf_path}: {e}")
    return "".join(page_texts)

# Dividing the raw text into different chunks
def get_text_chunks(text):
    """Split `text` into chunks with a newline-separated CharacterTextSplitter.

    The splitter is configured with a chunk size of 1000 and an overlap of 200,
    both measured with `len` (i.e. in characters). Returns whatever
    `split_text` produces for the given text.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": 1000,
        "chunk_overlap": 200,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

# Creating a vector store from HuggingFace embeddings
def get_vectorstore(text_chunks):
    """Build a FAISS vector store from `text_chunks`.

    Embeddings come from the "hkunlp/instructor-xl" instructor model. When
    `text_chunks` is empty (or otherwise falsy) a notice is printed and None is
    returned instead of a store.
    """
    if text_chunks:
        instructor = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl")
        return FAISS.from_texts(texts=text_chunks, embedding=instructor)

    print("No text chunks to create embeddings.")
    return None

# Creating a conversation chain to store the context for follow-up questions
def get_conversation_chain(vectorstore, groq_api_key, model_name="mixtral-8x7b-32768"):
    """Create a retrieval chain that keeps the chat history for follow-up questions.

    Args:
        vectorstore: Store whose `as_retriever()` supplies the retriever.
        groq_api_key: API key handed to `ChatGroq`.
        model_name: Groq model identifier. Defaults to the model that was
            previously hard-coded, so existing two-argument calls behave the
            same; pass another name to switch models without editing this
            function.
            NOTE(review): confirm the default model is still offered by Groq.

    Returns:
        The chain built by `ConversationalRetrievalChain.from_llm`.
    """
    llm = ChatGroq(groq_api_key=groq_api_key, model_name=model_name)
    # return_messages=True makes the memory hand back message objects (with a
    # `.content` attribute) under the 'chat_history' key.
    memory = ConversationBufferMemory(
        memory_key='chat_history', return_messages=True)
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        memory=memory
    )

# Handling the user input
def handle_userinput(conversation_chain, user_question, show_full_history=False):
    """Send `user_question` to the chain and print the resulting exchange.

    Previously the entire chat history was re-printed after every question, so
    earlier questions and answers were repeated on each turn. By default only
    the newest question/answer pair is printed now.

    Args:
        conversation_chain: Object that maps ``{'question': ...}`` to a mapping
            containing a ``'chat_history'`` list of messages with a `.content`
            attribute. Its `invoke` method is used when available (calling a
            chain directly triggers a deprecation warning); otherwise the
            object itself is called.
        user_question: The question text.
        show_full_history: Set to True to print every message in the history,
            as the function used to do.
    """
    payload = {'question': user_question}
    invoke = getattr(conversation_chain, 'invoke', None)
    response = invoke(payload) if callable(invoke) else conversation_chain(payload)
    chat_history = response['chat_history']

    # The history alternates user / bot messages, so the last two entries are
    # the exchange that was just completed.
    first = 0 if show_full_history else max(len(chat_history) - 2, 0)
    for i in range(first, len(chat_history)):
        # Parity is taken from the absolute position so labels match the
        # full-history output exactly.
        speaker = "User" if i % 2 == 0 else "Bot"
        print(f"{speaker}: {chat_history[i].content}")

# Extract text from images using easyocr
def get_image_text(image_paths):
    """Run English OCR over each image and return the recognised text.

    Each image contributes one line: its recognised fragments joined by spaces
    and terminated with a newline. An image that fails to load or process is
    reported on stdout and skipped.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        The concatenated OCR text, or "" when `image_paths` is empty.
    """
    # Nothing to read: skip constructing the OCR reader altogether.
    if not image_paths:
        return ""

    reader = easyocr.Reader(['en'])
    lines = []
    for image_path in image_paths:
        try:
            # The context manager closes the underlying file handle, which the
            # bare Image.open() call used to leave open.
            with Image.open(image_path) as image:
                # Palette ('P'), bilevel ('1'), CMYK, ... images would turn into
                # index/bool/4-channel arrays that do not represent the picture
                # as plain pixels; normalise those to RGB. Common modes are
                # passed through unchanged.
                if image.mode not in ("L", "RGB", "RGBA"):
                    image = image.convert("RGB")
                image_np = np.array(image)
            ocr_result = reader.readtext(image_np, detail=0)
            lines.append(' '.join(ocr_result) + "\n")
        except Exception as e:
            print(f"Error reading image {image_path}: {e}")
    return "".join(lines)

# Extract text from audio
def get_audio_text(audio_paths):
    """Transcribe each audio file with wav2vec2 and return the combined text.

    Fixes over the previous version:
      * the pipeline runs on GPU 0 only when CUDA is available, otherwise on CPU
        (it used to request GPU 0 unconditionally);
      * any multi-channel recording is down-mixed to mono, not just stereo;
      * the samples are passed as a 1-D array together with their sampling
        rate, so the pipeline accepts them and can resample as needed (a 2-D
        array without a rate was passed before);
      * transcripts of different files are separated by a newline instead of
        being glued together.

    Args:
        audio_paths: Iterable of audio file paths.

    Returns:
        The transcripts joined by newlines; "" when there is nothing to
        transcribe. Files that fail are reported on stdout and skipped.
    """
    # Nothing to transcribe: skip loading the speech model altogether.
    if not audio_paths:
        return ""

    import torch  # local import: only needed to probe for a CUDA device

    device = 0 if torch.cuda.is_available() else -1
    asr_pipeline = pipeline(
        "automatic-speech-recognition",
        model="facebook/wav2vec2-large-960h",
        device=device,
    )

    transcripts = []
    for audio_path in audio_paths:
        try:
            waveform, sample_rate = torchaudio.load(audio_path, normalize=True)

            # Average all channels down to a single one when there are several.
            if waveform.shape[0] > 1:
                waveform = waveform.mean(dim=0, keepdim=True)

            # Drop the channel axis: the pipeline expects single-channel, 1-D input.
            samples = waveform.squeeze(0).numpy()
            transcription = asr_pipeline(
                {"raw": samples, "sampling_rate": sample_rate}
            )

            if 'text' in transcription:
                transcripts.append(transcription['text'])
            else:
                print(f"Could not extract text from audio: {audio_path}")
        except Exception as e:
            print(f"Error processing audio {audio_path}: {e}")
    return "\n".join(transcripts)

# Main function
def _prompt_for_paths(kind):
    """Ask for comma-separated paths to `kind` files; return the non-blank, stripped entries."""
    raw = input(f"Enter the paths to {kind} files (separated by commas): ")
    return [path.strip() for path in raw.split(',') if path.strip()]


def _chat_loop(conversation_chain):
    """Forward questions to the chain until the user types 'exit' or 'quit'."""
    while True:
        user_question = input("Ask something (type 'exit' to quit): ").strip()
        if not user_question:
            # Do not spend an LLM call on an empty line.
            continue
        if user_question.lower() in ['exit', 'quit']:
            break
        handle_userinput(conversation_chain, user_question)


# Main function
def main():
    """Interactive menu: pick a file type, index its text, then chat about it.

    For each round the user chooses PDF, image or audio input, enters one or
    more paths, and the extracted text is chunked and embedded. When a vector
    store could be built, a question loop runs until 'exit'/'quit'; afterwards
    the menu is shown again. Choice 4 leaves the program.

    Blank path entries are dropped (and the round is skipped if none remain),
    surrounding whitespace in answers is ignored, and Ctrl-C / end of input
    ends the session with a message instead of a traceback.
    """
    print("Welcome to the File Processor!")

    # Menu choice -> word used in the path prompt for that file type.
    file_kinds = {'1': "PDF", '2': "image", '3': "audio"}

    try:
        while True:
            print("\nChoose the type of file to process:")
            print("1. PDF")
            print("2. Image")
            print("3. Audio")
            print("4. Exit")

            choice = input("Enter the number corresponding to your choice: ").strip()

            if choice == '4':
                print("Exiting...")
                break

            if choice not in file_kinds:
                print("Invalid choice. Please try again.")
                continue

            paths = _prompt_for_paths(file_kinds[choice])
            if not paths:
                print("No file paths provided. Please try again.")
                continue

            if choice == '1':
                raw_text = get_pdf_text(paths)
            elif choice == '2':
                raw_text = get_image_text(paths)
            else:
                raw_text = get_audio_text(paths)

            # Get text chunks
            text_chunks = get_text_chunks(raw_text)

            # Create vector store with embeddings
            vectorstore = get_vectorstore(text_chunks)

            if vectorstore:
                # Create conversation chain
                conversation_chain = get_conversation_chain(vectorstore, groq_api_key)
                _chat_loop(conversation_chain)
    except (KeyboardInterrupt, EOFError):
        print("\nInterrupted. Exiting...")

if __name__ == '__main__':
    main()


Welcome to the File Processor!

Choose the type of file to process:
1. PDF
2. Image
3. Audio
4. Exit
Enter the number corresponding to your choice: 1
Enter the paths to PDF files (separated by commas): /content/Aurelien-Geron-Hands-On-Machine-Learning-with-Scikit-Learn-Keras-and-Tensorflow_-Concepts-Tools-and-Techniques-to-Build-Intelligent-Systems-OReilly-Media-2019.pdf
Error reading PDF /content/Aurelien-Geron-Hands-On-Machine-Learning-with-Scikit-Learn-Keras-and-Tensorflow_-Concepts-Tools-and-Techniques-to-Build-Intelligent-Systems-OReilly-Media-2019.pdf: EOF marker not found
No text chunks to create embeddings.

Choose the type of file to process:
1. PDF
2. Image
3. Audio
4. Exit
Enter the number corresponding to your choice: 1
Enter the paths to PDF files (separated by commas): /content/Aurelien-Geron-Hands-On-Machine-Learning-with-Scikit-Learn-Keras-and-Tensorflow_-Concepts-Tools-and-Techniques-to-Build-Intelligent-Systems-OReilly-Media-2019.pdf


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512
Ask something (type 'exit' to quit): what is this book about


  warn_deprecated(


User: what is this book about
Bot: Based on the context provided, the book being referred to is the second edition of a machine learning (ML) book written by the author of the popular Keras library, François Chollet. The book is called "Deep Learning with Python" and is published by Manning. The author describes it as a very practical book that covers a wide range of topics in a clear and concise way, with a focus on code examples over mathematical theory. The book also includes a review of the book by the author himself, where he highlights its conciseness, clarity, and depth, similar to the Keras library. The book is intended to be accessible to a large audience, and the author encourages readers to share their experiences with the book and report any errors they find.
Ask something (type 'exit' to quit): does this book explains the content about deep learning and reinforcement learning
User: what is this book about
Bot: Based on the context provided, the book being referred to is th

KeyboardInterrupt: Interrupted by user