In [1]:
import os
import PyPDF2
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.rankers import TransformersSimilarityRanker
from gtts import gTTS
from fastrag.generators.openvino import OpenVINOGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize InMemoryDocumentStore
document_store = InMemoryDocumentStore()

#future thought: create a folder that accepts amd store one or more pdf,
#so the link doesnt have to be different all the time
# Path to your single PDF document 
pdf_path = r"C:\Users\imagi\fastRAG\data\anti-bribery_anti-corruption_policy_feb_2022_finalwebsite.pdf"

# Function to extract text from a PDF file
def extract_pdf_text(pdf_path):
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ""
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

# Extract text from the PDF
text = extract_pdf_text(pdf_path)

# Create a document object
document = {
    'id': os.path.basename(pdf_path),  # Use the filename as document id
    'text': text,
    'title': os.path.basename(pdf_path)
}

# Add the document to the InMemoryDocumentStore
doc = Document(id=document['id'], content=document['text'], meta={"title": document['title']})
document_store.write_documents([doc])

print(f"Document '{document['title']}' has been successfully processed and added to the document store.")

retriever = InMemoryBM25Retriever(document_store=document_store)
ranker = TransformersSimilarityRanker()


Document 'anti-bribery_anti-corruption_policy_feb_2022_finalwebsite.pdf' has been successfully processed and added to the document store.


In [3]:
# Build a RAG pipeline
prompt_template = """
Given these documents, answer the question.
Documents:
{% for doc in documents %}
    {{ doc.text }}
{% endfor %}
Question: {{query}}
Answer:
"""

In [4]:
openvino_compressed_model_path = r"C:\Users\imagi\fastRAG\ov_model"
generator = OpenVINOGenerator(
    model="meta-llama/Llama-3.2-3B",
    compressed_model_dir=openvino_compressed_model_path,
    device_openvino="CPU",
    task="text-generation",
    generation_kwargs={
        "max_new_tokens": 200,
        "temperature": 0.3,
    }
)


In [5]:
pipe = Pipeline()

pipe.add_component("retriever", retriever)
pipe.add_component("ranker", ranker)
pipe.add_component("prompt_builder", PromptBuilder(template=prompt_template))
pipe.add_component("llm", generator)

pipe.connect("retriever.documents", "ranker.documents")
pipe.connect("ranker", "prompt_builder.documents")
pipe.connect("prompt_builder", "llm")
print("Pipeline created")


Pipeline created


In [10]:
query = "What are the responsibilities of the employees against bribery?"
answer_result = pipe.run({
    "prompt_builder": {
        "query": query
    },
    "retriever": {
        "query": query
    },
    "ranker": {
        "query": query,
        "top_k": 1#to ensure relevance to the og doc
    }
})

answer= answer_result["llm"]["replies"][0].split('Question:')[0].strip()
print(answer)
#improve prompt, and add in some parameters to adjust the query and response generation

Ranking by BM25...: 100%|██████████| 1/1 [00:00<00:00, 999.36 docs/s]
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The employees should not take bribes from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should not accept any bribe from the clients. They should


In [12]:
from sentence_transformers import SentenceTransformer, util

# Load the model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Sample generated and retrieved text
generated_answer = answer
retrieved_text = "Employees are responsible for understanding and complying with this Policy. Be familiar with applicable requirements and directives of the policy and communicate them to subordinates. Promptly record all transactions and payments accurately and in reasonable detail. Always raise suspicious transactions to immediate superiors for guidance on next course of action. Promptly report violations or suspected violations through appropriate channels and Promptly complete COBC trainings and assessments, as well as attest to comply annually"


# Encode the sentences
generated_embedding = model.encode(generated_answer, convert_to_tensor=True)
retrieved_embedding = model.encode(retrieved_text, convert_to_tensor=True)

# Compute cosine similarity
cosine_similarity = util.pytorch_cos_sim(generated_embedding, retrieved_embedding)

print(f"Cosine Similarity: {cosine_similarity.item()}")


Cosine Similarity: 0.21813122928142548


In [None]:
#code to input number of tokens generated from the answer above and split the answers based on sentence by sentence
import re
tokens=re.findall(r'\w+', answer)
num_tokens=len(tokens)

print(f"Number of tokens: {num_tokens}")

In [None]:
import nltk
nltk.download('punkt')

# Split the answer into sentences
sentences = nltk.sent_tokenize(answer)

# List the sentences with an index
for idx, sentence in enumerate(sentences, 1):
    print(f"{idx}. {sentence}")

In [None]:
import os
from gtts import gTTS

# Replace this with the path to your own folder
output_dir = r"C:\Users\imagi\fastRAG\audio"

# Ensure the directory exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Generate and save TTS for each sentence
for idx, sentence in enumerate(sentences, 1):
    tts = gTTS(text=sentence, lang='en')
    audio_path = os.path.join(output_dir, f"sentence_{idx}.mp3")
    tts.save(audio_path)
    print(f"Saved audio for sentence {idx} at {audio_path}")


In [None]:
#this code here is to reduce the bluriness on the avatar (image)
# import cv2
# avatar_image = cv2.imread(r"C:\Users\imagi\Desktop\videos(avatar)\edited_idle_video-Scene-001-03.jpg")
# sharpen_filter = cv2.filter2D(avatar_image, -1, kernel=cv2.getGaussianKernel(5, -1))
# cv2.imwrite("sharpened_avatar.jpg", sharpen_filter)

In [None]:
import cv2
import threading
import subprocess
import os
import time
from queue import Queue
from pydub import AudioSegment
import simpleaudio as sa

# Paths (adjust as needed)
avatar_image_path = r"C:\Users\imagi\fastRAG\sharpened_avatar.jpg"
wav2lip_repo_path = r"C:\Users\imagi\fastRAG\Wav2Lip"
model_checkpoint_path = os.path.join(wav2lip_repo_path, "checkpoints", "wav2lip.pth")
output_folder = r"C:\Users\imagi\fastRAG\video_avatar"
audio_folder = r"C:\Users\imagi\fastRAG\audio"

class VideoPlayer:
    def __init__(self, idle_video):
        self.queue = Queue()  # Queue to manage videos
        self.idle_video = idle_video
        self.playing = True  # Control playback
        self.lock = threading.Lock()  # Lock to safely update the queue

    def add_video(self, video_path, audio_path=None):
        """
        Add a new video and its corresponding audio to the queue.
        """
        with self.lock:
            self.queue.put((video_path, audio_path))
            print(f"Added video: {video_path} with audio: {audio_path}")

    def play_video(self, video_path, audio_path=None):
        """
        Play a video file and optionally play its audio.
        """
        print(f"Playing video: {video_path}")
        cap = cv2.VideoCapture(video_path)

        if not cap.isOpened():
            print(f"Error opening video file: {video_path}")
            return

        # Start audio playback if audio_path is provided
        audio_thread = None
        if audio_path:
            audio_thread = threading.Thread(target=self.play_audio, args=(audio_path,))
            audio_thread.start()

        while cap.isOpened() and self.playing:
            ret, frame = cap.read()
            if not ret:
                break

            cv2.imshow('Video Player', frame)
            if cv2.waitKey(30) & 0xFF == ord('q'):  # Press 'q' to quit
                self.playing = False
                break

        cap.release()
        if audio_thread:
            audio_thread.join()  # Wait for audio playback to finish

    def play_videos(self):
        """
        Continuously play videos, prioritizing queued videos over the idle video.
        """
        while self.playing:
            # Check the queue for videos
            if not self.queue.empty():
                video_path, audio_path = self.queue.get()
                self.play_video(video_path, audio_path)
            else:
                # Play the idle video in a loop until a new video is added
                self.play_video(self.idle_video)

        cv2.destroyAllWindows()

    @staticmethod
    def play_audio(audio_path):
        """
        Play an audio file.
        """
        # Convert MP3 to WAV if necessary
        if audio_path.endswith('.mp3'):
            audio = AudioSegment.from_mp3(audio_path)
            wav_path = audio_path.replace('.mp3', '.wav')
            audio.export(wav_path, format="wav")
            audio_path = wav_path

        # Play the WAV file
        wave_obj = sa.WaveObject.from_wave_file(audio_path)
        play_obj = wave_obj.play()
        play_obj.wait_done()

def process_lip_sync(idx, audio_path, player):
    """
    Generate a lip-sync video for a given audio and add it to the player's queue.
    """
    output_video_path = os.path.join(output_folder, f"output_video_{idx}.mp4")
    output_combined_path = os.path.join(output_folder, f"output_video_{idx}_combined.mp4")


    pads = [0, 10, 0, 0]
    # Run Wav2Lip to generate the lip-sync video
    command = [
        'python', os.path.join(wav2lip_repo_path, 'inference.py'),
        '--checkpoint_path', model_checkpoint_path,
        '--face', avatar_image_path,
        '--audio', audio_path,
        '--outfile', output_video_path,
        '--pads', *map(str, pads)
    ]
    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
        print(f"Lip-sync video generated for sentence {idx}!")

        # Combine the video and audio using ffmpeg
        command_ffmpeg = [
            'ffmpeg', '-i', output_video_path, '-i', audio_path,
            '-c:v', 'libx264', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0', '-y', output_combined_path
        ]
        subprocess.run(command_ffmpeg, capture_output=True, text=True, check=True)
        print(f"Video and audio combined for sentence {idx}!")

        # Add the combined video to the player
        player.add_video(output_combined_path, audio_path)
    except subprocess.CalledProcessError as e:
        print(f"Error generating video for sentence {idx}: {e}")
        print("STDOUT:", e.stdout)
        print("STDERR:", e.stderr)

# Initialize the VideoPlayer with the idle video
idle_video_path = r"C:\Users\imagi\Desktop\videos(avatar)\edited_idle_video.mp4"
player = VideoPlayer(idle_video=idle_video_path)

# Start the video player in a separate thread
player_thread = threading.Thread(target=player.play_videos)
player_thread.start()

# Get all audio files in the audio folder
audio_files = [f for f in os.listdir(audio_folder) if f.endswith('.mp3')]

# Process each audio file in sequence
for idx, audio_file in enumerate(audio_files, start=1):
    audio_path = os.path.join(audio_folder, audio_file)
    if os.path.exists(audio_path):
        process_lip_sync(idx, audio_path, player)

# Wait for the player thread to finish
player_thread.join()
