In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available
from sentence_transformers import SentenceTransformer, util
from transformers import BartForConditionalGeneration, BartTokenizer
import random
import pandas as pd

In [None]:
# All retrieval tensors in this notebook live on this device.
device = "cuda"

# Load the text chunks together with their pre-computed embeddings.
text_chunks_and_embedding = pd.read_csv("embeddings_CNN.csv")

# The CSV round-trip turned every embedding into a bracketed, space-separated
# string; parse each one back into a NumPy array.
text_chunks_and_embedding["embedding"] = text_chunks_and_embedding["embedding"].apply(
    lambda serialized: np.fromstring(serialized.strip("[]"), sep=" ")
)

# One dict per row, so a chunk can be looked up by its positional index.
pages_and_chunks = text_chunks_and_embedding.to_dict("records")

# Stack the per-row arrays into one matrix and create it directly on the
# target device as float32 (the NumPy arrays parsed above are float64).
embeddings = torch.tensor(
    np.array(text_chunks_and_embedding["embedding"].to_list()),
    dtype=torch.float32,
    device=device,
)
embeddings.shape

In [None]:
# Preview the first rows of the chunk/embedding table.
text_chunks_and_embedding.head()

In [None]:
# Look at one example entry of the 'sentence_chunk' column (index 3).
text_chunks_and_embedding['sentence_chunk'][3]

In [None]:
# Assuming you've already imported the model and embeddings are on the GPU
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cuda")


def vector_search(query, embeddings, top_k=5):
    """Return the entries of ``embeddings`` most similar to ``query``.

    The query is encoded with the module-level ``embedding_model`` and compared
    to every row of ``embeddings`` by cosine similarity.

    Args:
        query: Text to search for.
        embeddings: Tensor of stored embeddings, one per row
            (assumed shape ``(n_items, dim)`` — TODO confirm against caller).
        top_k: Maximum number of results. Values larger than the number of
            stored embeddings are clamped instead of raising ``IndexError``.

    Returns:
        A list of ``(index, similarity)`` tuples (plain ``int`` / ``float``),
        ordered from most to least similar. At most ``top_k`` entries.
    """
    query_embedding = embedding_model.encode([query], device="cuda")

    # Put the query on the same device/dtype as the stored embeddings so the
    # comparison works wherever the caller keeps them.
    query_embedding_tensor = torch.as_tensor(query_embedding).to(
        device=embeddings.device, dtype=embeddings.dtype
    )
    similarities = torch.nn.functional.cosine_similarity(query_embedding_tensor, embeddings)

    # Never ask for more results than there are embeddings (or fewer than zero).
    k = max(0, min(top_k, similarities.shape[0]))
    top_k_scores, top_k_indices = torch.topk(similarities, k=k)

    # Return the top results with their indices and similarity scores
    return list(zip(top_k_indices.tolist(), top_k_scores.tolist()))

In [None]:
# NOTE(review): this cell repeats the model load and `vector_search` definition
# of the cell directly above it; one of the two cells can be deleted.
# Assuming you've already imported the model and embeddings are on the GPU
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device="cuda")


def vector_search(query, embeddings, top_k=5):
    """Find the stored embeddings closest to ``query`` by cosine similarity.

    Args:
        query: Text to search for; encoded with the module-level
            ``embedding_model``.
        embeddings: Tensor holding one stored embedding per row
            (assumed ``(n_items, dim)`` — TODO confirm against caller).
        top_k: Upper bound on the number of results. It is clamped to the
            number of stored embeddings, so an oversized value no longer
            raises ``IndexError``.

    Returns:
        List of ``(index, similarity)`` tuples sorted by descending similarity.
    """
    query_embedding = embedding_model.encode([query], device="cuda")

    # Follow the device and dtype of `embeddings` rather than assuming "cuda".
    query_embedding_tensor = torch.as_tensor(query_embedding).to(
        device=embeddings.device, dtype=embeddings.dtype
    )
    similarities = torch.nn.functional.cosine_similarity(query_embedding_tensor, embeddings)

    # Clamp the requested count to what is actually available.
    k = max(0, min(top_k, similarities.shape[0]))
    best_scores, best_indices = torch.topk(similarities, k=k)

    # Pair every index with its score as plain Python numbers.
    return list(zip(best_indices.tolist(), best_scores.tolist()))

In [None]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    """Print ``text`` re-flowed so that no line is longer than ``wrap_length``.

    Args:
        text: The string to print.
        wrap_length: Maximum line width in characters (default 80).
    """
    print(textwrap.fill(text, width=wrap_length))

In [None]:
def retrieve_fn(query: str,
                embeddings: torch.Tensor,
                model: SentenceTransformer = embedding_model,
                n_resources_to_return: int = 5):
    """Embed ``query`` and return its best dot-product matches in ``embeddings``.

    Args:
        query: Text to search for.
        embeddings: Tensor of stored embeddings to score against
            (assumed shape ``(n_chunks, dim)`` — TODO confirm). This argument
            is required; the previous signature accidentally used the
            ``torch.tensor`` function as its default value.
        model: SentenceTransformer used to encode the query. Defaults to the
            ``embedding_model`` that exists when this function is defined.
        n_resources_to_return: Maximum number of matches. Clamped to the
            number of stored embeddings so ``torch.topk`` cannot raise for an
            oversized request.

    Returns:
        ``(scores, indices)`` as returned by ``torch.topk``: the highest
        dot-product scores and the positions of the matching rows.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)

    # The model and the stored embeddings may live on different devices;
    # move the (small) query vector to wherever `embeddings` is.
    query_embedding = query_embedding.to(embeddings.device)

    dot_scores = util.dot_score(query_embedding, embeddings)[0]

    k = max(0, min(n_resources_to_return, dot_scores.shape[0]))
    scores, indices = torch.topk(input=dot_scores, k=k)
    return scores, indices
def print_top_score(query: str,
                    embeddings: torch.Tensor,
                    pages_and_chunks: list[dict] = pages_and_chunks,
                    n_resources_to_return: int = 1) -> str:
    """Print the query and return the text of its best-matching chunk.

    Args:
        query: Text to search for.
        embeddings: Stored embeddings passed through to ``retrieve_fn``.
        pages_and_chunks: Records indexed by the positions ``retrieve_fn``
            returns; each must have a ``"sentence_chunk"`` key. Defaults to
            the ``pages_and_chunks`` list that exists at definition time.
        n_resources_to_return: Number of matches requested from
            ``retrieve_fn``. Only the first (best) one is used here.

    Returns:
        The ``"sentence_chunk"`` text of the highest-scoring record.
    """
    # Scores are not needed here; only the ranking is.
    _, indices = retrieve_fn(query=query,
                             embeddings=embeddings,
                             n_resources_to_return=n_resources_to_return)

    # Look up the record behind the best-ranked index.
    best_index = int(indices[0])
    top_chunk = pages_and_chunks[best_index]["sentence_chunk"]

    # Only the query is echoed; the chunk itself is returned to the caller
    # (in a notebook it shows up as the cell output).
    print(f"Query: '{query}'\n")
    return top_chunk

In [None]:
# Run a sample query through the retriever and show the raw (scores, indices).
query_1 = "what happened in Capital hall"
scores, indices = retrieve_fn(query=query_1, embeddings=embeddings)
scores, indices

In [None]:
# Echo the sample query and display the text of its best-matching chunk.
print_top_score(query=query_1, embeddings=embeddings)

In [None]:
# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
# Get GPU available memory
import torch
# Total memory of GPU 0, rounded to whole GiB.
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

# Pick a model and precision from the memory size. Every branch assigns both
# `use_quantization_config` and `model_id`, and the chain ends in `else`, so the
# prints below (and later cells) can never hit an undefined name.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
    # Best-effort fallback: the smallest model, quantized.
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb <= 19.0:
    # `<=` so that exactly 19 GB (previously matched by no branch) takes the
    # smaller, safer model.
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False
    model_id = "google/gemma-2b-it"
else:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

In [None]:
from transformers import BitsAndBytesConfig
# 4-bit quantization settings; only handed to the model below when
# `use_quantization_config` is True.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Use Flash Attention 2 when it is installed and GPU 0 reports a compute
# capability major version of at least 8; otherwise use "sdpa".
attn_implementation = (
    "flash_attention_2"
    if is_flash_attn_2_available() and torch.cuda.get_device_capability(0)[0] >= 8
    else "sdpa"
)
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")

# Tokenizer and language model for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
Gamma_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    torch_dtype=torch.float16,
    quantization_config=quantization_config if use_quantization_config else None,
    low_cpu_mem_usage=False,
    attn_implementation=attn_implementation,
)

# Only the non-quantized model is moved to the GPU explicitly.
if not use_quantization_config:
    Gamma_model.to("cuda")

In [None]:
# Show the loaded model as the cell output.
Gamma_model

In [None]:
def prompt_formatter(query: str, context_items: list[dict]) -> str:
    """
    Build the few-shot, Gen-Z-styled prompt sent to the language model.

    Args:
        query: The user's question; inserted after the final "User:" marker.
        context_items: Retrieved records; the "sentence_chunk" value of each
            is listed as a "- " bullet under "Context:".

    Returns:
        The complete prompt string, ending with the "Gamma 2B:" cue.
    """
    # One bullet per retrieved chunk (the first "- " comes from the template).
    chunk_texts = (entry["sentence_chunk"] for entry in context_items)
    context = "\n- ".join(chunk_texts)

    # Instructions, retrieved context, three tone examples, then the query.
    return f"""Based on the following context items, answer the query in a tone that resonates with Gen Z.
Keep it fresh, conversational, and culturally relevant. Use memes, humor, or slang where it fits, but stay accurate.

Context:
- {context}
    
Examples for the tone:
Example 1:
User: What’s going on with the latest climate change report?
Gamma 2B: Okay, so here’s the deal: the new climate report is throwing major shade at humanity. TL;DR—things are heating up, literally. Greenhouse gases? Still through the roof. Polar ice caps? Melting faster than ice cream in July. The report’s basically yelling, “Do something!” So, yeah, less single-use plastics and more renewable energy, please. 🌍  

Example 2:
User: What’s the latest update on the election results?
Gamma 2B: Alright, election updates are in, and it’s a rollercoaster. The votes are being counted like it’s a TikTok trend—slow but steady. Key states are swinging, but it’s still anyone’s game. Keep your snacks close and your notifications on, ‘cause it’s about to get spicy. 🗳️  

Example 3:
User: Why is everyone talking about the new tech layoffs?
Gamma 2B: Oof, big yikes for the tech world right now. Companies like Meta and Google have been downsizing—like, thousands of jobs gone in a blink. They’re blaming “economic uncertainty,” but let’s be real, it’s probs also about squeezing more profit. If you’re in tech, it’s giving *stay-ready-so-you-don’t-have-to-get-ready* vibes.  

Now, answer the query below:
User: {query}
Gamma 2B:"""

In [None]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """
    Answer ``query`` with retrieval-augmented generation.

    Retrieves the best-matching chunks with ``retrieve_fn``, builds a prompt
    with ``prompt_formatter`` and samples a completion from ``Gamma_model``.

    Args:
        query: The user's question.
        temperature: Sampling temperature passed to ``generate``.
        max_new_tokens: Maximum number of tokens to generate.
        format_answer_text: If True, return only the newly generated text
            (prompt removed, surrounding whitespace stripped). If False,
            return the decoded prompt followed by the completion.
        return_answer_only: If True, return just the answer string; otherwise
            return ``(answer, context_items)``.

    Returns:
        ``str`` or ``(str, list[dict])``. Each context item is a copy of the
        retrieved record with an added ``"score"`` entry (a CPU tensor).
    """
    # Retrieve context items
    scores, indices = retrieve_fn(query=query, embeddings=embeddings)

    # Work on shallow copies so the shared `pages_and_chunks` records are not
    # modified (they would otherwise keep stale scores from earlier queries).
    context_items = []
    for score, index in zip(scores.cpu(), indices.tolist()):
        item = dict(pages_and_chunks[index])
        item["score"] = score
        context_items.append(item)

    # Format the prompt using Gamma 2B requirements
    prompt = prompt_formatter(query=query, context_items=context_items)

    # Tokenize and place the inputs on whatever device the model is on.
    model_inputs = tokenizer(prompt, return_tensors="pt").to(Gamma_model.device)

    # Generate output
    outputs = Gamma_model.generate(
        **model_inputs,
        temperature=temperature,
        do_sample=True,
        max_new_tokens=max_new_tokens
    )

    if format_answer_text:
        # The returned sequence starts with the prompt tokens. Dropping them by
        # position is more reliable than string-replacing the prompt, which
        # silently fails if decoding does not reproduce the prompt exactly.
        prompt_length = model_inputs["input_ids"].shape[1]
        output_text = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()
    else:
        output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Return only the answer or both the answer and context
    if return_answer_only:
        return output_text
    return output_text, context_items

In [None]:
# Sample question for the full retrieve-and-generate pipeline.
query = "What is Nigeria's Capital?"
print(f"Query: {query}")

# Ask for the context items as well, so they can be listed under the answer.
answer, context_items = ask(query=query,
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

# Show the generated answer, then every retrieved chunk with its score.
print(f"Answer:\n{answer}")
for item in context_items:
    print(f"- {item['sentence_chunk']} (Score: {item['score']:.2f})")

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import os
from scipy.io.wavfile import write

# Initialize the TTS models and processor globally
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to("cuda")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to("cuda")

# Speaker voice for SpeechT5. `main_loop` reads the global `speaker_embedding`,
# which no other cell defines, so it is loaded here. This follows the
# SpeechT5 model-card example: one x-vector from the CMU ARCTIC x-vector
# dataset, with a leading batch dimension, on the same device as the TTS model.
# NOTE(review): index 7306 is the voice used in that example — change the index
# to pick a different speaker.
speaker_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = torch.tensor(speaker_embeddings_dataset[7306]["xvector"]).unsqueeze(0).to("cuda")

In [None]:
def split_text(text, max_length=300):
    """
    Split ``text`` into chunks of at most ``max_length`` characters without
    cutting sentences.

    Sentences are detected at '.', '!' or '?' followed by one or more spaces
    and are packed greedily into chunks, joined by single spaces.

    Args:
        text: The text to split.
        max_length: Target maximum chunk length in characters. A single
            sentence longer than this is kept whole as its own chunk.

    Returns:
        A list of non-empty, stripped chunks. Empty or whitespace-only input
        yields an empty list.
    """
    import re
    sentences = re.split(r'(?<=[.!?]) +', text)  # Split text by sentence-ending punctuation
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence + " "
            continue

        # The sentence does not fit: flush what has been collected so far.
        # When nothing has been collected yet (the very first sentence is
        # already too long) there is nothing to flush — previously this
        # produced a spurious empty chunk.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = sentence + " "

    # Flush the remainder, ignoring a whitespace-only tail.
    remainder = current_chunk.strip()
    if remainder:
        chunks.append(remainder)

    return chunks

In [None]:
def text_to_speech(text, speaker_embedding, output_path):
    """
    Convert text to speech with Microsoft SpeechT5 and save it as a .wav file.

    Long text is split into sentence-aligned chunks with ``split_text``; each
    chunk is synthesized separately and the resulting waveforms are
    concatenated back to back (no cross-fading or other smoothing is applied).

    Args:
        text: The text to speak.
        speaker_embedding: Speaker embedding tensor passed to
            ``tts_model.generate_speech`` as ``speaker_embeddings``.
        output_path: Destination path of the .wav file.

    Returns:
        None. If ``text`` contains nothing to synthesize, a message is printed
        and no file is written.
    """
    max_chunk_length = 300  # Limit for chunking text
    sample_rate = 16000  # 16kHz sample rate used when writing the file

    # Drop empty chunks so the model is never asked to speak an empty string.
    chunks = [
        chunk
        for chunk in split_text(text, max_length=max_chunk_length)
        if chunk.strip()
    ]
    if not chunks:
        # np.concatenate would raise on an empty list; bail out cleanly.
        print("No text to synthesize; skipping audio generation.")
        return

    # Run everything on the device the TTS model is on.
    tts_device = tts_model.device
    speaker_embedding = speaker_embedding.to(tts_device)

    audio_chunks = []
    print(f"Splitting text into {len(chunks)} chunks for processing...")

    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i + 1}/{len(chunks)}: {chunk}")

        # Preprocess the input chunk
        inputs = processor(text=chunk, return_tensors="pt").to(tts_device)

        # Generate the speech audio (inference only, no gradients needed)
        with torch.no_grad():
            speech = tts_model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings=speaker_embedding,
                vocoder=vocoder
            )

        # Convert the speech tensor to a numpy array
        audio_chunks.append(speech.cpu().numpy())

    # Concatenate all audio chunks
    final_audio = np.concatenate(audio_chunks)

    # Save the concatenated audio as a .wav file
    write(output_path, sample_rate, final_audio)
    print(f"Final audio saved to {output_path}")


In [None]:
def generate_response(query):
    """
    Ask the LLM for an answer, print it with its best context, and return both.

    Args:
        query: The user's question, forwarded to ``ask``.

    Returns:
        ``(answer, top_context)`` where ``top_context`` is the first context
        item returned by ``ask``, or a placeholder dict when there is none.
    """
    print("Generating response from Gamma 2B...")
    response_text, retrieved_items = ask(query=query, return_answer_only=False)

    # Use the first retrieved item; fall back to a placeholder without a score.
    if retrieved_items:
        best_context = retrieved_items[0]
    else:
        best_context = {"sentence_chunk": "No context available."}

    # Report the answer followed by the single best context and its score.
    print("\nAnswer:")
    print(response_text)
    print("\nTop-1 Context:")
    print(f"- {best_context['sentence_chunk']} (Score: {best_context.get('score', 0):.2f})")

    return response_text, best_context


In [None]:
def main_loop(output_folder="/home/shegun93/TTS"):
    """
    Interactive loop: read a query, answer it with the LLM, and speak the answer.

    Only the model's answer is passed to TTS; the retrieved context is ignored.
    Each answer is saved to its own numbered file
    (``gamma_response_1.wav``, ``gamma_response_2.wav``, ...) so that earlier
    responses are not overwritten.

    Args:
        output_folder: Directory for the generated .wav files; created if
            missing. The default keeps the original location.

    Relies on the module-level ``speaker_embedding`` tensor, which must be
    defined before this function is called.
    """
    os.makedirs(output_folder, exist_ok=True)
    response_count = 0

    while True:
        # Ask the user for a query; treat end-of-input or an interrupt like 'exit'.
        try:
            query_text = input("Enter your query (or type 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            print("\nExiting the program. Goodbye!")
            break

        query_text = query_text.strip()
        if query_text.lower() == 'exit':
            print("Exiting the program. Goodbye!")
            break
        if not query_text:
            # Nothing was typed; do not send an empty query to the model.
            continue

        # Generate the response from Gamma 2B
        answer, _ = generate_response(query=query_text)  # Ignore the context

        # Give every response its own numbered audio file.
        response_count += 1
        audio_filename = os.path.join(output_folder, f"gamma_response_{response_count}.wav")

        # Convert the answer to speech and save the audio
        print("\nSaving the Gamma model's output as speech:")
        text_to_speech(answer, speaker_embedding, audio_filename)
# Start the interactive query/TTS loop when this code runs as the main module.
if __name__ == "__main__":
    main_loop()