# **Research**



## **Input parsing:**



### **Step 1: Audio Transcription**

1.   Installing the `faster_whisper` Library
2.   Importing Necessary Libraries
3.   Determining the Computing Device
4.   Selecting the Model Size and Compute Type
5.   Loading the Whisper Model
6.   Transcribing the Audio File

In [None]:
pip install faster_whisper

In [None]:
import torch
from faster_whisper import WhisperModel


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model_size = "small"

if device == "cpu":
    compute_type = "int8"
else:
    compute_type = "float16"


model = WhisperModel(model_size, device=device, compute_type=compute_type)

audio_file = './america.wav'

segments, info = model.transcribe(audio_file, beam_size=1)

transcription = ""
for segment in segments:
    transcription += segment.text


### Documents Parsing

Install Required Packages

In [None]:
!sudo apt-get update
!sudo apt-get install -y tesseract-ocr
!sudo apt-get install -y tesseract-ocr-heb
# Install Python packages
!pip install pytesseract
!pip install langdetect
!pip install Pillow
!pip install pandas
!pip install PyPDF2
!pip install python-pptx
!pip install python-docx
!pip install pdfminer.six

# Verify installed languages
!tesseract --list-langs

Import LIB

In [None]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import os
import glob
from google.colab import drive
import re
from langdetect import detect_langs, DetectorFactory
from pptx import Presentation
from docx import Document
from pdfminer.high_level import extract_text

taking all the languages that we need to work in, lang_map

In [None]:
# Ensure consistent language detection
DetectorFactory.seed = 0

# Extended language map (detect to Tesseract)
lang_map = {
    'en': 'eng',    # English
    'es': 'spa',    # Spanish
    'fr': 'fra',    # French
    'de': 'deu',    # German
    'it': 'ita',    # Italian
    'pt': 'por',    # Portuguese
    'ru': 'rus',    # Russian
    'zh-cn': 'chi_sim',  # Simplified Chinese
    'zh-tw': 'chi_tra',  # Traditional Chinese
    'ja': 'jpn',    # Japanese
    'ko': 'kor',    # Korean
    'ar': 'ara',    # Arabic
    'he': 'heb',    # Hebrew
    'fa': 'fas',    # Persian (Farsi)
    'hi': 'hin',    # Hindi
    'th': 'tha',    # Thai
    'vi': 'vie',    # Vietnamese
    'nl': 'nld',    # Dutch
    'tr': 'tur',    # Turkish
    'pl': 'pol',    # Polish
    'uk': 'ukr',    # Ukrainian
    'ro': 'ron',    # Romanian
    'bg': 'bul',    # Bulgarian
    'el': 'ell',    # Greek
    'ur': 'urd',    # Urdu
    # Add more languages as needed
}



Extract txt from txt, PDF, docs, pptx files

In [None]:
def read_txt_file(file_path, encoding='utf-8'):
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            lines = [line.rstrip('\n') for line in file]
        return lines
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except IOError as e:
        print(f"An I/O error occurred: {e}")
    return []

def read_pdf_with_pdfminer(file_path):
    try:
        text = extract_text(file_path)
        lines = text.splitlines()
        return lines
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return []

def read_pptx_file(file_path):
    try:
        prs = Presentation(file_path)
        text_runs = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    text_runs.append(shape.text)
        return text_runs
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return []

def read_docx_file(file_path):
    try:
        doc = Document(file_path)
        text = []
        for para in doc.paragraphs:
            text.append(para.text)
        return text
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return []

Extract txt from image

In [None]:
def preprocess_image(image_path):
    """
    Preprocesses an image to improve OCR accuracy.

    Steps:
    - Convert to grayscale
    - Apply median filter for noise reduction
    - Enhance contrast
    - Binarize the image using thresholding
    """
    try:
        image = Image.open(image_path).convert('L')  # Convert to grayscale
        image = image.filter(ImageFilter.MedianFilter())  # Reduce noise
        enhancer = ImageEnhance.Contrast(image)
        image = enhancer.enhance(2)  # Enhance contrast
        image = image.point(lambda x: 0 if x < 140 else 255, '1')  # Binarization
        return image
    except Exception as e:
        print(f"Error preprocessing {image_path}: {e}")
        return None


def extract_text_multiple_languages(image, languages=['eng', 'heb']):
    """
    Extracts text from an image using Tesseract OCR with multiple languages.

    Args:
        image (PIL.Image): Preprocessed image.
        languages (list): List of language codes to use for OCR.

    Returns:
        str: Extracted text.
    """
    try:
        # Join language codes with '+' for Tesseract
        lang_param = '+'.join(languages)
        text = pytesseract.image_to_string(image, lang=lang_param)
        return text.strip()
    except Exception as e:
        print(f"Error during OCR with multiple languages: {e}")
        return ""



def further_clean_text(text):
    """
    Further cleans the extracted text by removing unwanted characters.

    Args:
        text (str): Extracted text.

    Returns:
        str: Cleaned text.
    """
    # Remove non-printable characters
    text = ''.join(filter(lambda x: x.isprintable(), text))
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text




# Supported image formats
image_extensions = ['*.png', '*.jpg', '*.jpeg', '*.tiff', '*.bmp']

# Gather all image file paths
image_paths = ['/content/Screenshot 2024-09-16 at 11.59.44.png']


print(f"Found {len(image_paths)} images.")




Define what the file type is to extraxt the txt from it

In [None]:
# Initialize lists to store results
extracted_text = []
detected_file_types = []
file_names = []

# Define desired languages for OCR
desired_languages = ['eng', 'heb', 'spa']  # Add more as needed

def extract_text_from_file(file_path, use_pdfminer=False):
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()

    if file_extension == '.txt':
        text = read_txt_file(file_path)
        return text, "txt"

    elif file_extension == '.pdf':
        if use_pdfminer:
            text = read_pdf_with_pdfminer(file_path)
        else:
            # If you have another PDF reader, integrate it here
            text = read_pdf_with_pdfminer(file_path)
        return text, "pdf"

    elif file_extension == '.pptx':
        text = read_pptx_file(file_path)
        return text, "pptx"

    elif file_extension == '.docx':
        text = read_docx_file(file_path)
        return text, "docx"

    elif file_extension in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
        # Process image with OCR
        preprocessed_img = preprocess_image(file_path)
        if preprocessed_img is None:
            return "", "preprocessing_failed"
        text = extract_text_multiple_languages(preprocessed_img, languages=desired_languages)
        return text, "image"

    else:
        print(f"Unsupported file type: {file_extension}")
        return "", "unsupported"


The main part taking the file path save it on uploaded list and extract all the files from there

In [None]:
# Your list of files to process
uploaded = ['/content/Screenshot 2024-09-16 at 11.59.44.png','/content/תשובות.docx','/content/Full Stack Project.pdf']

# Process all uploaded files
for idx, filename in enumerate(uploaded):
    print(f"Processing File {idx + 1}/{len(uploaded)}: {filename}")
    try:
        text, file_type = extract_text_from_file(filename, use_pdfminer=True)
        if text:
            # For text files, text may be a list of lines; for images, text is a string
            if isinstance(text, list):
                cleaned_text = "\n".join([further_clean_text(line) for line in text])
            else:
                cleaned_text = further_clean_text(text)
            extracted_text.append(cleaned_text)
            detected_file_types.append(file_type)
            file_names.append(os.path.basename(filename))

            # Print a preview of the extracted text
            preview = cleaned_text[:100].replace('\n', ' ') + ('...' if len(cleaned_text) > 100 else '')
            print(cleaned_text)

            print(f"Extracted Text Preview: {preview}")
        else:
            print("No text extracted.")
            extracted_text.append("")
            detected_file_types.append("no_text_extracted")
            file_names.append(os.path.basename(filename))
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        extracted_text.append("")
        detected_file_types.append("error")
        file_names.append(os.path.basename(filename))
    print("-" * 50)


## **Retrieval-Augmented Generation (RAG)**

This script is designed to identify and retrieve the most relevant documents from a predefined corpus based on a user-provided query. It leverages the power of Sentence Transformers to convert text into meaningful vector embeddings and uses cosine similarity from scikit-learn to measure the semantic similarity between the query and the documents. Additionally, the script includes multilingual support by incorporating documents in Hebrew.

### **Step 1: Text Embeddings**

1.   Installing Required Libraries

In [None]:
pip install sentence_transformers scikit-learn transformers torch



2.    Importing Necessary Libraries

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

  from tqdm.autonotebook import tqdm, trange


3.   Loading the Sentence Transformer Model

In [None]:
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


4. Defining the Corpus of data

In [None]:
data =[ """
    Artificial Intelligence (AI) has transformed the healthcare industry by offering new ways to diagnose,
    treat, and manage diseases. AI algorithms, particularly deep learning, are being used to analyze medical
    images, predict disease outbreaks, and personalize treatment plans. The integration of AI in healthcare has
    reduced human error, improved accuracy, and increased the efficiency of medical professionals. AI-powered
    robots are assisting surgeons in complex procedures, while predictive analytics is helping doctors make
    more informed decisions. Despite these advances, there are challenges such as data privacy and the need
    for comprehensive validation of AI models before widespread adoption.
    """,
    """
    Cloud computing has revolutionized the way businesses operate, offering flexible and scalable infrastructure
    that can adjust to their needs. By moving to the cloud, companies no longer need to invest heavily in
    on-premise hardware. Instead, they can access powerful computing resources over the internet, enabling them
    to focus on innovation and growth. Businesses use cloud services for data storage, application hosting, and
    collaboration, benefiting from reduced costs, enhanced security, and improved accessibility. However, concerns
    about data breaches and vendor lock-in persist, as companies need to carefully select cloud providers to
    ensure long-term sustainability.
    """,
    """
    Neural networks, a fundamental building block of artificial intelligence, have evolved significantly since their
    inception. Initially inspired by the human brain, neural networks are designed to mimic the way neurons in
    the brain process information. Over the years, advances in deep learning, a subset of neural networks, have made
    it possible for AI systems to achieve unprecedented levels of accuracy in tasks like image recognition, natural
    language processing, and autonomous driving. Neural networks are composed of layers of interconnected nodes,
    where each node represents a neuron. The training of neural networks involves adjusting weights based on input
    data, allowing the model to learn patterns and make predictions. Despite their success, training large neural
    networks requires significant computational power and data.
    """,
    """
    As the world faces the growing threat of climate change, sustainable energy has become a major focus of global
    efforts. Renewable energy sources such as solar, wind, and hydropower are being developed to reduce dependence
    on fossil fuels. Clean technologies are playing a critical role in achieving sustainability goals, with innovations
    in energy storage, electric vehicles, and smart grids leading the way. Governments and private companies alike are
    investing heavily in research and development to create more efficient and cost-effective solutions. While the
    transition to sustainable energy presents challenges, including the initial cost of infrastructure and the need for
    reliable energy storage, it also offers immense benefits in terms of reducing greenhouse gas emissions and creating
    new economic opportunities.
    """,
     """
    מאמר 1: השפעת הבינה המלאכותית על תהליכים משפטיים
בעשור האחרון, הבינה המלאכותית (AI) הפכה להיות כלי מרכזי בתהליכים משפטיים, בין אם במתן ייעוץ משפטי אוטומטי ובין אם בניתוח מסמכים משפטיים. אחד היתרונות המרכזיים של AI הוא היכולת לנתח כמויות גדולות של מידע במהירות וביעילות. מערכות בינה מלאכותית מסוגלות לעבד חומרים משפטיים מורכבים, כגון חוזים, תיקים משפטיים, ופסיקות עבר, ולספק תחזיות על בסיס נתונים היסטוריים. שימוש ב-AI מבטיח יעילות משופרת, חיסכון בזמן והפחתת טעויות אנוש בתהליכי קבלת החלטות. עם זאת, השימוש הגובר ב-AI מעלה שאלות אתיות ומשפטיות רבות. לדוגמה, נושאים של אחריות משפטית: האם ניתן לסמוך על מערכת אוטומטית בקבלת החלטות משפטיות? שאלות אלו מתחדדות בעיקר בתחום הפלילי, כאשר טכנולוגיות אלו משמשות ככלי לסיוע בקביעת עונשים או החלטות שיפוטיות אחרות. אחת מהשאלות המרכזיות הנוגעות ל-AI בתחום המשפט היא האם ניתן להחליט על סוגיות משפטיות בהתבסס על ניתוח של אלגוריתם בלבד, או שיש צורך באישוש ההחלטות על ידי בני אדם. כמו כן, בעידן של רגולציה טכנולוגית מוגברת, חשוב לשאול האם המערכות הללו עומדות בקריטריונים של שקיפות והוגנות, והאם ישנם מנגנונים שימנעו מהן לבצע הטיות סמויות בהחלטות שהן מקבלות. יתר על כן, השימוש ב-AI בחוזים מסחריים הפך להיות שכיח ביותר, כאשר מערכות אוטומטיות יכולות לסייע בניתוח וכתיבה של חוזים. עם זאת, יש צורך בהתאמת החקיקה כדי להבטיח שהחוזים שנוצרים על ידי מערכות אלו יעמדו בדרישות החוק. בתי המשפט נדרשים להתמודד עם פסיקות הנוגעות לחוזים שנוצרו או נחתמו באמצעות AI, וחקיקה שתסדיר נושא זה עדיין מתהווה.
      """,
      """
    מאמר 2: משפט בינלאומי וזכויות אדם בעידן הגלובליזציה
    המשפט הבינלאומי מתמודד כיום עם אתגרים חדשים הנובעים מהתהליך המואץ של גלובליזציה והשפעתה על זכויות אדם. בעידן של תנועת אנשים, הון, וסחורות בצורה חסרת תקדים, יש צורך בהסדרים משפטיים בינלאומיים המבטיחים הגנה על זכויות האדם בכל המדינות. אמנות בינלאומיות, כגון האמנה הבינלאומית לזכויות האדם, נועדו להבטיח שכל אדם יקבל את ההגנות הנדרשות גם כאשר הוא חוצה גבולות. עם זאת, במקרים רבים נתקלים במצבים בהם יש פער בין הרטוריקה הבינלאומית לזכויות האדם לבין היישום המעשי שלהן. לדוגמה, נושא של פליטים ומהגרים הוא נושא קריטי במשפט הבינלאומי. בעוד שהאמנה לזכויות האדם מבטיחה הגנה לכל אדם, המציאות היא שמדינות רבות מוצאות דרכים לעקוף את ההתחייבויות הבינלאומיות שלהן בנוגע לפליטים, בשל שיקולים כלכליים ופוליטיים. סוגיה נוספת שעולה היא השימוש בטכנולוגיות חדשות בתחום הביטחון והמעקב, כמו מערכות זיהוי פנים ואלגוריתמים למעקב אחר תנועות אנשים. בעוד שהשימוש בטכנולוגיות אלו נועד להגביר את הביטחון, הוא מעלה שאלות אתיות ומשפטיות בנוגע לפגיעה בפרטיות וזכויות אדם. יתר על כן, גם בתחום הכלכלה הגלובלית, נדרשת התייחסות משפטית מיוחדת לנושאים של צדק חברתי ושוויון. תאגידים בינלאומיים, שפועלים במספר מדינות, מתמודדים עם בעיות משפטיות מורכבות הקשורות לזכויות עובדים, הגנה על הסביבה, ומניעת תחרות לא הוגנת. המשפט הבינלאומי נדרש להסדיר תחומים אלו ולהבטיח שהגלובליזציה לא תפגע בזכויות האדם הבסיסיות.
    """,
      """
    מאמר 3: הגנת הפרטיות בעידן הדיגיטלי
    הגנת הפרטיות הפכה לאחד הנושאים המשפטיים החשובים ביותר בעידן הדיגיטלי. הטכנולוגיות החדשות, ובמיוחד השימוש באינטרנט ובמדיה החברתית, יצרו מצבים חדשים בהם מידע אישי זורם בצורה מהירה וחסרת גבולות, ולעיתים רבות ללא ידיעת המשתמשים. חברות ענק כמו פייסבוק, גוגל ואמזון אוספות כמויות עצומות של מידע על המשתמשים שלהן, החל מהרגלי גלישה ועד להעדפות צרכניות אישיות. המידע שנאסף מנוצל לעיתים לצרכים מסחריים, כגון פרסום ממוקד, אך הוא גם עשוי להגיע לידי צדדים שלישיים או ממשלות, מה שמעלה את החשש לפגיעה בפרטיות האישית. בעולם בו המידע האישי הפך לנכס כלכלי חשוב, המשפט נדרש לספק כלים חדשים שיבטיחו את ההגנה על פרטיות המשתמשים. אחת הדוגמאות המרכזיות היא החקיקה האירופית בנושא הגנת הפרטיות – ה-GDPR (General Data Protection Regulation). חקיקה זו מעניקה לאזרחים באיחוד האירופי זכויות נרחבות בנוגע לאופן בו המידע האישי שלהם נאסף ומנוהל, כולל הזכות למחוק מידע ("הזכות להישכח") והזכות לקבל מידע על מי משתמש במידע האישי שלהם. חוקים דומים נחקקו במדינות נוספות, אולם במקומות רבים בעולם החקיקה בנושא פרטיות עדיין נמצאת בפיגור לעומת התפתחות הטכנולוגיה. נוסף על כך, קיימת שאלה משפטית מעניינת הנוגעת לאחריותן של החברות שמחזיקות במידע. כאשר מתרחשות פריצות אבטחה או דליפות מידע, נדרשת אחריות משפטית מצד החברות, אולם לעיתים קרובות הפיצוי למשתמשים אינו מספק. חקיקה נוספת נדרשת כדי להבטיח שהזכויות הדיגיטליות של המשתמשים יישמרו.
    """,
      """
    מאמר 4: חוזים חכמים וטכנולוגיית הבלוקצ'יין
    טכנולוגיית הבלוקצ'יין יצרה מהפכה בשוק הכלכלי והמשפטי, בעיקר באמצעות החוזים החכמים. בלוקצ'יין הוא פרוטוקול מבוזר המאפשר רישום ואימות של עסקאות בצורה מאובטחת וללא צורך בגורם מתווך. החוזים החכמים, הפועלים על גבי רשתות הבלוקצ'יין, הם תוכנות המתבצעות באופן אוטומטי כאשר מתקיימים תנאים מוסכמים בין הצדדים. כלומר, מדובר בחוזים דיגיטליים שיכולים לאכוף את עצמם. לדוגמה, עסקה בין שני צדדים תתבצע אוטומטית כאשר יתקיימו כל התנאים שנקבעו בחוזה החכם, ללא צורך בהתערבות מצד שלישי כמו עורך דין או נוטריון. החוזים החכמים מהווים שינוי תפיסתי עמוק בעולם המשפט, שכן הם מייתרים את הצורך בתהליכי אכיפה מסורתיים. עם זאת, השימוש בחוזים חכמים מעלה שאלות משפטיות רבות. אחת מהשאלות המרכזיות היא מה קורה במצב של טעות או הונאה בחוזה חכם. מאחר והחוזה מתבצע באופן אוטומטי, לא תמיד יש אפשרות לבטל או לשנות את התנאים. כמו כן, קיימת השאלה האם חוזים חכמים עומדים בדרישות החוק הקיימות בכל הנוגע לכשירות חוזית. תחום נוסף הדורש בחינה משפטית הוא האבטחה של רשתות הבלוקצ'יין עצמן. למרות שהבלוקצ'יין נחשב לטכנולוגיה מאובטחת, היו מקרים בהם התרחשה פריצה לרשתות אלו, מה שהוביל לאובדן כספים ונתונים. נושא האבטחה ימשיך להיות נושא מרכזי בעידן הבלוקצ'יין, וככל שיותר גופים מאמצים טכנולוגיה זו, יידרש פיקוח רגולטורי מתאים. בנוסף, יש לבחון את סוגיות המסים הנוגעות לעסקאות המתבצעות באמצעות בלוקצ'יין, שכן חוזים חכמים מציבים אתגרים חדשים במיסוי ובהגדרה המשפטית של הרווחים הנובעים מהם.
    """

]

In [None]:
import json

with open('database.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

In [None]:

data = [item['data'] for item in json_data]

### **Step 2: Vector Database**

1. Generating Embeddings for All Texts

In [63]:
sbert_embeddings = sbert_model.encode(data, convert_to_numpy=True)


[[ 0.00826032  0.08355446  0.01919199 ...  0.01981263 -0.01772506
  -0.01057865]
 [ 0.00197358  0.14719911  0.0248766  ...  0.03754839  0.0019132
  -0.05493992]
 [ 0.01169664  0.05381443 -0.00044901 ...  0.00543916 -0.07542367
  -0.04113217]]


2. Defining the Similarity Matching Function

In [None]:
def find_best_match(query, texts, sbert_model, sbert_embeddings, top_k=2):
    """
    Finds the best matching text(s) for the given query based on cosine similarity.

    Parameters:
    - query (str): The user query string.
    - texts (List[str]): List of texts to compare against.
    - sbert_model (SentenceTransformer): The Sentence Transformers model.
    - sbert_embeddings (np.ndarray): Precomputed embeddings of the texts.
    - top_k (int): Number of top matches to return.

    Returns:
    - List of tuples containing (text, similarity_score)
    """
    query_embedding = sbert_model.encode([query], convert_to_numpy=True)

    similarities = cosine_similarity(query_embedding, sbert_embeddings)[0]

    top_indices = similarities.argsort()[-top_k:][::-1]

    best_matches = [(texts[idx], similarities[idx]) for idx in top_indices]

    return best_matches

3. testing the model with query

In [None]:
query = "מה ליאור סילברמן אוהב לאכול?"

# Find the best match
best_match = find_best_match(query, data, sbert_model, sbert_embeddings, top_k=5)
best_match_text = best_match[0][0]

print(best_match_text)

# Display the result
print("Best Match:")
for text, score in best_match:
    print(f"Text: {text}\nSimilarity Score: {score:.4f}\n")

ליאור סילברמן מאד אוהב לאכול שניצלים
Best Match:
Text: ליאור סילברמן מאד אוהב לאכול שניצלים
Similarity Score: 0.7990

Text: ליאור סילברמן אוהב מאד פיצה
Similarity Score: 0.7983

Text:  עבור יוסי האוכל בבית מאד טעים
Similarity Score: 0.5549



## Try gpt2 LLM


In [None]:
# Load GPT-2 model and tokenizer
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

# # Function to generate a response
# def generate_response(text, gpt2_model, gpt2_tokenizer):
#     input_ids = gpt2_tokenizer.encode(text, return_tensors='pt')
#     output = gpt2_model.generate(input_ids, max_length=150, num_return_sequences=1)
#     generated_text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
#     return generated_text



In [None]:
def generate_streaming_response(text, gpt2_model, gpt2_tokenizer, max_new_tokens=None):
    # Tokenize the input text with truncation and padding
    inputs = gpt2_tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=1024,
        padding=True
    )

    # Initialize the input_ids with the tokenized input
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # List to store the generated tokens
    generated_tokens = []

    # Generate tokens in a loop
    for _ in range(max_new_tokens or 1024):  # Set an arbitrary large limit
        # Generate the next token
        outputs = gpt2_model(input_ids=input_ids, attention_mask=attention_mask)
        next_token_logits = outputs.logits[:, -1, :]
        next_token_id = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1)

        # Append the new token to the list of generated tokens
        generated_tokens.append(next_token_id.item())

        # Break the loop if EOS token is generated
        if next_token_id.item() == gpt2_tokenizer.eos_token_id:
            break

        # Concatenate the new token to the input_ids
        input_ids = torch.cat([input_ids, next_token_id], dim=-1)
        attention_mask = torch.cat(
            [attention_mask, torch.ones_like(next_token_id)], dim=-1
        )

        # Decode and print the new token
        generated_text = gpt2_tokenizer.decode(next_token_id[0], skip_special_tokens=True)
        print(generated_text, end='', flush=True)

    print()  # Newline at the end

    # Decode the full generated text
    full_generated_text = gpt2_tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return full_generated_text

# Example usage
generated_response = generate_streaming_response(best_match_text, gpt2_model, gpt2_tokenizer)
print("\nFull Generated Response:\n", generated_response)



 ������ ��ל ������ ��ל ������ ��ל ������ ��ל ������ ��ל ������ ��ל ������ ��

KeyboardInterrupt: 

## Try GenAi

In [None]:
pip install -q -U google-generativeai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/165.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m163.8/165.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m165.0/165.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/725.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m725.4/725.4 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import google.generativeai as genai
from google.colab import userdata

In [None]:
genai.configure(api_key=userdata.get('API_KEY'))

# Construct the prompt
prompt = f"""
Generate a comprehensive response based on the following:

**Query:** {query}

**Most Similar SBERT Answer Embedding:** {best_match}

**Response Format:** Informative and concise.
"""

# Generate the response
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content(prompt)

# response = genai.GenerativeModel("gemini-1.5-pro-001").generateContent(
#     prompt=prompt
# )

print(response.text)

ליאור סילברמן כנראה מאוד אוהב שניצלים ופיצה. 



### load BERT and Tokenizer - option 1

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model to evaluation mode
model.eval()




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

extract embeddings from BERT using the [CLS] token, which will serve as the sentence embedding.

In [None]:
# Function to get the [CLS] embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding=True)

    # Get the embeddings from the BERT model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding (first token)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding


compute the similarity between different queries using cosine similarity to show how BERT handles semantic similarity.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Get embeddings for similar and different queries
query1 = "Explain the role of backpropagation in deep learning."
query2 = "How do convolutional neural networks work?"
query3 = "What is the process to register for courses at the university?"

embedding1 = get_cls_embedding(query1)
embedding2 = get_cls_embedding(query2)
embedding3 = get_cls_embedding(query3)

# Compute cosine similarities
similarity_1_2 = cosine_similarity(embedding1, embedding2)
similarity_1_3 = cosine_similarity(embedding1, embedding3)

print(f"Similarity between query 1 and query 2: {similarity_1_2[0][0]}")
print(f"Similarity between query 1 and query 3: {similarity_1_3[0][0]}")


Similarity between query 1 and query 2: 0.8579674363136292
Similarity between query 1 and query 3: 0.8106188774108887


text retrevial using BERT we embedd the documents with a cls token which respresents the sentence and we then do a similarity check between the query .
since the query is about backpropagtion document 1 and 3 should have a higher similarity than doc 2 which is about vpb

In [None]:
# Example documents
doc1 = "Neural networks are computing systems inspired by the biological neural networks."
doc2 = "To connect to the university VPN, you need to configure your VPN client."
doc3 = "Backpropagation is a fundamental algorithm in training deep learning models."

# Get embeddings for documents
doc_embedding1 = get_cls_embedding(doc1)
doc_embedding2 = get_cls_embedding(doc2)
doc_embedding3 = get_cls_embedding(doc3)

# Compare query to documents
query = "How does backpropagation work in neural networks?"

query_embedding = get_cls_embedding(query)

# Compute cosine similarities between query and documents
similarity_doc1 = cosine_similarity(query_embedding, doc_embedding1)
similarity_doc2 = cosine_similarity(query_embedding, doc_embedding2)
similarity_doc3 = cosine_similarity(query_embedding, doc_embedding3)

# Show results
print(f"Similarity with Document 1: {similarity_doc1[0][0]}")
print(f"Similarity with Document 2: {similarity_doc2[0][0]}")
print(f"Similarity with Document 3: {similarity_doc3[0][0]}")


Similarity with Document 1: 0.7179515361785889
Similarity with Document 2: 0.5887855887413025
Similarity with Document 3: 0.6379861831665039


In [None]:
# Example documents
doc1 = "Artificial Intelligence (AI) has transformed the healthcare industry by offering new ways to diagnose, treat, and manage diseases. AI algorithms, particularly deep learning, are being used to analyze medical images, predict disease outbreaks, and personalize treatment plans. The integration of AI in healthcare has reduced human error, improved accuracy, and increased the efficiency of medical professionals. AI-powered robots are assisting surgeons in complex procedures, while predictive analytics is helping doctors make more informed decisions. Despite these advances, there are challenges such as data privacy and the need for comprehensive validation of AI models before widespread adoption."
doc2 = "Cloud computing has revolutionized the way businesses operate, offering flexible and scalable infrastructure that can adjust to their needs. By moving to the cloud, companies no longer need to invest heavily in on-premise hardware. Instead, they can access powerful computing resources over the internet, enabling them to focus on innovation and growth. Businesses use cloud services for data storage, application hosting, and collaboration, benefiting from reduced costs, enhanced security, and improved accessibility. However, concerns about data breaches and vendor lock-in persist, as companies need to carefully select cloud providers to ensure long-term sustainability."
doc3 = "Neural networks, a fundamental building block of artificial intelligence, have evolved significantly since their inception. Initially inspired by the human brain, neural networks are designed to mimic the way neurons in the brain process information. Over the years, advances in deep learning, a subset of neural networks, have made it possible for AI systems to achieve unprecedented levels of accuracy in tasks like image recognition, natural language processing, and autonomous driving. Neural networks are composed of layers of interconnected nodes, where each node represents a neuron. The training of neural networks involves adjusting weights based on input data, allowing the model to learn patterns and make predictions. Despite their success, training large neural networks requires significant computational power and data."

# Get embeddings for documents
doc_embedding1 = get_cls_embedding(doc1)
doc_embedding2 = get_cls_embedding(doc2)
doc_embedding3 = get_cls_embedding(doc3)

# Compare query to documents
query = "How do neural networks function in artificial intelligence, and what are the challenges of training them?"

query_embedding = get_cls_embedding(query)

# Compute cosine similarities between query and documents
similarity_doc1 = cosine_similarity(query_embedding, doc_embedding1)
similarity_doc2 = cosine_similarity(query_embedding, doc_embedding2)
similarity_doc3 = cosine_similarity(query_embedding, doc_embedding3)

# Show results
print(f"Similarity with Document 1: {similarity_doc1[0][0]}")
print(f"Similarity with Document 2: {similarity_doc2[0][0]}")
print(f"Similarity with Document 3: {similarity_doc3[0][0]}")


as you can see the results are as expected

In [None]:
# Import necessary libraries
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model to evaluation mode
model.eval()

# Function to get the [CLS] embedding for a given sentence
def get_cls_embedding(sentence):
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding=True)

    # Get the embeddings from the BERT model
    with torch.no_grad():
        outputs = model(**inputs)

    # Extract the [CLS] token embedding (first token)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
    return cls_embedding

# Example documents (longer text)
doc1 = """
Artificial Intelligence (AI) has transformed the healthcare industry by offering new ways to diagnose,
treat, and manage diseases. AI algorithms, particularly deep learning, are being used to analyze medical
images, predict disease outbreaks, and personalize treatment plans. The integration of AI in healthcare has
reduced human error, improved accuracy, and increased the efficiency of medical professionals. AI-powered
robots are assisting surgeons in complex procedures, while predictive analytics is helping doctors make
more informed decisions. Despite these advances, there are challenges such as data privacy and the need
for comprehensive validation of AI models before widespread adoption.
"""

doc2 = """
Cloud computing has revolutionized the way businesses operate, offering flexible and scalable infrastructure
that can adjust to their needs. By moving to the cloud, companies no longer need to invest heavily in
on-premise hardware. Instead, they can access powerful computing resources over the internet, enabling them
to focus on innovation and growth. Businesses use cloud services for data storage, application hosting, and
collaboration, benefiting from reduced costs, enhanced security, and improved accessibility. However, concerns
about data breaches and vendor lock-in persist, as companies need to carefully select cloud providers to
ensure long-term sustainability.
"""

doc3 = """
Neural networks, a fundamental building block of artificial intelligence, have evolved significantly since their
inception. Initially inspired by the human brain, neural networks are designed to mimic the way neurons in
the brain process information. Over the years, advances in deep learning, a subset of neural networks, have made
it possible for AI systems to achieve unprecedented levels of accuracy in tasks like image recognition, natural
language processing, and autonomous driving. Neural networks are composed of layers of interconnected nodes,
where each node represents a neuron. The training of neural networks involves adjusting weights based on input
data, allowing the model to learn patterns and make predictions. Despite their success, training large neural
networks requires significant computational power and data.
"""

doc4 = """
As the world faces the growing threat of climate change, sustainable energy has become a major focus of global
efforts. Renewable energy sources such as solar, wind, and hydropower are being developed to reduce dependence
on fossil fuels. Clean technologies are playing a critical role in achieving sustainability goals, with innovations
in energy storage, electric vehicles, and smart grids leading the way. Governments and private companies alike are
investing heavily in research and development to create more efficient and cost-effective solutions. While the
transition to sustainable energy presents challenges, including the initial cost of infrastructure and the need for
reliable energy storage, it also offers immense benefits in terms of reducing greenhouse gas emissions and creating
new economic opportunities.
"""

# Query to compare with the documents
query = "How do neural networks function in artificial intelligence, and what are the challenges of training them?"

# Get embeddings for documents and query
doc_embedding1 = get_cls_embedding(doc1)
doc_embedding2 = get_cls_embedding(doc2)
doc_embedding3 = get_cls_embedding(doc3)
doc_embedding4 = get_cls_embedding(doc4)
query_embedding = get_cls_embedding(query)

# Compute cosine similarities between the query and each document
similarity_doc1 = cosine_similarity(query_embedding, doc_embedding1)
similarity_doc2 = cosine_similarity(query_embedding, doc_embedding2)
similarity_doc3 = cosine_similarity(query_embedding, doc_embedding3)
similarity_doc4 = cosine_similarity(query_embedding, doc_embedding4)

# Show similarity results
print(f"Similarity with Document 1 (AI in Healthcare): {similarity_doc1[0][0]}")
print(f"Similarity with Document 2 (Cloud Computing): {similarity_doc2[0][0]}")
print(f"Similarity with Document 3 (Neural Networks in AI): {similarity_doc3[0][0]}")
print(f"Similarity with Document 4 (Sustainable Energy): {similarity_doc4[0][0]}")




Similarity with Document 1 (AI in Healthcare): 0.7179515361785889
Similarity with Document 2 (Cloud Computing): 0.5887855887413025
Similarity with Document 3 (Neural Networks in AI): 0.6379861831665039
Similarity with Document 4 (Sustainable Energy): 0.6569727659225464


the issue with BERT was he computed based on the word and which had the most occurences

# Step 3: Inserting Data

In this step, we worked on inserting transcriptions into a **JSON file** called `data.json` with the following format:

```json
[
  {"data": "Transcription 1..."},
  {"data": "Transcription 2..."},
  {"data": "Transcription 3..."}
]

 ### Initialize Whisper Model
* We used the `faster_whisper` model to transcribe audio files into text. The transcriptions are done on a chosen device (`CPU` or `CUDA`)    depending on availability.

 **Transcribing Audio**:
   We created a function `transcribe_audio()` that takes an audio file, processes it with Whisper, and returns the transcribed text.

   
**Handling the `data.json` File**:

   * We first checked if the `data.json` file exists and is not empty. If the file is empty or does not exist, we initialized it with an empty list (`[]`).

   * If the file exists, we loaded its content using `json.load()`.  We ensured that the content is a list; otherwise, we reinitialized it as an empty list.




In [2]:
import torch
import json
from faster_whisper import WhisperModel
import os

# Initialize the device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Model size and compute type based on device
model_size = "small"
compute_type = "int8" if device == "cpu" else "float16"

# Load the Whisper model
model = WhisperModel(model_size, device=device, compute_type=compute_type)

# Transcribe a new audio file
def transcribe_audio(audio_file):
    segments, info = model.transcribe(audio_file, beam_size=1)

    transcription = ""
    for segment in segments:
        transcription += segment.text

    return transcription

# Path to the existing JSON file
json_file_path = 'data.json'

# Check if the JSON file exists and has content
if os.path.exists(json_file_path) and os.path.getsize(json_file_path) > 0:
    # Load existing data from the JSON file
    with open(json_file_path, 'r') as json_file:
        try:
            json_data = json.load(json_file)
            # Ensure json_data is a list, if not initialize it
            if not isinstance(json_data, list):
                json_data = []
        except json.JSONDecodeError:
            # If the JSON file is corrupted, initialize it as an empty list
            json_data = []
else:
    # If the file doesn't exist or is empty, create a new list
    json_data = []

# Transcribe an audio file and append the transcription
audio_file = 'test1.mp3'
transcription = transcribe_audio(audio_file)

# Append the new transcription as a dictionary with the key "data"
json_data.append({"data": transcription})

# Write the updated data back to the JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(json_data, json_file, indent=4, ensure_ascii=False)

# Print success message
print(f"Transcription added to {json_file_path}!")


Using device: cpu
Transcription added to data.json!
