load BERT and Tokenizer

# Video Processing: Object Detection and Audio Transcription

In this notebook, we will process several video clips to:

- Perform object detection using different models.
- Transcribe audio using different transcription tools.
- Measure performance metrics such as processing speed, detection accuracy, and transcription quality.
- Compare the performance of different tools.

**Tools Used:**

- **Object Detection Models**:
  - YOLOv8 (Ultralytics)
  - YOLOv5 (Ultralytics) 
  - BLIP

Install necessary packages for document parsing, YOLO object detection, BLIP and data analysis

In [1]:
# Install Ultralytics for YOLOv8 and YOLOv5
!pip install ultralytics

# Install OpenCV for video processing
!pip install opencv-python

# Install pandas for data manipulation
!pip install pandas

# Install matplotlib and seaborn for visualization
!pip install matplotlib seaborn

# Install PyTorch and related libraries for deep learning
!pip install torch torchvision torchaudio

# Install Hugging Face Transformers for NLP and computer vision models
!pip install transformers

# Install Hugging Face Datasets for data handling
!pip install datasets

# Install tqdm for progress bars
!pip install tqdm

^C
^C
^C


Import the libraries and model classes needed for object detection, video processing, and data analysis

In [2]:
import os
import time
import warnings

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from PIL import Image
from tqdm import tqdm
from transformers import (
    BlipForConditionalGeneration,
    BlipForQuestionAnswering,
    BlipProcessor,
)
from ultralytics import YOLO
warnings.filterwarnings('ignore')

Video files to analyze

In [3]:
# List of video files
video_files = [
    'SourcesTests/fruit-and-vegetable-detection.mp4',  # High resolution, good lighting
    'SourcesTests/traffic-mini.mp4',  # Medium resolution, low lighting
]

Function to extract frames from the video

In [4]:
def extract_frames(video_path, frame_interval=30):
    """Sample every ``frame_interval``-th frame of a video.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        frame_interval: Keep frames whose 0-based index is a multiple of this
            value (the first frame is always kept). Must be >= 1.

    Returns:
        list: The sampled frames exactly as returned by ``cap.read()``.
        The list is empty when no frame could be read.

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError in the loop.
    if frame_interval < 1:
        raise ValueError(f"frame_interval must be >= 1, got {frame_interval!r}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while True:
            success, frame = cap.read()
            if not success:
                break
            if count % frame_interval == 0:
                frames.append(frame)
            count += 1
    finally:
        # Release the capture even if reading raises part-way through.
        cap.release()
    return frames

# Example usage
video_path = 'SourcesTests/fruit-and-vegetable-detection.mp4'  # Update with your video path
frames = extract_frames(video_path)

Load the BLIP processor and model for Video Question Answering

In [5]:
# "Salesforce/blip-vqa-base" is a visual-question-answering checkpoint, so it is loaded
# with the question-answering head. The captioning class (BlipForConditionalGeneration)
# is the one meant for the image-captioning checkpoints.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Set the model to evaluation mode
model.eval()

# Function to perform Video Question Answering using BLIP
def blip_video_qa(frames, question):
    """Ask BLIP the same question about every frame.

    Uses the notebook-level ``processor`` and ``model`` globals. Each frame is
    answered independently of the others.

    Args:
        frames: Iterable of frames; each is converted from BGR to RGB before
            being handed to the processor.
        question: Question text passed to the processor with every frame.

    Returns:
        list: ``(index, answer)`` tuples, where ``index`` is the position of the
        frame inside ``frames``.
    """

    def answer_for(frame):
        # Convert the frame to an RGB PIL image and run one generate/decode pass.
        rgb_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        encoded = processor(rgb_image, question, return_tensors="pt")
        generated = model.generate(**encoded)
        return processor.decode(generated[0], skip_special_tokens=True)

    return [(index, answer_for(frame)) for index, frame in enumerate(frames)]

# Define your question
question = "When in the video does it show the apple for the first time?"

# Perform Video QA using BLIP
blip_answers = blip_video_qa(frames, question)

KeyboardInterrupt: 

 Measure processing time for BLIP

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
blip_answers = blip_video_qa(frames, question)
blip_processing_time = time.perf_counter() - start_time

print(f"BLIP Processing Time: {blip_processing_time:.2f} seconds")

Function for Object Detection in Videos Using YOLOv8

In [None]:
import contextlib
import io
def yolo_v8_detection(video_path, user_object, frame_interval=5):
    """Run YOLOv8 on every ``frame_interval``-th frame and record when an object appears.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        user_object: Class name (as found in ``model.names``) to look for.
        frame_interval: Only frames whose 1-based counter is a multiple of this
            value are passed to the model. Must be >= 1.

    Returns:
        dict with:
            'timestamps': sorted unique times (frame counter / FPS, in seconds) of
                the processed frames in which ``user_object`` was detected.
            'processing_time': mean inference time per processed frame in seconds
                (NaN when no frame was processed).
            'frames_processed': number of frames passed to the model.

    Raises:
        ValueError: If ``frame_interval`` < 1, or if the object is detected but the
            video reports a non-positive FPS so no timestamp can be computed.
    """
    # Validate before the (expensive) model load.
    if frame_interval < 1:
        raise ValueError(f"frame_interval must be >= 1, got {frame_interval!r}")

    model = YOLO('yolov8s.pt')  # Load the YOLOv8 model
    class_names = model.names
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    timestamps = []
    processing_times = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        with contextlib.redirect_stdout(io.StringIO()):
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                frame_count += 1

                if frame_count % frame_interval != 0:
                    continue

                # verbose=False silences the per-frame inference log, which
                # redirecting stdout alone does not reliably suppress.
                start_time = time.perf_counter()
                results = model(frame, verbose=False)
                processing_times.append(time.perf_counter() - start_time)

                # One hit per frame is enough; further boxes add nothing.
                detected = any(
                    class_names[int(box.cls[0])] == user_object
                    for result in results
                    for box in result.boxes
                )
                if detected:
                    if fps <= 0:
                        raise ValueError(
                            f"Cannot compute timestamps: {video_path!r} reports FPS={fps}"
                        )
                    timestamps.append(frame_count / fps)
    finally:
        # Always free the capture, even if inference raises.
        cap.release()

    # Avoid np.mean([]) (NaN plus a RuntimeWarning) when nothing was processed.
    avg_processing_time = np.mean(processing_times) if processing_times else float('nan')

    return {
        'timestamps': sorted(set(timestamps)),
        'processing_time': avg_processing_time,
        'frames_processed': len(processing_times)
    }

Function for Object Detection in Videos Using YOLOv5

In [None]:
import contextlib
import io
def yolo_v5_detection(video_path, user_object, frame_interval=5):
    """Run YOLOv5 on every ``frame_interval``-th frame and record when an object appears.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        user_object: Class name (as found in ``model.names``) to look for.
        frame_interval: Only frames whose 1-based counter is a multiple of this
            value are passed to the model. Must be >= 1.

    Returns:
        dict with:
            'timestamps': sorted unique times (frame counter / FPS, in seconds) of
                the processed frames in which ``user_object`` was detected.
            'processing_time': mean inference time per processed frame in seconds
                (NaN when no frame was processed).
            'frames_processed': number of frames passed to the model.

    Raises:
        ValueError: If ``frame_interval`` < 1, or if the object is detected but the
            video reports a non-positive FPS so no timestamp can be computed.
    """
    # Validate before the (expensive) model load.
    if frame_interval < 1:
        raise ValueError(f"frame_interval must be >= 1, got {frame_interval!r}")

    # force_reload is left at its default so the cached hub repo is reused instead
    # of being downloaded again on every call.
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    class_names = model.names
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    timestamps = []
    processing_times = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        with contextlib.redirect_stdout(io.StringIO()):
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                frame_count += 1

                if frame_count % frame_interval != 0:
                    continue

                # OpenCV decodes frames as BGR, while the YOLOv5 hub model expects
                # RGB for array inputs; convert before (and outside) the timed call.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                start_time = time.perf_counter()
                results = model(rgb_frame)
                processing_times.append(time.perf_counter() - start_time)

                # Parse detection results: the last column of xyxy holds the class id.
                labels = results.xyxy[0][:, -1].cpu().numpy()
                if any(class_names[int(label)] == user_object for label in labels):
                    if fps <= 0:
                        raise ValueError(
                            f"Cannot compute timestamps: {video_path!r} reports FPS={fps}"
                        )
                    timestamps.append(frame_count / fps)
    finally:
        # Always free the capture, even if inference raises.
        cap.release()

    # Avoid np.mean([]) (NaN plus a RuntimeWarning) when nothing was processed.
    avg_processing_time = np.mean(processing_times) if processing_times else float('nan')

    return {
        'timestamps': sorted(set(timestamps)),
        'processing_time': avg_processing_time,
        'frames_processed': len(processing_times)
    }

Script for running object detection and video question answering with YOLOv8, YOLOv5, and BLIP
Data analysis table

In [None]:
# Object each tool should look for, keyed by video path
video_user_objects = {
    'SourcesTests/fruit-and-vegetable-detection.mp4': 'apple',
    'SourcesTests/traffic-mini.mp4': 'truck',
    # Add more mappings if needed
}
# List to store results
results_list = []

for video in video_files:
    print(f"Processing {video}...")

    user_object = video_user_objects.get(video, None)
    if user_object is None:
        print(f"No user_object specified for {video}. Skipping this video.")
        continue

    # YOLOv8 Detection ('processing_time' is the mean time per processed frame)
    yolo_v8_results = yolo_v8_detection(video, user_object)
    results_list.append({
        'Video': video,
        'Tool': 'YOLOv8',
        'Processing Time (s)': yolo_v8_results['processing_time'],
        'Frames Processed': yolo_v8_results['frames_processed']
    })

    # YOLOv5 Detection
    yolo_v5_results = yolo_v5_detection(video, user_object)
    results_list.append({
        'Video': video,
        'Tool': 'YOLOv5',
        'Processing Time (s)': yolo_v5_results['processing_time'],
        'Frames Processed': yolo_v5_results['frames_processed']
    })

    # BLIP Detection (using Video QA)
    # Extract frames for THIS video, outside the timed region, so the timing covers
    # inference only and the frame count matches what was actually processed.
    blip_frames = extract_frames(video)
    # Ask about this video's target object rather than always asking about the apple.
    blip_question = f"When in the video does it show the {user_object} for the first time?"
    start_time = time.time()
    blip_answers = blip_video_qa(blip_frames, blip_question)
    blip_processing_time = time.time() - start_time
    # The YOLO functions report a per-frame mean, so report BLIP per frame as well.
    blip_avg_time = blip_processing_time / len(blip_frames) if blip_frames else float('nan')

    results_list.append({
        'Video': video,
        'Tool': 'BLIP',
        'Processing Time (s)': blip_avg_time,
        'Frames Processed': len(blip_frames)
    })

# Build the summary once, after every video has been processed
results_df = pd.DataFrame(results_list)
pd.set_option('display.max_columns', None)  # Ensure all columns are displayed
pd.set_option('display.expand_frame_repr', False)  # Avoid line breaks in the table display
pd.set_option('display.colheader_justify', 'center')  # Center-align column headers

if results_df.empty:
    # Nothing to tabulate or plot (no videos, or none had a target object)
    print("\nNo results to summarize.")
else:
    # Print the results DataFrame
    print("\nSummary of Processing Results:")
    print(results_df.to_string(index=False))  # Display the DataFrame in a readable format

    # Plot Processing Time Comparison for YOLOv8, YOLOv5, and BLIP
    plt.figure(figsize=(12, 6))
    sns.barplot(data=results_df, x='Tool', y='Processing Time (s)', hue='Video')
    plt.title('Processing Time Comparison for YOLOv8, YOLOv5, and BLIP')
    plt.xlabel('Detection Tool')
    plt.ylabel('Average Processing Time per Frame (seconds)')
    plt.legend(title='Video File', loc='upper right')
    plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
    plt.tight_layout()  # Adjust layout for better fit
    plt.show()

SBERT vs. BERT

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer.
# NOTE: `model` is also the name the BLIP cell binds and that blip_video_qa() reads as a
# global. Running this cell rebinds `model` to BERT, so the BLIP model must be reloaded
# before blip_video_qa() is called again.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model to evaluation mode
model.eval()


Extract embeddings from BERT using the [CLS] token, which will serve as the sentence embedding.

In [None]:
# Function to get the [CLS] embedding for a given sentence
def get_cls_embedding(sentence):
    """Return the final-layer [CLS] vector BERT produces for ``sentence``.

    Uses the notebook-level ``tokenizer`` and ``model`` globals. The input is
    truncated to at most 512 tokens.

    Returns:
        The slice ``last_hidden_state[:, 0, :]`` of the model output, i.e. the
        hidden state at the first token position, with shape
        (batch_size, hidden_size).
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 holds the [CLS] token.
    return hidden_states[:, 0, :]


Compute the similarity between different queries using cosine similarity to show how BERT handles semantic similarity.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Two related deep-learning queries and one unrelated administrative query
query1 = "Explain the role of backpropagation in deep learning."
query2 = "How do convolutional neural networks work?"
query3 = "What is the process to register for courses at the university?"

# Embed all three queries with the same [CLS]-based helper
embedding1, embedding2, embedding3 = (
    get_cls_embedding(text) for text in (query1, query2, query3)
)

# Cosine similarity of query 1 against each of the other two
similarity_1_2 = cosine_similarity(embedding1, embedding2)
similarity_1_3 = cosine_similarity(embedding1, embedding3)

for other_index, matrix in ((2, similarity_1_2), (3, similarity_1_3)):
    print(f"Similarity between query 1 and query {other_index}: {matrix[0][0]}")


Text retrieval using BERT: we embed each document using its [CLS] token, which represents the sentence, and then compute the similarity between the query and each document.
Since the query is about backpropagation, documents 1 and 3 should have a higher similarity than document 2, which is about VPN.

In [None]:
# Example documents
doc1 = "Neural networks are computing systems inspired by the biological neural networks."
doc2 = "To connect to the university VPN, you need to configure your VPN client."
doc3 = "Backpropagation is a fundamental algorithm in training deep learning models."

# Embed the three documents
doc_embedding1, doc_embedding2, doc_embedding3 = (
    get_cls_embedding(document) for document in (doc1, doc2, doc3)
)

# Embed the query that the documents are ranked against
query = "How does backpropagation work in neural networks?"
query_embedding = get_cls_embedding(query)

# Cosine similarity between the query and each document
similarity_doc1 = cosine_similarity(query_embedding, doc_embedding1)
similarity_doc2 = cosine_similarity(query_embedding, doc_embedding2)
similarity_doc3 = cosine_similarity(query_embedding, doc_embedding3)

# Show results
for doc_number, matrix in enumerate((similarity_doc1, similarity_doc2, similarity_doc3), start=1):
    print(f"Similarity with Document {doc_number}: {matrix[0][0]}")


In [6]:
# Example documents
doc1 = "Artificial Intelligence (AI) has transformed the healthcare industry by offering new ways to diagnose, treat, and manage diseases. AI algorithms, particularly deep learning, are being used to analyze medical images, predict disease outbreaks, and personalize treatment plans. The integration of AI in healthcare has reduced human error, improved accuracy, and increased the efficiency of medical professionals. AI-powered robots are assisting surgeons in complex procedures, while predictive analytics is helping doctors make more informed decisions. Despite these advances, there are challenges such as data privacy and the need for comprehensive validation of AI models before widespread adoption."
doc2 = "Cloud computing has revolutionized the way businesses operate, offering flexible and scalable infrastructure that can adjust to their needs. By moving to the cloud, companies no longer need to invest heavily in on-premise hardware. Instead, they can access powerful computing resources over the internet, enabling them to focus on innovation and growth. Businesses use cloud services for data storage, application hosting, and collaboration, benefiting from reduced costs, enhanced security, and improved accessibility. However, concerns about data breaches and vendor lock-in persist, as companies need to carefully select cloud providers to ensure long-term sustainability."
doc3 = "Neural networks, a fundamental building block of artificial intelligence, have evolved significantly since their inception. Initially inspired by the human brain, neural networks are designed to mimic the way neurons in the brain process information. Over the years, advances in deep learning, a subset of neural networks, have made it possible for AI systems to achieve unprecedented levels of accuracy in tasks like image recognition, natural language processing, and autonomous driving. Neural networks are composed of layers of interconnected nodes, where each node represents a neuron. The training of neural networks involves adjusting weights based on input data, allowing the model to learn patterns and make predictions. Despite their success, training large neural networks requires significant computational power and data."

# Get embeddings for documents
doc_embedding1 = get_cls_embedding(doc1)
doc_embedding2 = get_cls_embedding(doc2)
doc_embedding3 = get_cls_embedding(doc3)

# Compare query to documents
query = "How do neural networks function in artificial intelligence, and what are the challenges of training them?"

query_embedding = get_cls_embedding(query)

# Compute cosine similarities between query and documents
similarity_doc1 = cosine_similarity(query_embedding, doc_embedding1)
similarity_doc2 = cosine_similarity(query_embedding, doc_embedding2)
similarity_doc3 = cosine_similarity(query_embedding, doc_embedding3)

# Show results
print(f"Similarity with Document 1: {similarity_doc1[0][0]}")
print(f"Similarity with Document 2: {similarity_doc2[0][0]}")
print(f"Similarity with Document 3: {similarity_doc3[0][0]}")


NameError: name 'get_cls_embedding' is not defined

As you can see, the results are as expected.

In [None]:
# Import necessary libraries
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model to evaluation mode
model.eval()

# Function to get the [CLS] embedding for a given sentence
def get_cls_embedding(sentence):
    """Return the final-layer [CLS] vector BERT produces for ``sentence``.

    Uses the ``tokenizer`` and ``model`` loaded in this cell. The input is
    truncated to at most 512 tokens.

    Returns:
        The slice ``last_hidden_state[:, 0, :]`` of the model output, i.e. the
        hidden state at the first token position, with shape
        (batch_size, hidden_size).
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 holds the [CLS] token.
    return hidden_states[:, 0, :]

# Example documents (longer text)
doc1 = """
Artificial Intelligence (AI) has transformed the healthcare industry by offering new ways to diagnose, 
treat, and manage diseases. AI algorithms, particularly deep learning, are being used to analyze medical 
images, predict disease outbreaks, and personalize treatment plans. The integration of AI in healthcare has 
reduced human error, improved accuracy, and increased the efficiency of medical professionals. AI-powered 
robots are assisting surgeons in complex procedures, while predictive analytics is helping doctors make 
more informed decisions. Despite these advances, there are challenges such as data privacy and the need 
for comprehensive validation of AI models before widespread adoption.
"""

doc2 = """
Cloud computing has revolutionized the way businesses operate, offering flexible and scalable infrastructure 
that can adjust to their needs. By moving to the cloud, companies no longer need to invest heavily in 
on-premise hardware. Instead, they can access powerful computing resources over the internet, enabling them 
to focus on innovation and growth. Businesses use cloud services for data storage, application hosting, and 
collaboration, benefiting from reduced costs, enhanced security, and improved accessibility. However, concerns 
about data breaches and vendor lock-in persist, as companies need to carefully select cloud providers to 
ensure long-term sustainability.
"""

doc3 = """
Neural networks, a fundamental building block of artificial intelligence, have evolved significantly since their 
inception. Initially inspired by the human brain, neural networks are designed to mimic the way neurons in 
the brain process information. Over the years, advances in deep learning, a subset of neural networks, have made 
it possible for AI systems to achieve unprecedented levels of accuracy in tasks like image recognition, natural 
language processing, and autonomous driving. Neural networks are composed of layers of interconnected nodes, 
where each node represents a neuron. The training of neural networks involves adjusting weights based on input 
data, allowing the model to learn patterns and make predictions. Despite their success, training large neural 
networks requires significant computational power and data.
"""

doc4 = """
As the world faces the growing threat of climate change, sustainable energy has become a major focus of global 
efforts. Renewable energy sources such as solar, wind, and hydropower are being developed to reduce dependence 
on fossil fuels. Clean technologies are playing a critical role in achieving sustainability goals, with innovations 
in energy storage, electric vehicles, and smart grids leading the way. Governments and private companies alike are 
investing heavily in research and development to create more efficient and cost-effective solutions. While the 
transition to sustainable energy presents challenges, including the initial cost of infrastructure and the need for 
reliable energy storage, it also offers immense benefits in terms of reducing greenhouse gas emissions and creating 
new economic opportunities.
"""

# Query to compare with the documents
query = "How do neural networks function in artificial intelligence, and what are the challenges of training them?"

# Embed the four documents, then the query
doc_embedding1, doc_embedding2, doc_embedding3, doc_embedding4 = (
    get_cls_embedding(document) for document in (doc1, doc2, doc3, doc4)
)
query_embedding = get_cls_embedding(query)

# Cosine similarity between the query and each document
similarity_doc1 = cosine_similarity(query_embedding, doc_embedding1)
similarity_doc2 = cosine_similarity(query_embedding, doc_embedding2)
similarity_doc3 = cosine_similarity(query_embedding, doc_embedding3)
similarity_doc4 = cosine_similarity(query_embedding, doc_embedding4)

# Show similarity results, one line per document
labelled_scores = (
    ("Document 1 (AI in Healthcare)", similarity_doc1),
    ("Document 2 (Cloud Computing)", similarity_doc2),
    ("Document 3 (Neural Networks in AI)", similarity_doc3),
    ("Document 4 (Sustainable Energy)", similarity_doc4),
)
for label, matrix in labelled_scores:
    print(f"Similarity with {label}: {matrix[0][0]}")


The issue with BERT was that its similarity scores appeared to be driven by individual words and by which of them occurred most often.

# Documents Parsing

Install Required Packages

In [None]:
!sudo apt-get update
!sudo apt-get install -y tesseract-ocr
!sudo apt-get install -y tesseract-ocr-heb
# Install Python packages
!pip install pytesseract
!pip install langdetect
!pip install Pillow
!pip install pandas
!pip install PyPDF2
!pip install python-pptx
!pip install python-docx
!pip install pdfminer.six

# Verify installed languages
!tesseract --list-langs

Import libraries

In [None]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import pandas as pd
import os
import glob
from google.colab import drive
import re
from langdetect import detect_langs, DetectorFactory
from pptx import Presentation
from docx import Document
from pdfminer.high_level import extract_text

Determine which language needs to be extracted from the image, using the extended `lang_map` (language-detection codes mapped to Tesseract codes)

In [None]:
# Ensure consistent language detection
DetectorFactory.seed = 0

# Extended language map (detect to Tesseract)
lang_map = {
    'en': 'eng',    # English
    'es': 'spa',    # Spanish
    'fr': 'fra',    # French
    'de': 'deu',    # German
    'it': 'ita',    # Italian
    'pt': 'por',    # Portuguese
    'ru': 'rus',    # Russian
    'zh-cn': 'chi_sim',  # Simplified Chinese
    'zh-tw': 'chi_tra',  # Traditional Chinese
    'ja': 'jpn',    # Japanese
    'ko': 'kor',    # Korean
    'ar': 'ara',    # Arabic
    'he': 'heb',    # Hebrew
    'fa': 'fas',    # Persian (Farsi)
    'hi': 'hin',    # Hindi
    'th': 'tha',    # Thai
    'vi': 'vie',    # Vietnamese
    'nl': 'nld',    # Dutch
    'tr': 'tur',    # Turkish
    'pl': 'pol',    # Polish
    'uk': 'ukr',    # Ukrainian
    'ro': 'ron',    # Romanian
    'bg': 'bul',    # Bulgarian
    'el': 'ell',    # Greek
    'ur': 'urd',    # Urdu
    # Add more languages as needed
}



Extract text from TXT, PDF, DOCX, and PPTX files

In [None]:
def read_txt_file(file_path, encoding='utf-8'):
    """Read a text file into a list of lines without trailing newlines.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        list[str]: One entry per line. An empty list is returned (after printing
        a message) when the file is missing, cannot be read, or cannot be decoded
        with ``encoding``.
    """
    try:
        with open(file_path, 'r', encoding=encoding) as file:
            return [line.rstrip('\n') for line in file]
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except UnicodeDecodeError as e:
        # Not an OSError, so it used to escape; report it like the other readers do.
        print(f"Could not decode {file_path} as {encoding}: {e}")
    except IOError as e:
        print(f"An I/O error occurred: {e}")
    return []

def read_pdf_with_pdfminer(file_path):
    """Extract a PDF's text with pdfminer and split it into lines.

    Returns:
        list[str]: The lines of the extracted text, or an empty list (after
        printing a message) if the file is missing or extraction fails.
    """
    pdf_lines = []
    try:
        pdf_lines = extract_text(file_path).splitlines()
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return pdf_lines

def read_pptx_file(file_path):
    """Collect the text of every shape that exposes a ``text`` attribute.

    Slides are visited in order, and within each slide shapes are visited in
    order.

    Returns:
        list[str]: One entry per text-bearing shape, or an empty list (after
        printing a message) if the file is missing or cannot be parsed.
    """
    shape_texts = []
    try:
        deck = Presentation(file_path)
        shape_texts = [
            shape.text
            for slide in deck.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        ]
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return shape_texts

def read_docx_file(file_path):
    """Return the text of each paragraph in a .docx document.

    Returns:
        list[str]: One entry per paragraph in document order, or an empty list
        (after printing a message) if the file is missing or cannot be parsed.
    """
    paragraph_texts = []
    try:
        document = Document(file_path)
        paragraph_texts = [paragraph.text for paragraph in document.paragraphs]
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return paragraph_texts

Extract text from an image

In [None]:
def preprocess_image(image_path):
    """
    Preprocesses an image to improve OCR accuracy.

    Steps:
    - Convert to grayscale
    - Apply median filter for noise reduction
    - Enhance contrast
    - Binarize the image using thresholding

    Args:
        image_path: Path of the image file to open.

    Returns:
        The binarized image (mode '1'), or None (after printing a message) if
        the image cannot be opened or processed.
    """
    try:
        # Open inside a context manager so the underlying file handle is closed
        # as soon as the pixels have been converted into a new in-memory image.
        with Image.open(image_path) as source:
            image = source.convert('L')  # Convert to grayscale
        image = image.filter(ImageFilter.MedianFilter())  # Reduce noise
        enhancer = ImageEnhance.Contrast(image)
        image = enhancer.enhance(2)  # Enhance contrast
        # Pixels darker than 140 become black, everything else white
        image = image.point(lambda x: 0 if x < 140 else 255, '1')  # Binarization
        return image
    except Exception as e:
        print(f"Error preprocessing {image_path}: {e}")
        return None


def extract_text_multiple_languages(image, languages=['eng', 'heb']):
    """
    Run Tesseract OCR on an image with several languages enabled at once.

    Args:
        image (PIL.Image): Preprocessed image.
        languages (list): Tesseract language codes; they are combined into a
            single 'a+b+c' language argument.

    Returns:
        str: The recognized text with surrounding whitespace stripped, or an
        empty string (after printing a message) if OCR fails.
    """
    try:
        # Tesseract takes multiple languages as one '+'-separated string
        tesseract_langs = '+'.join(languages)
        return pytesseract.image_to_string(image, lang=tesseract_langs).strip()
    except Exception as e:
        print(f"Error during OCR with multiple languages: {e}")
        return ""



def further_clean_text(text):
    """
    Further cleans the extracted text by removing unwanted characters.

    Whitespace (including newlines and tabs) is turned into single spaces
    BEFORE non-printable characters are dropped. Newlines and tabs are themselves
    non-printable, so stripping first would delete them and glue the neighbouring
    words together ("hello\\nworld" -> "helloworld").

    Args:
        text (str): Extracted text.

    Returns:
        str: Cleaned text: no non-printable characters, runs of whitespace
        collapsed to one space, no leading/trailing whitespace.
    """
    # Normalize every whitespace run to a single space first
    text = re.sub(r'\s+', ' ', text)
    # Remove remaining non-printable characters (control characters etc.)
    text = ''.join(ch for ch in text if ch.isprintable())
    # Removing a character that sat between two spaces can leave a double space
    return re.sub(r' {2,}', ' ', text).strip()

# Supported image formats
image_extensions = ['*.png', '*.jpg', '*.jpeg', '*.tiff', '*.bmp']

# Gather all image file paths
image_paths = ['/content/Screenshot 2024-09-16 at 11.59.44.png']


print(f"Found {len(image_paths)} images.")




Function that determines the file type and extracts the text from the file

In [None]:
import os

# Initialize lists to store results
extracted_text = []
detected_file_types = []
file_names = []

# Define desired languages for OCR
desired_languages = ['eng', 'heb', 'spa']  # Add more as needed

def extract_text_from_file(file_path, use_pdfminer=False):
    """Extract text from a file, choosing the reader by (lower-cased) extension.

    Args:
        file_path: Path of the file to process.
        use_pdfminer: Currently has no effect; PDFs are read with pdfminer
            whichever value is passed.

    Returns:
        tuple: ``(text, kind)``. ``text`` is whatever the selected reader returns
        for .txt/.pdf/.pptx/.docx, or the OCR string for images. ``kind`` is one
        of "txt", "pdf", "pptx", "docx", "image", "preprocessing_failed" or
        "unsupported"; the last two come with an empty string as ``text``.
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension == '.txt':
        return read_txt_file(file_path), "txt"

    if file_extension == '.pdf':
        # If you have another PDF reader, integrate it here for use_pdfminer=False
        return read_pdf_with_pdfminer(file_path), "pdf"

    if file_extension == '.pptx':
        return read_pptx_file(file_path), "pptx"

    if file_extension == '.docx':
        return read_docx_file(file_path), "docx"

    if file_extension in ('.jpg', '.jpeg', '.png', '.bmp', '.tiff'):
        # Process image with OCR, using the notebook-level desired_languages
        preprocessed_img = preprocess_image(file_path)
        if preprocessed_img is None:
            return "", "preprocessing_failed"
        ocr_text = extract_text_multiple_languages(preprocessed_img, languages=desired_languages)
        return ocr_text, "image"

    print(f"Unsupported file type: {file_extension}")
    return "", "unsupported"


Main part: put the file paths into the `uploaded` list and extract the text from each file.

In [None]:
# Your list of files to process
uploaded = [
    '/content/Screenshot 2024-09-16 at 11.59.44.png',
    '/content/תשובות.docx',
    '/content/Full Stack Project.pdf',
]

# Process all uploaded files; every file adds exactly one entry to each of the
# extracted_text / detected_file_types / file_names result lists.
for idx, filename in enumerate(uploaded):
    print(f"Processing File {idx + 1}/{len(uploaded)}: {filename}")
    try:
        text, file_type = extract_text_from_file(filename, use_pdfminer=True)
        if not text:
            print("No text extracted.")
            extracted_text.append("")
            detected_file_types.append("no_text_extracted")
            file_names.append(os.path.basename(filename))
        else:
            # Document readers return a list of lines; OCR returns a single string
            if isinstance(text, list):
                cleaned_text = "\n".join(further_clean_text(line) for line in text)
            else:
                cleaned_text = further_clean_text(text)
            extracted_text.append(cleaned_text)
            detected_file_types.append(file_type)
            file_names.append(os.path.basename(filename))

            # Print the full extracted text
            print("Extracted Text Preview:")
            print(cleaned_text)
    except Exception as e:
        # Record the failure so the three result lists stay aligned
        print(f"Error processing {filename}: {e}")
        extracted_text.append("")
        detected_file_types.append("error")
        file_names.append(os.path.basename(filename))
    print("-" * 50)
