In [7]:
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
from tqdm import tqdm

torch.set_float32_matmul_precision("high")

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
).to(device)

# Enable static cache and compile the forward pass
model.generation_config.cache_implementation = "static"
model.generation_config.max_new_tokens = 256
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

dataset = "harvard.wav"
sample = dataset[0]["audio"]

# 2 warmup steps
for _ in tqdm(range(2), desc="Warm-up step"):
    with sdpa_kernel(SDPBackend.MATH):
        result = pipe(sample.copy(), generate_kwargs={"min_new_tokens": 256, "max_new_tokens": 256})

# fast run
with sdpa_kernel(SDPBackend.MATH):
    result = pipe(sample.copy())

print(result["text"])


Device set to use cuda:0


TypeError: string indices must be integers, not 'str'

In [6]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1




In [3]:
import streamlit as st

st.title("Interactive Print Output Analyzer")

uploaded_file = st.file_uploader("Upload an HTML, Excel, or PDF file", type=['html', 'xls', 'xlsx', 'pdf'])

if uploaded_file is not None:
    if uploaded_file.name.endswith(('html', 'xls', 'xlsx')):
        output_pdf = 'converted.pdf'
        convert_to_pdf(uploaded_file, output_pdf)
        pdf_file = output_pdf
    else:
        pdf_file = uploaded_file

    text, images = extract_text_and_images(pdf_file)
    font_sizes = analyze_font_sizes(pdf_file)

    st.header("Extracted Text")
    st.write(text)

    st.header("Extracted Images")
    for img in images:
        st.image(img)

    st.header("Font Sizes")
    st.write(font_sizes)


2025-02-21 04:41:06.944 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]


In [1]:
!pip install pytesseract

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [2]:
!sudo apt install tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 21 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 2s (2,824 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debc

In [3]:
import cv2
import pytesseract
import numpy as np
from PIL import Image

# Path to the image file
image_path = '/content/demo AI image.png'

# Load the image using OpenCV
image = cv2.imread(image_path)

# Convert the image to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply thresholding to preprocess the image
_, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

# Use morphological operations to remove noise and improve text regions
kernel = np.ones((1, 1), np.uint8)
morph = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

# Invert the image back
processed_image = cv2.bitwise_not(morph)

# Save the processed image temporarily
temp_image_path = 'temp_processed_image.png'
cv2.imwrite(temp_image_path, processed_image)

# Perform OCR using Tesseract
# If you're using Windows, specify the path to the tesseract executable:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
extracted_text = pytesseract.image_to_string(Image.open(temp_image_path))

# Display the extracted text
print("Extracted Text:")
print(extracted_text)


Extracted Text:
LOY LO yuUaye

KRA~-,|] ~lA



In [4]:
pip install easyocr

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.11.1.3-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->easyocr)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (

In [5]:
from easyocr import Reader

# Initialize the EasyOCR reader with the desired language(s)
reader = Reader(['en'])  # 'en' stands for English

# Path to the image file
image_path = '/content/demo AI image.png'

# Perform OCR on the image
results = reader.readtext(image_path)

# Extract and print text
extracted_text = ' '.join([text for _, text, _ in results])
print("Extracted Text:")
print(extracted_text)




Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [6]:
import cv2
import numpy as np
from PIL import Image
import pytesseract

# Load the image
# image_path = 'path_to_your_image.jpg'  # Replace with the correct path to your image file
image_path = '/content/demo AI image.png' # Corrected path to the image
image = cv2.imread(image_path)

# Check if image loaded successfully
if image is None:
    print(f"Error: Could not load image from {image_path}. Please check the file path and ensure the image exists.")
    exit()  # Exit the script if image loading fails

# Convert to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply adaptive thresholding
adaptive_thresh = cv2.adaptiveThreshold(
    gray, 255,
    cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    cv2.THRESH_BINARY,
    11, 2
)

# Save the processed image temporarily
temp_image_path = 'temp_processed_image.png'
cv2.imwrite(temp_image_path, adaptive_thresh)

# Perform OCR using Tesseract
extracted_text = pytesseract.image_to_string(Image.open(temp_image_path))

# Display the extracted text
print("Extracted Text:")
print(extracted_text)

Extracted Text:
RCW GH yudaye

ALAn-A-laA



In [7]:
# Apply median blurring to reduce noise
denoised = cv2.medianBlur(gray, 3)

# Continue with thresholding and OCR as before


In [8]:
custom_config = r'--oem 3 --psm 6'
extracted_text = pytesseract.image_to_string(Image.open(temp_image_path), config=custom_config)


In [9]:
from easyocr import Reader

# Initialize the EasyOCR reader
reader = Reader(['en'])  # Specify the language code

# Perform OCR on the image
results = reader.readtext('/content/demo AI image.png')

# Extract and print text
extracted_text = ' '.join([text for _, text, _ in results])
print("Extracted Text:")
print(extracted_text)


Extracted Text:
Laiyc Languayc AAAelc


In [10]:
import nltk

# Download the 'punkt_tab' data package if not already downloaded
try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(extracted_text)
for sentence in sentences:
    print(sentence)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Laiyc Languayc AAAelc


In [11]:
pip install transformers torch



In [1]:
extracted_text = "your_extracted_text_here"
prompt = f"Organize the following text into coherent sentences:\n\n{extracted_text}"


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the GPT-J tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")


In [None]:
import torch

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt")

# Generate text with the model
with torch.no_grad():
    outputs = model.generate(**inputs, max_length=500)

# Decode the generated text
organized_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Organized Text:")
print(organized_text)


In [13]:
import speech_recognition as sr
import pyttsx3
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import PyAudio

# Initialize text-to-speech engine
engine = pyttsx3.init()
engine.setProperty('rate', 150)  # Speed of speech

# Load a pre-trained conversational model (e.g., DialoGPT from Microsoft)
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Initialize speech recognizer
recognizer = sr.Recognizer()

def listen():
    with sr.Microphone() as source:
        print("Listening...")
        audio = recognizer.listen(source)
        try:
            text = recognizer.recognize_google(audio)  # Use Google's free API
            print(f"You: {text}")
            return text
        except:
            return ""

def respond(text):
    # Generate a response using DialoGPT
    chat_history_ids = None
    inputs = tokenizer.encode(text + tokenizer.eos_token, return_tensors="pt")
    outputs = model.generate(
        inputs,
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature=0.8
    )
    response = tokenizer.decode(outputs[:, inputs.shape[-1]:][0], skip_special_tokens=True)
    print(f"AI: {response}")
    engine.say(response)
    engine.runAndWait()

if __name__ == "__main__":
    while True:
        user_input = listen()
        if user_input.lower() in ["exit", "quit", "stop"]:
            engine.say("Goodbye!")
            engine.runAndWait()
            break
        if user_input:
            respond(user_input)

ModuleNotFoundError: No module named 'PyAudio'

In [2]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.1


In [4]:
!pip install pyttsx3

Collecting pyttsx3
  Downloading pyttsx3-2.98-py3-none-any.whl.metadata (3.8 kB)
Downloading pyttsx3-2.98-py3-none-any.whl (34 kB)
Installing collected packages: pyttsx3
Successfully installed pyttsx3-2.98


In [6]:
!sudo apt install espeak espeak-ng  # For Debian/Ubuntu systems

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  espeak-data espeak-ng-data libespeak-ng1 libespeak1 libpcaudio0
  libportaudio2 libsonic0
The following NEW packages will be installed:
  espeak espeak-data espeak-ng espeak-ng-data libespeak-ng1 libespeak1
  libpcaudio0 libportaudio2 libsonic0
0 upgraded, 9 newly installed, 0 to remove and 21 not upgraded.
Need to get 5,897 kB of archives.
After this operation, 15.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudio2 amd64 19.6.0-1.1 [65.3 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 libsonic0 amd64 0.2.0-11build1 [10.3 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 espeak-data amd64 1.48.15+dfsg-3 [1,085 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libespeak1 amd64 1.48.15+dfsg-3 [156 kB]
Get:5 http://archive.ubuntu.com/u

In [11]:
!pip install PyAudio



In [9]:
!sudo apt-get install portaudio19-dev python3-pyaudio

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libportaudiocpp0
Suggested packages:
  portaudio19-doc python-pyaudio-doc
The following NEW packages will be installed:
  libportaudiocpp0 portaudio19-dev python3-pyaudio
0 upgraded, 3 newly installed, 0 to remove and 21 not upgraded.
Need to get 148 kB of archives.
After this operation, 820 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libportaudiocpp0 amd64 19.6.0-1.1 [16.1 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 portaudio19-dev amd64 19.6.0-1.1 [106 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 python3-pyaudio amd64 0.2.11-1.3ubuntu1 [25.9 kB]
Fetched 148 kB in 1s (157 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5

In [14]:
!sudo apt-get install portaudio19-dev python3-pyaudio  # Install the necessary dependencies
!pip install PyAudio  # Install PyAudio

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
portaudio19-dev is already the newest version (19.6.0-1.1).
python3-pyaudio is already the newest version (0.2.11-1.3ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 21 not upgraded.


In [None]:
# pip install transformers peft librosa

import transformers
import numpy as np
import librosa

pipe = transformers.pipeline(model='fixie-ai/ultravox-v0_3', trust_remote_code=True)

path = "<path-to-input-audio>"  # TODO: pass the audio here
audio, sr = librosa.load(path, sr=16000)


turns = [
  {
    "role": "system",
    "content": "You are a friendly and helpful character. You love to answer questions for people."
  },
]
pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=30)


config.json:   0%|          | 0.00/3.85k [00:00<?, ?B/s]

ultravox_config.py:   0%|          | 0.00/5.74k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- ultravox_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

ultravox_pipeline.py:   0%|          | 0.00/4.62k [00:00<?, ?B/s]

ultravox_model.py:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

whisper_model_modified.py:   0%|          | 0.00/5.81k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- whisper_model_modified.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- ultravox_model.py
- whisper_model_modified.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


ultravox_processing.py:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- ultravox_processing.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- ultravox_pipeline.py
- ultravox_model.py
- ultravox_processing.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/28.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.1


In [8]:
!pip install pypiwin32

Collecting pypiwin32
  Downloading pypiwin32-223-py3-none-any.whl.metadata (236 bytes)
INFO: pip is looking at multiple versions of pypiwin32 to determine which version is compatible with other requirements. This could take a while.
  Downloading pypiwin32-219.zip (4.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m3.6/4.8 MB[0m [31m104.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.8/4.8 MB[0m [31m107.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m
[?25h  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mno

In [19]:
import speech_recognition as sr
import json
from win32 import win32print
import pyttsx3

class VoicePrinterController:
    def __init__(self):
        # Initialize speech recognizer
        self.recognizer = sr.Recognizer()
        # Initialize text-to-speech engine
        self.speaker = pyttsx3.init()
        # Default printer settings
        self.settings = {
            "orientation": "portrait",
            "copies": 1,
            "quality": "normal",
            "color": True
        }

    def listen_for_command(self):
        """Listen for voice input and convert to text"""
        with sr.Microphone() as source:
            print("Listening for printer commands...")
            self.speaker.say("Listening for printer commands")
            self.speaker.runAndWait()

            try:
                audio = self.recognizer.listen(source, timeout=5)
                command = self.recognizer.recognize_google(audio)
                return command.lower()
            except sr.UnknownValueError:
                return "Could not understand audio"
            except sr.RequestError:
                return "Could not request results"

    def process_command(self, command):
        """Process voice command and update printer settings"""
        if "orientation" in command:
            if "landscape" in command:
                self.settings["orientation"] = "landscape"
            elif "portrait" in command:
                self.settings["orientation"] = "portrait"

        elif "copies" in command:
            try:
                number = [int(s) for s in command.split() if s.isdigit()][0]
                self.settings["copies"] = number
            except:
                self.speaker.say("Please specify number of copies")
                self.speaker.runAndWait()

        elif "quality" in command:
            if "draft" in command:
                self.settings["quality"] = "draft"
            elif "normal" in command:
                self.settings["quality"] = "normal"
            elif "high" in command:
                self.settings["quality"] = "high"

        elif "color" in command:
            if "on" in command or "enable" in command:
                self.settings["color"] = True
            elif "off" in command or "disable" in command:
                self.settings["color"] = False

        self.confirm_settings()

    def confirm_settings(self):
        """Confirm current settings via voice"""
        settings_text = f"Current settings: {json.dumps(self.settings, indent=2)}"
        print(settings_text)
        self.speaker.say(f"Settings updated. {settings_text}")
        self.speaker.runAndWait()

    def apply_settings(self):
        """Apply settings to default printer"""
        printer_name = win32print.GetDefaultPrinter()

        # Get printer handle
        printer_handle = win32print.OpenPrinter(printer_name)
        try:
            # Get printer properties
            properties = win32print.GetPrinter(printer_handle, 2)

            # Update settings based on voice commands
            devmode = properties["pDevMode"]
            if self.settings["orientation"] == "landscape":
                devmode.Orientation = 2
            else:
                devmode.Orientation = 1

            devmode.Copies = self.settings["copies"]

            # Apply quality settings
            quality_map = {"draft": 1, "normal": 2, "high": 3}
            devmode.Quality = quality_map[self.settings["quality"]]

            # Apply color settings
            devmode.Color = 1 if self.settings["color"] else 2

            # Save settings
            win32print.SetPrinter(printer_handle, 2, properties, 0)

        finally:
            win32print.ClosePrinter(printer_handle)

    def run(self):
        """Main loop to listen for and process commands"""
        while True:
            command = self.listen_for_command()

            if "exit" in command or "quit" in command:
                self.speaker.say("Exiting voice control")
                self.speaker.runAndWait()
                break

            if "apply" in command or "save" in command:
                self.apply_settings()
                continue

            self.process_command(command)

if __name__ == "__main__":
    controller = VoicePrinterController()
controller.run()

ModuleNotFoundError: No module named 'win32'

In [30]:
import speech_recognition as sr
import json
from win32 import win32print
import pyttsx3

class VoicePrinterController:
    def __init__(self):
        # Initialize speech recognizer
        self.recognizer = sr.Recognizer()
        # Initialize text-to-speech engine
        self.speaker = pyttsx3.init()
        # Default printer settings
        self.settings = {
            "orientation": "portrait",
            "copies": 1,
            "quality": "normal",
            "color": True
        }

    def listen_for_command(self):
        """Listen for voice input and convert to text"""
        with sr.Microphone() as source:
            print("Listening for printer commands...")
            self.speaker.say("Listening for printer commands")
            self.speaker.runAndWait()

            try:
                audio = self.recognizer.listen(source, timeout=5)
                command = self.recognizer.recognize_google(audio)
                return command.lower()
            except sr.UnknownValueError:
                return "Could not understand audio"
            except sr.RequestError:
                return "Could not request results"

    def process_command(self, command):
        """Process voice command and update printer settings"""
        if "orientation" in command:
            if "landscape" in command:
                self.settings["orientation"] = "landscape"
            elif "portrait" in command:
                self.settings["orientation"] = "portrait"

        elif "copies" in command:
            try:
                number = [int(s) for s in command.split() if s.isdigit()][0]
                self.settings["copies"] = number
            except:
                self.speaker.say("Please specify number of copies")
                self.speaker.runAndWait()

        elif "quality" in command:
            if "draft" in command:
                self.settings["quality"] = "draft"
            elif "normal" in command:
                self.settings["quality"] = "normal"
            elif "high" in command:
                self.settings["quality"] = "high"

        elif "color" in command:
            if "on" in command or "enable" in command:
                self.settings["color"] = True
            elif "off" in command or "disable" in command:
                self.settings["color"] = False

        self.confirm_settings()

    def confirm_settings(self):
        """Confirm current settings via voice"""
        settings_text = f"Current settings: {json.dumps(self.settings, indent=2)}"
        print(settings_text)
        self.speaker.say(f"Settings updated. {settings_text}")
        self.speaker.runAndWait()

    def apply_settings(self):
        """Apply settings to default printer"""
        printer_name = win32print.GetDefaultPrinter()

        # Get printer handle
        printer_handle = win32print.OpenPrinter(printer_name)
        try:
            # Get printer properties
            properties = win32print.GetPrinter(printer_handle, 2)

            # Update settings based on voice commands
            devmode = properties["pDevMode"]
            if self.settings["orientation"] == "landscape":
                devmode.Orientation = 2
            else:
                devmode.Orientation = 1

            devmode.Copies = self.settings["copies"]

            # Apply quality settings
            quality_map = {"draft": 1, "normal": 2, "high": 3}
            devmode.Quality = quality_map[self.settings["quality"]]

            # Apply color settings
            devmode.Color = 1 if self.settings["color"] else 2

            # Save settings
            win32print.SetPrinter(printer_handle, 2, properties, 0)

        finally:
            win32print.ClosePrinter(printer_handle)

    def run(self):
        """Main loop to listen for and process commands"""
        while True:
            command = self.listen_for_command()

            if "exit" in command or "quit" in command:
                self.speaker.say("Exiting voice control")
                self.speaker.runAndWait()
                break

            if "apply" in command or "save" in command:
                self.apply_settings()
                continue

            self.process_command(command)

if __name__ == "__main__":
    controller = VoicePrinterController()
    controller.run()

ModuleNotFoundError: No module named 'win32'

In [12]:
!pip install pypiwin32

Collecting pypiwin32
  Using cached pypiwin32-223-py3-none-any.whl.metadata (236 bytes)
INFO: pip is looking at multiple versions of pypiwin32 to determine which version is compatible with other requirements. This could take a while.
  Using cached pypiwin32-219.zip (4.8 MB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [31]:
from PyQt5.QtPrintSupport import QPrinter, QPrinterInfo

def apply_settings(self):
    """Apply settings to default printer"""
    try:
        printer = QPrinter(QPrinter.HighResolution)
        printer.setPrinterName(QPrinterInfo.defaultPrinter().printerName())
        printer.setFullPage(True)

        if self.settings["orientation"] == "landscape":
            printer.setOrientation(QPrinter.Landscape)
        else:
            printer.setOrientation(QPrinter.Portrait)

        printer.setNumCopies(self.settings["copies"])

        # Further options would require diving into Qt printing configuration
        # See the QtPrintSupport documentation for details.

    except Exception as e:
        print(f"Error setting printer properties: {e}")

ModuleNotFoundError: No module named 'PyQt5'

In [26]:
!pip install speech_recognition pyttsx3 pywin32

[31mERROR: Could not find a version that satisfies the requirement speech_recognition (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for speech_recognition[0m[31m
[0m

In [32]:
!pip install PyQt5

Collecting PyQt5
  Downloading PyQt5-5.15.11-cp38-abi3-manylinux_2_17_x86_64.whl.metadata (2.1 kB)
Collecting PyQt5-sip<13,>=12.15 (from PyQt5)
  Downloading PyQt5_sip-12.17.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.whl.metadata (472 bytes)
Collecting PyQt5-Qt5<5.16.0,>=5.15.2 (from PyQt5)
  Downloading PyQt5_Qt5-5.15.16-py3-none-manylinux2014_x86_64.whl.metadata (536 bytes)
Downloading PyQt5-5.15.11-cp38-abi3-manylinux_2_17_x86_64.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyQt5_Qt5-5.15.16-py3-none-manylinux2014_x86_64.whl (59.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.9/59.9 MB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyQt5_sip-12.17.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.whl (276 kB)
Installing collected packages: PyQt5-Qt5, PyQt5-sip, PyQt5
Successfully installed PyQt5-5.15.11 PyQt5-Qt5-5.15.16 PyQt5-sip-

In [27]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m57.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.0.1


In [33]:
import speech_recognition as sr
import json
import pyttsx3
from PyQt5.QtPrintSupport import QPrinter, QPrinterInfo

class VoicePrinterController:
    def __init__(self):
        # Initialize speech recognizer
        self.recognizer = sr.Recognizer()
        # Initialize text-to-speech engine
        self.speaker = pyttsx3.init()
        # Default printer settings
        self.settings = {
            "orientation": "portrait",
            "copies": 1,
            "quality": "normal",  # Quality options will depend on Qt library capabilities
            "color": True
        }

    def listen_for_command(self):
        # ... (same as previous versions)

    def process_command(self, command):
        # ... (same as previous versions)

    def confirm_settings(self):
        # ... (same as previous versions)

    def apply_settings(self):
        """Apply settings to default printer"""
        try:
            printer = QPrinter(QPrinter.HighResolution)
            printer.setPrinterName(QPrinterInfo.defaultPrinter().printerName())
            printer.setFullPage(True)

            if self.settings["orientation"] == "landscape":
                printer.setOrientation(QPrinter.Landscape)
            else:
                printer.setOrientation(QPrinter.Portrait)

            printer.setNumCopies(self.settings["copies"])

            # Further options would require diving into Qt printing configuration
            # See the QtPrintSupport documentation for details.

        except Exception as e:
            print(f"Error setting printer properties: {e}")

    def run(self):
        # ... (same as previous versions)

if __name__ == "__main__":
    controller = VoicePrinterController()
    controller.run()

IndentationError: expected an indented block after function definition on line 20 (<ipython-input-33-bc6bd8c5146f>, line 23)

In [16]:
!pip install pywin32

[31mERROR: Could not find a version that satisfies the requirement pywin32 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pywin32[0m[31m
[0m

In [28]:
!pip install SpeechRecognition==3.10.0

Collecting SpeechRecognition==3.10.0
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl.metadata (28 kB)
Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
  Attempting uninstall: SpeechRecognition
    Found existing installation: SpeechRecognition 3.14.1
    Uninstalling SpeechRecognition-3.14.1:
      Successfully uninstalled SpeechRecognition-3.14.1
Successfully installed SpeechRecognition-3.10.0


In [17]:
!sudo apt install build-essential python3-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
python3-dev is already the newest version (3.10.6-1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 21 not upgraded.


In [29]:
!pip install --upgrade pip
!pip install SpeechRecognition==3.10.0



In [18]:
!pip cache purge

Files removed: 2


In [24]:
from gtts import gTTS
from playsound import playsound

def speak(text):
    tts = gTTS(text=text, lang='en')
    tts.save("output.mp3")
    playsound("output.mp3")

speak("Hello, this is a text-to-speech example using gtts.")



CalledProcessError: Command '['/usr/bin/python3', '/usr/local/lib/python3.11/dist-packages/playsound.py', 'output.mp3']' returned non-zero exit status 1.

In [23]:
!pip install playsound

Collecting playsound
  Downloading playsound-1.3.0.tar.gz (7.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: playsound
  Building wheel for playsound (setup.py) ... [?25l[?25hdone
  Created wheel for playsound: filename=playsound-1.3.0-py3-none-any.whl size=7020 sha256=5f8a7edcac30d24ac1cd68263932c0a3f5fe54c236310f33ff0d749f21d88563
  Stored in directory: /root/.cache/pip/wheels/50/98/42/62753a9e1fb97579a0ce2f84f7db4c21c09d03bb2091e6cef4
Successfully built playsound
Installing collected packages: playsound
Successfully installed playsound-1.3.0


In [1]:
import os
import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_path
import docx2pdf
from PIL import Image
import logging
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
from tkinter import *
from tkinter import messagebox
import hashlib
import json
import win32print
import win32api
from datetime import datetime

class LogoDetector:
    def __init__(self, logo_directory):
        self.logo_directory = logo_directory
        self.logo_features = self.load_logo_features()

    def load_logo_features(self):
        """Load and process all logo templates"""
        features = {}
        for logo_file in os.listdir(self.logo_directory):
            if logo_file.endswith(('.png', '.jpg', '.jpeg')):
                logo_path = os.path.join(self.logo_directory, logo_file)
                logo = cv2.imread(logo_path, cv2.IMREAD_COLOR)
                if logo is not None:
                    # Use SIFT for feature detection
                    sift = cv2.SIFT_create()
                    keypoints, descriptors = sift.detectAndCompute(logo, None)
                    features[logo_file] = {
                        'keypoints': keypoints,
                        'descriptors': descriptors
                    }
        return features

    def detect_logo(self, image):
        """Detect if any known logo is present in the image"""
        sift = cv2.SIFT_create()
        img_keypoints, img_descriptors = sift.detectAndCompute(image, None)

        if img_descriptors is None:
            return False

        for logo_name, logo_data in self.logo_features.items():
            # FLANN based matcher
            FLANN_INDEX_KDTREE = 1
            index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
            search_params = dict(checks=50)
            flann = cv2.FlannBasedMatcher(index_params, search_params)

            matches = flann.knnMatch(logo_data['descriptors'], img_descriptors, k=2)

            # Apply ratio test
            good_matches = []
            for m, n in matches:
                if m.distance < 0.7 * n.distance:
                    good_matches.append(m)

            if len(good_matches) > 10:  # Threshold for positive detection
                return True

        return False

class SecurePrintMonitor(FileSystemEventHandler):
    def __init__(self, watch_directory, credentials_file, logo_directory):
        self.watch_directory = watch_directory
        self.credentials_file = credentials_file
        self.logo_detector = LogoDetector(logo_directory)
        self.sensitive_extensions = ['.pdf', '.docx', '.xlsx', '.txt', '.jpg', '.png']
        self.setup_logging()
        self.load_credentials()

    def setup_logging(self):
        logging.basicConfig(
            filename='print_monitor.log',
            level=logging.INFO,
            format='%(asctime)s - %(message)s'
        )

    def load_credentials(self):
        try:
            with open(self.credentials_file, 'r') as f:
                self.credentials = json.load(f)
        except FileNotFoundError:
            self.credentials = {
                'admin': {
                    'password': self.hash_password('admin123'),
                    'role': 'admin'
                }
            }
            self.save_credentials()

    def convert_to_image(self, filepath):
        """Convert document to image for logo detection"""
        if filepath.lower().endswith('.pdf'):
            return convert_from_path(filepath)[0]
        elif filepath.lower().endswith('.docx'):
            # Convert DOCX to PDF first
            pdf_path = filepath.rsplit('.', 1)[0] + '.pdf'
            docx2pdf.convert(filepath, pdf_path)
            return convert_from_path(pdf_path)[0]
        elif filepath.lower().endswith(('.jpg', '.jpeg', '.png')):
            return Image.open(filepath)
        return None

    def check_for_logo(self, filepath):
        """Check if document contains any known logos"""
        try:
            # Convert document to image
            image = self.convert_to_image(filepath)
            if image is None:
                return False

            # Convert PIL Image to OpenCV format
            opencv_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

            # Detect logo
            return self.logo_detector.detect_logo(opencv_image)

        except Exception as e:
            logging.error(f"Error checking for logo: {str(e)}")
            return False

    def on_created(self, event):
        if not event.is_directory and self.is_sensitive_file(event.src_path):
            # First check for logo
            if self.check_for_logo(event.src_path):
                logging.info(f"Logo detected in file: {event.src_path}")
                if self.authenticate_print(event.src_path):
                    self.handle_print(event.src_path)
                else:
                    logging.warning(f"Unauthorized print attempt blocked: {event.src_path}")
            else:
                # No logo detected, proceed with normal printing
                self.handle_print(event.src_path)

    # ... (rest of the SecurePrintMonitor class remains the same as previous version)

def setup_monitor(watch_directory=".", logo_directory="logos"):
    """Setup and start the secure print monitor"""
    # Create logo directory if it doesn't exist
    if not os.path.exists(logo_directory):
        os.makedirs(logo_directory)
        print(f"Created logo directory: {logo_directory}")
        print("Please add your logo files (.png, .jpg) to this directory")

    monitor = SecurePrintMonitor(watch_directory, "credentials.json", logo_directory)
    observer = Observer()
    observer.schedule(monitor, watch_directory, recursive=False)
    observer.start()

    print("Secure Print Monitor Started")
    print("Watching for sensitive documents and logos...")
    print("Press Ctrl+C to stop")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
    observer.join()

if __name__ == "__main__":
    setup_monitor()


ModuleNotFoundError: No module named 'pytesseract'

In [None]:
# pip install transformers peft librosa

import transformers
import numpy as np
import librosa

pipe = transformers.pipeline(model='fixie-ai/ultravox-v0_3', trust_remote_code=True)

path = "/content/harvard.wav"  # TODO: pass the audio here
audio, sr = librosa.load(path, sr=16000)


turns = [
  {
    "role": "system",
    "content": "You are a friendly and helpful character. You love to answer questions for people."
  },
]
pipe({'audio': audio, 'turns': turns, 'sampling_rate': sr}, max_new_tokens=30)


config.json:   0%|          | 0.00/3.85k [00:00<?, ?B/s]

ultravox_config.py:   0%|          | 0.00/5.74k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- ultravox_config.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

ultravox_pipeline.py:   0%|          | 0.00/4.62k [00:00<?, ?B/s]

ultravox_model.py:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

whisper_model_modified.py:   0%|          | 0.00/5.81k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- whisper_model_modified.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- ultravox_model.py
- whisper_model_modified.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


ultravox_processing.py:   0%|          | 0.00/9.19k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- ultravox_processing.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/fixie-ai/ultravox-v0_3:
- ultravox_pipeline.py
- ultravox_model.py
- ultravox_processing.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/28.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.29G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [1]:
pip install transformers peft librosa

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting librosa
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.0-cp312-cp312-win_amd64.whl.metadata (2.8 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Collecting lazy-loader>=0.1 (from librosa)
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpack-1.1.0-cp312-cp312-win_amd64.whl.metadata (8.6 kB)
Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.51.0->librosa)
  Downloading llvmlite-0.44.0-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Do