In [1]:
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import torchaudio
import pyaudio
import wave
import ollama


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    device_info = p.get_device_info_by_index(i)
    print(device_info)
    

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Microphone (Parsec Virtual Audi', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'Microphone (NVIDIA Broadcast)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'Microphone (Steam Streaming Mic', 'hostApi': 0, 'maxInputChannel

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "distil-whisper/distil-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)


FORMAT = pyaudio.paInt16        # 16-bit resolution
CHANNELS = 1                    # Mono audio
RATE = 44100                    # 44.1kHz sampling rate
CHUNK = 1024                    # Number of frames per buffer
TEMP_FILENAME = "temp_recording.wav"

In [4]:
def ollamaFunction(content):
  response = ollama.chat(model='llama3.1', messages=[
    {
      'role': 'user',
      'content': content + "Here",
    },
  ])
  print(response['message']['content'])

In [8]:
def chunk_waveform(waveform, chunk_size):
    num_chunks = waveform.size(1) // chunk_size
    return torch.chunk(waveform, num_chunks + 1, dim=1)

def preprocess_audio(waveform, processor, sample_rate=16000):
    input_features = processor(
        waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt"
    ).input_features
    return input_features

def generate_transcription(model, processor, input_features, torch_dtype=torch.float32):
    # Convert input features to the appropriate dtype if necessary
    input_features = input_features.to(torch_dtype)
    
    # Ensure model is also in the appropriate dtype
    model = model.to(torch_dtype)
    
    # Generate transcription
    predicted_ids = model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

def audio_process(wav_file_path, model, processor, torch_dtype=torch.float32, device="cpu"):
    waveform, sample_rate = torchaudio.load(wav_file_path)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    chunk_size = 16000 * 30  # 30 seconds at 16000 Hz
    waveform_chunks = chunk_waveform(waveform, chunk_size)

    transcription = []

    for chunk in waveform_chunks:
        input_features = preprocess_audio(chunk, processor, sample_rate=16000)

        # Convert input features to the appropriate device and dtype
        input_features = input_features.to(device).to(torch_dtype)

        # Ensure model is in the appropriate device and dtype
        model = model.to(device).to(torch_dtype)

        chunk_transcription = generate_transcription(model, processor, input_features, torch_dtype)

        transcription.append(chunk_transcription)

    return " ".join(transcription)

In [12]:
import pyaudio
import wave

# Constants
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
CHUNK = 1024
OUTPUT_FILENAME = "output.wav"

def record_audio():
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)

    print("Recording started. Press Ctrl+C to stop.")
    frames = []

    try:
        while True:
            data = stream.read(CHUNK)
            frames.append(data)
    except KeyboardInterrupt:
        print("\nRecording stopped by user.")
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()

        with wave.open(OUTPUT_FILENAME, 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))

        return(audio_process(OUTPUT_FILENAME))
       




In [13]:
def openNotes(fileName):
  with open(fileName, 'r') as file:
    lines = file.readlines()

  processed_lines = []
  for line in lines:
      stripped_line = line.strip()  # Remove leading/trailing whitespace
      if stripped_line:
          # If the line is not empty, check if it's the start of a new section
          if stripped_line.endswith(":"):  # Assuming sections might end with a colon
              processed_lines.append("\n\n" + stripped_line)
          else:
              processed_lines.append(stripped_line)
      else:
          # Insert a double newline for paragraph breaks or section ends
          processed_lines.append("\n\n")

  # Join lines, ensuring that excess newlines are handled
  notes = ' '.join(processed_lines).replace('\n\n ', '\n\n').replace('  ', ' ')

  # Optional: Further cleaning up and normalizing spacing
  return (notes.strip().replace('\n\n', '[SECTION]').replace('\n', ' ').replace('[SECTION]', '\n\n'))

In [16]:
record_audio()

Recording started. Press Ctrl+C to stop.

Recording stopped by user.


TypeError: audio_process() missing 2 required positional arguments: 'model' and 'processor'

In [14]:
notes = openNotes("notes.txt")
content = (audio_process("output.wav", processor = processor, model = model, device = device))    
#print(audio_process("output.wav", processor = processor, model = model, device = device))
#content = 
ollamaFunction("Instructions: 'Create realistic notes from the following conversation. The notes should summarize the key points discussed, include any decisions made, and highlight action items or follow-ups. The notes should be concise but comprehensive, capturing the essence of the dialogue. Please keep this context in mind as you process each 30-second segment of the conversation.' here is the voice notes: '" + content + "' Here are my notes from the meeting: '" + notes + "' Can you combine the notes with the voicechat but keep the notes more relevant and more of a prescence, and also make it into bullet points if it makes scence.")

Here are the combined notes in bullet point form:

**Key Discussion Points:**

* **GUI Upgrade**: The client wants to upgrade the website's GUI (Graphical User Interface) to something nicer.
* **Shop Integration**: They'd like to integrate a shop into the website, with an understanding that extra work will be required for this feature.

**Decisions Made:**

* **Pricing**: A budget of $4,000 was agreed upon for the upgrades and shop integration.
* **Theme Customization**: The client expressed love for the current blue theme, but wants to add some green accents. This will require discussion with designers.

**Action Items/ Follow-ups:**

* **Design Meeting**: Schedule a meeting with designers to discuss adding green accents to the website's design.
* **Pricing Discussion**: Further discussion on pricing is needed to ensure that the client understands the value of the work being done.
* **Meeting Scheduled**: A meeting has been scheduled for tomorrow at 2 o'clock to further discuss the pr