In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys
import os
sys.path.insert(1, os.path.abspath('../py'))
from nt_utils import TranscriptionUtilities

# Resolve the project's data and saves folders to absolute paths, then hand
# them to the shared utilities object used throughout the notebook
data_folder = os.path.abspath('../data')
saves_folder = os.path.abspath('../saves')
tu = TranscriptionUtilities(data_folder_path=data_folder, saves_folder_path=saves_folder)


----
## Here's a manual transcription of the first few paragraphs of "The Coming Thresholds"

In [3]:

# Read the hand-made transcription and keep only its second and third
# blank-line-separated chunks
file_path = os.path.join(
    tu.saves_text_folder,
    'AI_The_Coming_Thresholds_and_The_Path_We_Must_Take_Internationally_Acclaimed_Cognitive_Scientist.txt'
)
with open(file_path, 'r', encoding='utf-8') as f:
    paragraphs_list = f.read().split('\n\n')
essay_str = '\n\n'.join(paragraphs_list[1:3])
print(essay_str)

Welcome everyone! This is not a "Voices with Vervaeke", this is a new entity I'm calling a video essay. But (under good advice from the two gentlemen that are joining me) it was proposed to me, and I accept the proposal, that this should have a little bit more of a dialogical structure to it. And, given the value of dialogue as I've been explaining it in other work, I took this deeply to heart.

I am going to present, still, an essay. And let's remember what Montaigne meant by *essai*: "to try". I'm going to "try" with the help of these two gentlemen to bring some clarity to the issue around GPT machines the advent of what looks like this for "sparks" of Artificial General Intelligence.



## Let's try taking the YouTube transcription and cleaning it up with an offline LLM

In [4]:

# Get the transcription for the video
tu.ensure_module_installed('youtube-transcript-api', upgrade=True, verbose=False)
from youtube_transcript_api import YouTubeTranscriptApi

video_id = 'A-_RdKiDbz4'

# upgrade=True above can pull in a 1.x release, where the static
# get_transcript() helper was deprecated and later removed in favour of the
# instance method fetch(). Support both so this cell survives an upgrade;
# either way the result is a list of dicts with a 'text' key.
if hasattr(YouTubeTranscriptApi, 'fetch'):
    transcript_dicts_list = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
else:
    transcript_dicts_list = YouTubeTranscriptApi.get_transcript(video_id)

In [5]:

# Prepare the prompt
tu.ensure_module_installed('gpt4all', verbose=False)
import gpt4all

# The instructions for the LLM are kept in a text file in the saved-texts folder
file_path = os.path.join(tu.saves_text_folder, 'break_this_up.txt')
with open(file_path, 'r', encoding='utf-8') as f:
    prompt_str = f.read()

# Append the first 20 caption snippets, joined into one lower-cased wall of text
snippets_list = [transcript_dict['text'] for transcript_dict in transcript_dicts_list[:20]]
wall_of_text = ' '.join(snippets_list).lower()
messages = [{'role': 'user', 'content': f'{prompt_str}\n\n{wall_of_text}'}]
messages

[{'role': 'user', 'content': "Could you break up this wall of text (using capitalization, punctuation, and whitespace) into sentences and paragraphs so that it is easier to read? Do this without paraphrasing the text. Only remove filler words, discourse markers, repetitions, and other thinking noises.\n\nwelcome everyone this is not a voices with friviki this is a new entity i'm calling a video essay but under good advice from the two gentlemen that are joining me it was proposed to me and i accept the proposal that this should have a little bit more of a dialogical uh structure to it and given the value of dialogue as i've been explaining it in other work i took this deeply to heart i am going to present still an essay and let's remember what montana meant by essay saa to try i'm going to try with the help of these two gentlemen to bring some clarity to the issue around gpt machines the advent of what looks like this for sparks of artificial general intelligence"}]

In [6]:

# Get the best offline model
# Start from the default model; a saved evaluation table may override it
best_model = 'ggml-wizardLM-7B.q4_2.bin'
if tu.pickle_exists('models_df'):
    models_df = tu.load_object('models_df', verbose=False)

    # Only rank models that actually produced completion tokens
    mask_series = (models_df.usage_completion_tokens > 0)
    ranked_df = models_df[mask_series].sort_values('similarity', ascending=False)

    # With no qualifying rows the old .head(1).model_name.squeeze() yielded an
    # empty Series rather than a file name, so keep the default in that case
    if not ranked_df.empty:
        best_model = ranked_df.model_name.iloc[0]
print(f'The best offline model is {best_model}')

The best offline model is ggml-wizardLM-7B.q4_2.bin


In [7]:

# Get the model's response
tu.ensure_module_installed('humanize', verbose=False)
import time
import humanize

# The clock starts before the model is loaded, so the reported duration
# covers both loading and inference
t1 = time.time()
model_obj = gpt4all.GPT4All(best_model, model_path=tu.data_models_folder)

# n_predict is the number of tokens to predict. It defaults to 128 for this LLModel.
response_dict = model_obj.chat_completion(messages, verbose=False, n_predict=256)
first_choice_dict = response_dict['choices'][0]
print(first_choice_dict['message']['content'])

elapsed_seconds = time.time() - t1
duration_str = humanize.precisedelta(elapsed_seconds, minimum_unit='seconds', format='%0.0f')
print(f'{best_model} responded in {duration_str}')

Found model file.
Sure, I can help with that. Here's the text broken up into sentences and paragraphs:

Welcome everyone, this is not a Voices with Friviki. This is a new entity I'm calling a video essay. Under good advice from the two gentlemen that are joining me, it was proposed to me, and I accept the proposal that this should have a little bit more of a dialogical uh structure to it. Given the value of dialogue as I've been explaining it in other work, I took this deeply to heart. I am going to present still an essay, and let's remember what Montana meant by essay saa to try. I'm going to try with the help of these two gentlemen to bring some clarity to the issue around GPT machines, the advent of what looks like this for sparks of artificial general intelligence.
ggml-wizardLM-7B.q4_2.bin responded in 30 minutes and 37 seconds


In [11]:

# Compare the offline model's cleaned-up text with the manual transcription
# (tu.similar presumably returns a similarity ratio — confirm in nt_utils)
tu.similar(essay_str, response_dict['choices'][0]['message']['content'])

0.771978021978022


----
## Let's try taking the YouTube transcription and cleaning it up with an online LLM

In [13]:

tu.ensure_module_installed('langchain', verbose=False)
from langchain.llms import OpenAI

# Start the clock only after the import so the reported duration reflects the
# model call rather than module loading
t1 = time.time()

# The prompt asks for the text NOT to be paraphrased, so sample at temperature 0
# to keep the completion as faithful and repeatable as possible
llm = OpenAI(temperature=0)
openai_response_str = llm(messages[0]['content'])
print(openai_response_str)
duration_str = humanize.precisedelta(time.time() - t1, minimum_unit='seconds', format='%0.0f')
print(f'OpenAI responded in {duration_str}')



Welcome everyone! This is not Voices with Friviki, this is a new entity I'm calling a video essay. Under good advice from the two gentlemen joining me, it was proposed to me and I accepted the proposal that this should have a little bit more of a dialogical structure. Given the value of dialogue as I've been explaining it in other work, I took this deeply to heart. 

I am going to present still an essay and, let's remember what Montana meant by "essay": to try. With the help of these two gentlemen, I'm going to try to bring some clarity to the issue around GPT machines and the advent of what looks like sparks of artificial general intelligence.
OpenAI responded in 7 seconds



That's more than 250 times faster (7 seconds versus 30 minutes and 37 seconds)!

In [14]:

# Compare the OpenAI response with the manual transcription
# (tu.similar presumably returns a similarity ratio — confirm in nt_utils)
tu.similar(essay_str, openai_response_str)

0.4966641957005189


----
## Let's try downloading the audio as an mp3 and running speech-to-text on it ourselves

In [26]:

new_file = 'AI_The_Coming_Thresholds_and_The_Path_We_Must_Take_Internationally_Acclaimed_Cognitive_Scientist.mp3'
new_path = os.path.join(tu.saves_mp3_folder, new_file)

# Built outside the guard because later cells use video_url even when the
# audio file already exists and the download below is skipped
video_url = f'https://www.youtube.com/watch?v={video_id}'
if not os.path.exists(new_path):
    tu.ensure_module_installed('pytube', upgrade=True, verbose=False)
    from pytube import YouTube
    import re

    yt = YouTube(video_url)

    # Pick the last stream whose audio bitrate is 160kbps
    video = yt.streams.filter(abr='160kbps').last()
    if video is None:
        raise RuntimeError(f'No 160kbps stream is available for {video_url}')

    # Download the file, then give it the expected name (the downloaded
    # stream is renamed as-is; it is not re-encoded)
    out_path = video.download(output_path=tu.saves_mp3_folder)
    os.rename(out_path, new_path)

In [None]:

tu.ensure_module_installed('pydub', upgrade=True, verbose=False)
from pydub import AudioSegment

# Files
# The wav file keeps the mp3's base name and goes in the wav saves folder
wav_file = f'{os.path.splitext(new_file)[0]}.wav'
dst_path = os.path.join(tu.saves_wav_folder, wav_file)
dst_path

In [None]:

# Load the downloaded audio. from_file() lets ffmpeg detect the real container,
# whereas from_mp3() forces the mp3 decoder onto a stream that was only
# renamed to .mp3 after download
sound = AudioSegment.from_file(new_path)

In [None]:

# Write the loaded audio out as a wav file at dst_path
sound.export(dst_path, format='wav')

In [None]:

# Make sure the speech-recognition packages are installed, then import the library
# NOTE(review): pyaudio and pyttsx3 are installed here but never imported in
# the visible cells — confirm they are still needed
tu.ensure_module_installed('SpeechRecognition', upgrade=True, verbose=False)
tu.ensure_module_installed('pyaudio', upgrade=True, verbose=False)
tu.ensure_module_installed('pyttsx3', upgrade=True, verbose=False)
import speech_recognition as sr

# Initialize recognizer class (for recognizing the speech)
r = sr.Recognizer()

In [None]:

# Read the audio file as the source and listen for its first phrase (capped at
# 60 seconds), then send that phrase to Google for recognition.
# sr.AudioFile only reads WAV/AIFF/FLAC, so use the converted wav at dst_path
# rather than the downloaded file at new_path.
with sr.AudioFile(dst_path) as source:
    audio_text = r.listen(source, timeout=60, phrase_time_limit=60)
    text = r.recognize_google(audio_text)
print(len(text))

In [None]:

# Importing libraries
import tempfile
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Initialize recognizer class (for recognizing the speech);
# transcribe_audio below reads this module-level `r`
r = sr.Recognizer()

# Recognize speech in the audio file so that we
# don't repeat ourselves in other functions
def transcribe_audio(path):
    '''Return the Google speech-recognition text for the audio file at ``path``.

    Uses the module-level recognizer ``r``. Exceptions raised while opening
    the file or recognizing the audio propagate to the caller.
    '''
    with sr.AudioFile(path) as audio_source:

        # Read the whole file into memory, then attempt to convert it to text
        recorded_audio = r.record(audio_source)
        return r.recognize_google(recorded_audio)

# Split the audio file into chunks on
# silence and apply speech recognition
def get_large_audio_transcription_on_silence(
    path, verbose=True, min_silence_len=500, silence_margin_db=14, keep_silence=500
):
    '''Split a large audio file into chunks on silence and transcribe each chunk.

    Parameters
    ----------
    path : str
        Audio file to transcribe; opened with pydub's AudioSegment.from_file.
    verbose : bool, default True
        Print the temporary directory and each chunk's recognized text.
    min_silence_len : int, default 500
        Minimum length of silence, in milliseconds, at which to split.
    silence_margin_db : float, default 14
        Anything this many dB below the file's overall loudness (sound.dBFS)
        is treated as silence.
    keep_silence : int, default 500
        Milliseconds of silence kept around each chunk (500 = half a second).

    Returns
    -------
    list of str
        The recognized text of every chunk that could be transcribed, in
        order. Chunks that fail recognition are reported and skipped.
    '''
    whole_texts_list = []

    # Open the audio file using pydub
    sound = AudioSegment.from_file(path)

    # Split the audio wherever the silence lasts long enough and get the chunks
    chunks = split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        silence_thresh=sound.dBFS - silence_margin_db,
        keep_silence=keep_silence,
    )

    # TemporaryDirectory creates the folder itself and removes it on exit,
    # so no explicit mkdir is needed
    with tempfile.TemporaryDirectory() as tmpdirname:
        if verbose: print('created temporary directory', tmpdirname)

        # Process each chunk
        for i, audio_chunk in enumerate(chunks, start=1):

            # Export audio chunk and save it in
            # the temporary directory.
            chunk_filename = os.path.join(tmpdirname, f'chunk{i}.wav')
            audio_chunk.export(chunk_filename, format='wav')

            # Recognize the chunk; a failure skips this chunk only
            try:
                text = transcribe_audio(chunk_filename)
            except sr.UnknownValueError:

                # This exception typically has an empty message, so report
                # which chunk failed instead of printing a blank error
                print(f'Error: could not understand the speech in {chunk_filename}')
            except Exception as e:
                print(f'{e.__class__.__name__} error: {str(e).strip()}')
            else:
                whole_texts_list.append(text)
                if verbose: print(chunk_filename, ':', f'{text.capitalize()}. ')

    # Return the text for all chunks detected
    return whole_texts_list

In [None]:

# Transcribe the wav file chunk by chunk and join the pieces into one string
whole_text = ' '.join(get_large_audio_transcription_on_silence(dst_path, verbose=False))


----

In [None]:

# pathlib ships with the standard library, so it is not installed from PyPI
# here (the PyPI package of that name is an obsolete backport)
tu.ensure_module_installed('langchain', verbose=False)
tu.ensure_module_installed('google-api-python-client', verbose=False)
tu.ensure_module_installed('google-auth-httplib2', verbose=False)
tu.ensure_module_installed('google-auth-oauthlib', verbose=False)

# Same distribution name as the transcript cell uses, so both refer to one package
tu.ensure_module_installed('youtube-transcript-api', verbose=False)
from pathlib import Path
from langchain.document_loaders import GoogleApiClient
from langchain.document_loaders import GoogleApiYoutubeLoader

# Credentials are read from a JSON file kept under the data/secrets folder
google_api_client = GoogleApiClient(
    service_account_path=Path('../data/secrets/google_client_secrets.json')
)

In [None]:

# Load the video through LangChain's Google API YouTube loader
loader = GoogleApiYoutubeLoader(
    google_api_client=google_api_client,
    video_ids=[video_id]
)
documents_list = loader.load()

In [None]:

# Take the first loaded document and list its attributes
Document_obj = documents_list[0]
dir(Document_obj)

In [None]:

# Preview the first 671 characters of the document's page_content
Document_obj.page_content[:671]

In [None]:

tu.ensure_module_installed('youtube_dl', upgrade=True, verbose=False)
from youtube_dl import YoutubeDL

# Ask youtube_dl for the video's info using the best audio-only format
# (video_url comes from the pytube download cell above)
# NOTE(review): extract_info is called without download=False, unlike the
# next cell — presumably this also downloads the audio; confirm
audio_downloder = YoutubeDL({'format':'bestaudio'})
audio_downloder.extract_info(video_url)

In [None]:

tu.ensure_module_installed('youtube_dl', upgrade=True, verbose=False)
import youtube_dl

# Look up the video's metadata first, without downloading anything
video_info = youtube_dl.YoutubeDL().extract_info(
    url=video_url,
    download=False
)
title = video_info['title']
filename = f'{title}.mp3'
options={
    'format':'bestaudio/best',
    'keepvideo':False,

    # outtmpl is a %-style template, so a literal percent sign in the title
    # must be doubled or the template cannot be expanded
    'outtmpl':filename.replace('%', '%%'),
}

# Download the audio under the title-based file name
with youtube_dl.YoutubeDL(options) as ydl:
    ydl.download([video_info['webpage_url']])
print(f'Download complete... {filename}')


----

In [20]:

# Print every non-null field of the saved evaluation record for the chosen model
mask_series = (models_df.model_name == best_model)
best_model_fields_df = models_df[mask_series].T.dropna()
for row_index, row_series in best_model_fields_df.items():
    for column_name, column_value in row_series.items():
        print(f'{column_name}: {column_value}')

model_name: ggml-wizardLM-7B.q4_2.bin
md5sum: 99e6d129745a3f1fb1121abed747b05a
filesize: 4212864640
description: A non-commercially licensable model based on Llama 7b and trained by Microsoft and Peking University.
model: ggml-wizardLM-7B.q4_2
usage_prompt_tokens: 1171.0
usage_completion_tokens: 514.0
usage_total_tokens: 1685.0
choices_message_role: assistant
choices_message_content: Sure, I can help with that. Here's the text broken up into sentences and paragraphs:

Welcome everyone, this is not a Voices with Friviki. This is a new entity I'm calling a video essay. Under good advice from the two gentlemen that are joining me, it was proposed to me, and I accept the proposal that this should have a little bit more of a dialogical uh structure to it. Given the value of dialogue as I've been explaining it in other work, I took this deeply to heart. I am going to present still an essay, and
similarity: 0.6071133167907361
