In [None]:
# Install the notebook's dependencies into the running kernel's environment.
# moviepy is held below 2.0 because this notebook imports `moviepy.editor`,
# which only exists in the 1.x series.
%pip install -q pydub "moviepy<2" speechrecognition googletrans==4.0.0-rc1 transformers torch nltk


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting speechrecognition
  Downloading SpeechRecognition-3.10.4-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.7.1-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading charde

In [None]:
import os
import moviepy.editor as mp
import warnings
from pydub import AudioSegment
from pydub.silence import split_on_silence
import speech_recognition as sr
from googletrans import Translator
import nltk
from transformers import pipeline

In [None]:
# Download the NLTK sentence-tokenizer data used by nltk.tokenize.sent_tokenize.
# Both resource names are requested because recent NLTK releases load
# "punkt_tab" while older ones load "punkt".
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module='transformers.pipelines.token_classification')
warnings.filterwarnings("ignore", category=UserWarning, module='transformers.modeling_utils')

# Functions for the features

def video_to_audio(in_path, out_path):
    """Extract the audio track of a video file and write it to ``out_path``.

    Args:
        in_path: Path of the video file to read.
        out_path: Path of the audio file to create; it is handed to MoviePy's
            ``write_audiofile`` unchanged.

    Raises:
        ValueError: If the video has no audio track.
    """
    video = mp.VideoFileClip(in_path)
    try:
        if video.audio is None:
            raise ValueError(f"No audio track found in {in_path!r}")
        video.audio.write_audiofile(out_path)
    finally:
        # Always release the resources held by the clip, even if writing fails.
        video.close()

def large_audio_to_text(path, recognizer=None):
    """Transcribe a WAV file by splitting it on silence and recognizing each chunk.

    Every chunk is exported to ``audio-chunks/chunk<i>.wav`` and passed to
    ``recognize_google``. Chunks that cannot be recognized are reported on
    stdout and skipped, so a single bad chunk does not abort the transcription.

    Args:
        path: Path of the WAV file to transcribe.
        recognizer: Optional ``sr.Recognizer`` to use. A new one is created
            when omitted, so the function does not depend on a global
            recognizer having been defined beforehand.

    Returns:
        tuple: ``(whole_text, timestamps)``. ``whole_text`` is the
        concatenation of every recognized chunk, capitalized and followed by
        ". ". ``timestamps`` is a list of ``(text, start_seconds)`` pairs, one
        per recognized chunk.
    """
    if recognizer is None:
        recognizer = sr.Recognizer()

    sound = AudioSegment.from_wav(path)
    chunks = split_on_silence(sound, min_silence_len=700, silence_thresh=sound.dBFS-14, keep_silence=700)
    folder_name = "audio-chunks"
    os.makedirs(folder_name, exist_ok=True)

    sentences = []
    timestamps = []
    # Running total of the durations of the chunks already visited. The chunk
    # length is divided by 1000 to obtain seconds (pydub lengths are taken to
    # be milliseconds). This is an approximation of the position in the source
    # audio: silence dropped by split_on_silence is not accounted for.
    elapsed_ms = 0

    for i, audio_chunk in enumerate(chunks, start=1):
        start_time = elapsed_ms / 1000
        elapsed_ms += len(audio_chunk)

        chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
        audio_chunk.export(chunk_filename, format="wav")
        with sr.AudioFile(chunk_filename) as source:
            try:
                audio_listened = recognizer.record(source)
                text = recognizer.recognize_google(audio_listened)
                timestamps.append((text, start_time))
                sentences.append(f"{text.capitalize()}. ")
            except sr.UnknownValueError:
                print(f"Could not understand audio in chunk {i}. Skipping.")
            except sr.RequestError as e:
                print(f"Could not request results for chunk {i}: {e}")
            except Exception as e:
                # Deliberately best-effort: report and move on to the next chunk.
                print(f"Error processing chunk {i}: {e}")

    if not timestamps:
        print("No valid audio chunks found.")

    return "".join(sentences), timestamps

def translate_text(text, lang):
    """Translate ``text`` into the language identified by ``lang``.

    Args:
        text: Text to translate.
        lang: Destination language code passed to googletrans as ``dest``
            (for example "fr", "es" or "ar").

    Returns:
        The translated text. ``text`` is returned unchanged when it is empty
        or blank, or when the translation fails (the failure is reported on
        stdout).
    """
    if not text or not text.strip():
        # Nothing to translate; skip the translation request entirely.
        return text

    # Imported locally because only Translator is imported at module level;
    # without this name the error message below would raise a NameError and
    # hide the real translation failure.
    from googletrans import LANGUAGES

    translator = Translator()
    try:
        return translator.translate(text, dest=lang).text
    except Exception as e:
        print(f"Error translating text to {LANGUAGES.get(lang, 'unknown language')}: {e}")
        return text

def restore_punctuation(text):
    """Normalize sentence spacing in ``text``.

    The text is split into sentences with NLTK's ``sent_tokenize`` and the
    sentences are joined back together with a single space. No punctuation
    marks are inserted by this function.

    Args:
        text: Text to process.

    Returns:
        The sentences of ``text`` joined by single spaces.
    """
    return ' '.join(nltk.tokenize.sent_tokenize(text))

def summarize_text(text):
    """Summarize ``text`` with the ``sshleifer/distilbart-cnn-12-6`` model.

    The summarization pipeline is created on every call.

    Args:
        text: Text to summarize.

    Returns:
        The generated summary, or an empty string when ``text`` is empty or
        blank.
    """
    if not text or not text.strip():
        return ""

    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
    # Let the pipeline truncate over-long input itself. Encoding and decoding
    # the text by hand leaves the special-token strings in the decoded text,
    # which are tokenized again and can push the input past the model limit.
    summary = summarizer(text, truncation=True, max_length=130, min_length=30, do_sample=False)
    return summary[0]['summary_text']

def named_entity_recognition(text, min_score=0.5):
    """Perform Named Entity Recognition (NER) on ``text``.

    Uses the ``dbmdz/bert-large-cased-finetuned-conll03-english`` model with
    ``aggregation_strategy="simple"``. The pipeline is created on every call.

    Args:
        text: Text to analyze.
        min_score: Entities whose ``score`` is not strictly greater than this
            value are discarded. Defaults to 0.5.

    Returns:
        list: The entity dicts returned by the pipeline that pass the
        confidence filter; an empty list when ``text`` is empty or blank.
    """
    if not text or not text.strip():
        return []

    ner = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")
    # Filter out low-confidence entities
    return [entity for entity in ner(text) if entity['score'] > min_score]

# Main script
# Shared recognizer instance; kept under the name `r` for code that looks it
# up as a global.
r = sr.Recognizer()

# Upload the video file (Colab only)
from google.colab import files
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded.")

# The first uploaded file name is used as the video path.
video_path = next(iter(uploaded))

# Convert video to audio
audio_path = 'sample_audio.wav'
video_to_audio(video_path, audio_path)

# Convert audio to text
text, timestamps = large_audio_to_text(audio_path)

# Punctuation restoration
text = restore_punctuation(text)

# Translation
language = input("Choose a language (fr for French, es for Spanish, ar for Arabic): ")
translated_text = translate_text(text, language)

# NOTE(review): summarization and NER below run on the translated text, while
# the NER model name ends in "english" - confirm whether they should run on
# `text` instead.

# Summarization
summary = summarize_text(translated_text)

# Named Entity Recognition
entities = named_entity_recognition(translated_text)

# Format NER output
formatted_entities = [
    {
        'Entity': entity['word'],
        'Entity Group': entity['entity_group'],
        'Confidence Score': round(entity['score'], 4),
        'Start Position': entity['start'],
        'End Position': entity['end']
    }
    for entity in entities
]

# One display line per entity, shared by the console output and the result file.
entity_lines = [
    f"Entity: {entity['Entity']}, Group: {entity['Entity Group']}, "
    f"Confidence: {entity['Confidence Score']}, Start: {entity['Start Position']}, "
    f"End: {entity['End Position']}"
    for entity in formatted_entities
]

# Print results
print("Original Text:", text)
print("Translated Text:", translated_text)
print("Summary:", summary)
print("Named Entities:")
for line in entity_lines:
    print(line)

# Save results to file
# An explicit UTF-8 encoding keeps non-ASCII translations (e.g. Spanish or
# Arabic) writable regardless of the platform's default encoding.
with open('result.txt', 'w', encoding='utf-8') as f:
    f.write(f"Original Text:\n{text}\n\n")
    f.write(f"Translated Text ({language}):\n{translated_text}\n\n")
    f.write(f"Summary:\n{summary}\n\n")
    f.write("Named Entities:\n")
    for line in entity_lines:
        f.write(f"{line}\n")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Saving A one minute TEDx Talk for the digital age _ Woody Roseland _ TEDxMileHigh.mp4 to A one minute TEDx Talk for the digital age _ Woody Roseland _ TEDxMileHigh.mp4
MoviePy - Writing audio in sample_audio.wav




MoviePy - Done.
Could not understand audio in chunk 1. Skipping.
Could not understand audio in chunk 7. Skipping.
Could not understand audio in chunk 13. Skipping.
Could not understand audio in chunk 20. Skipping.
Choose a language (fr for French, es for Spanish, ar for Arabic): es


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Original Text: Wow. What an audience. But if i'm being honest. I don't care what you think of my talk. I don't. Cuz they're the ones who get it seen and get it shared and i think that's where most people get it wrong they're talking to you here. Set of talking to you random person. Scrolling facebook. Thanks for the click. You see back in 2009 we all had these weird little things called attention spans. I'm trying to think of the last time i watched an 18 minute ted talk. It's been years literally years. So if you're giving a ted talk keep it quick i'm doing mine and under a minute. I'm at 44 seconds right now that means we got time for one final joke. Why are balloons so expensive. Inflation.
Translated Text: Guau.Qué audiencia.Pero si estoy siendo honesto.No me importa lo que pienses de mi charla.No.Porque, son los que lo vean y lo comparten y creo que ahí es donde la mayoría de las personas se equivocan, están hablando contigo aquí.Conjunto de hablar con tu persona al azar.Desplazan