In [1]:
import os
import ffmpeg
import whisper
from docx import Document


In [2]:
def extract_audio(video_path, output_audio_path):
    """
    Pull the audio track out of a video and write it to its own file.

    The output is requested as a single channel (ac=1) at a 16k sample
    rate (ar='16k'); an existing file at the target path is overwritten.

    :param video_path: Path of the video to read.
    :param output_audio_path: Path of the audio file to create.
    """
    source_stream = ffmpeg.input(video_path)
    audio_stream = source_stream.output(output_audio_path, ac=1, ar='16k')
    audio_stream.run(overwrite_output=True)


In [3]:
# Whisper models already loaded in this kernel, keyed by model name.
# Loading a model is expensive, so each one is loaded once and then reused.
_WHISPER_MODELS = {}


def transcribe_audio(audio_path, model_name="base"):
    """
    Transcribes audio using Whisper.

    The model for ``model_name`` is loaded on first use and cached, so calling
    this function once per file (as a batch loop does) no longer reloads the
    model for every file. Cached models stay in memory until the kernel is
    restarted or this cell is re-run.

    :param audio_path: Path of the audio file to transcribe.
    :param model_name: Whisper model name passed to ``whisper.load_model``.
    :return: The ``'text'`` entry of the transcription result.
    """
    model = _WHISPER_MODELS.get(model_name)
    if model is None:
        model = whisper.load_model(model_name)
        _WHISPER_MODELS[model_name] = model

    result = model.transcribe(audio_path)
    return result['text']


In [4]:
def save_to_text(transcript, output_path):
    """
    Saves the transcript to a text file.

    The file is always written as UTF-8. Without an explicit encoding,
    ``open`` uses the platform's locale encoding, which can raise
    ``UnicodeEncodeError`` on transcript characters it cannot represent.

    :param transcript: Transcript text to write.
    :param output_path: Path of the text file to create or overwrite.
    """
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write(transcript)

def save_to_word(transcript, output_path):
    """
    Write the transcript into a new Word document.

    The whole transcript is added as a single paragraph.

    :param transcript: Transcript text to store.
    :param output_path: Path of the Word file to save.
    """
    word_doc = Document()
    word_doc.add_paragraph(transcript)
    word_doc.save(output_path)


In [5]:
def process_videos(input_folder, output_folder, model_name="base",
                   video_extensions=('.mp4', '.avi', '.mkv')):
    """
    Processes all videos in a folder and saves the transcripts.

    For every file directly inside ``input_folder`` whose name ends with one of
    ``video_extensions``, three files named after the video are written to
    ``output_folder``: the extracted audio (.wav), the transcript as plain
    text (.txt) and the transcript as a Word document (.docx).

    :param input_folder: Folder containing the video files (not searched recursively).
    :param output_folder: Folder for the generated files; created if missing.
    :param model_name: Whisper model name forwarded to ``transcribe_audio``.
    :param video_extensions: File-name suffixes treated as videos. Matching
        ignores case, so "clip.MP4" is picked up as well as "clip.mp4".
    """
    os.makedirs(output_folder, exist_ok=True)

    # Accept a single suffix given as a plain string as well as a sequence.
    if isinstance(video_extensions, str):
        video_extensions = (video_extensions,)
    # str.endswith needs a tuple; lower-casing both sides makes the match case-insensitive.
    suffixes = tuple(extension.lower() for extension in video_extensions)

    # os.listdir returns names in arbitrary order; sort so runs are reproducible.
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.lower().endswith(suffixes):
            continue

        stem = os.path.splitext(file_name)[0]
        video_path = os.path.join(input_folder, file_name)
        audio_path = os.path.join(output_folder, f"{stem}.wav")
        text_path = os.path.join(output_folder, f"{stem}.txt")
        word_path = os.path.join(output_folder, f"{stem}.docx")

        # Extract audio
        print(f"Extracting audio from {file_name}...")
        extract_audio(video_path, audio_path)

        # Transcribe audio
        print(f"Transcribing {file_name}...")
        transcript = transcribe_audio(audio_path, model_name=model_name)

        # Save transcripts
        print(f"Saving transcript for {file_name}...")
        save_to_text(transcript, text_path)
        save_to_word(transcript, word_path)


In [7]:
# Example run: transcribe every video found directly inside input_folder.
# NOTE: both paths are absolute locations on one specific Windows machine;
# edit them before running this cell anywhere else.
input_folder = r"F:\Jupyter_Notebook\NLP_Internshala\Getting Started with NLP (1)\Getting Started with NLP"  # Replace with the path to your video files
output_folder = r"F:\Jupyter_Notebook\NLP_Internshala\Getting Started with NLP (1)\Getting Started with NLP\Transcripts"  # Replace with the path to save transcripts

# The .wav, .txt and .docx files for each video are all written to output_folder.
process_videos(input_folder, output_folder, model_name="base")


Extracting audio from M1_T1_V1_What is Natural Language Processing (NLP)_1-1.mp4...
Transcribing M1_T1_V1_What is Natural Language Processing (NLP)_1-1.mp4...


100%|███████████████████████████████████████| 139M/139M [00:48<00:00, 3.01MiB/s]


Saving transcript for M1_T1_V1_What is Natural Language Processing (NLP)_1-1.mp4...
Extracting audio from M1_T1_V2_Applications of Natural Language Processing (NLP)_1-2.mp4...
Transcribing M1_T1_V2_Applications of Natural Language Processing (NLP)_1-2.mp4...
Saving transcript for M1_T1_V2_Applications of Natural Language Processing (NLP)_1-2.mp4...
Extracting audio from M1_T3_V1_Text Processing_1-3.mp4...
Transcribing M1_T3_V1_Text Processing_1-3.mp4...
Saving transcript for M1_T3_V1_Text Processing_1-3.mp4...
Extracting audio from M1_T3_V2_Reading Text Data_1-4.mp4...
Transcribing M1_T3_V2_Reading Text Data_1-4.mp4...
Saving transcript for M1_T3_V2_Reading Text Data_1-4.mp4...
Extracting audio from M1_T3_V3_Reading Text Data 2_1-5.mp4...
Transcribing M1_T3_V3_Reading Text Data 2_1-5.mp4...
Saving transcript for M1_T3_V3_Reading Text Data 2_1-5.mp4...
Extracting audio from M1_T4_V1_What are Regular Expressions__1-6.mp4...
Transcribing M1_T4_V1_What are Regular Expressions__1-6.mp4...
S

In [8]:
import os
from pathlib import Path
import whisper

def process_videos_recursive(base_folder, model_name="base"):
    """
    Walk a folder tree and transcribe every video found in it.

    Each folder that contains at least one video gets a 'Transcripts'
    subfolder; the transcript of a video is stored there as
    "<video file name>.txt". Videos whose transcript file already exists
    are skipped, and an error on one video is printed without stopping
    the run.

    :param base_folder: The path to the base folder containing subfolders with videos.
    :param model_name: Whisper model name to use for transcription (e.g., "base", "small", "medium").
    """
    video_suffixes = ('.mp4', '.avi', '.mov', '.mkv', '.flv')

    # A single model instance serves every transcription in this run.
    model = whisper.load_model(model_name)

    for root, _dirs, files in os.walk(base_folder):
        current_dir = Path(root)
        videos_here = [name for name in files if name.lower().endswith(video_suffixes)]

        # Nothing to transcribe in this folder, so no 'Transcripts' folder is made.
        if not videos_here:
            continue

        transcript_folder = current_dir / "Transcripts"
        transcript_folder.mkdir(exist_ok=True)

        for video_name in videos_here:
            video_path = current_dir / video_name
            transcript_path = transcript_folder / (video_name + ".txt")

            # An existing transcript means this video was handled in an earlier run.
            if transcript_path.exists():
                print(f"Skipping {video_path} (transcript already exists).")
                continue

            print(f"Processing {video_path}...")
            try:
                result = model.transcribe(str(video_path))

                with open(transcript_path, "w", encoding="utf-8") as transcript_file:
                    transcript_file.write(result["text"])

                print(f"Transcript saved: {transcript_path}")
            except Exception as e:
                # Report the failure and carry on with the remaining videos.
                print(f"Error processing {video_path}: {e}")

# Base folder containing your subfolders with videos.
# NOTE: machine-specific absolute Windows path; edit it before running elsewhere.
base_folder = r"F:\Jupyter_Notebook\NLP_Internshala\classVideos"
# Transcripts are written to a "Transcripts" folder created next to the videos;
# videos that already have a transcript there are skipped.
process_videos_recursive(base_folder, model_name="base")


Processing F:\Jupyter_Notebook\NLP_Internshala\classVideos\Feature Engineering for Text Data\Feature Engineering for Text Data\M5_T1_V1_Introduction to Feature Engineering for Text Data_1-58.mp4...
Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Feature Engineering for Text Data\Feature Engineering for Text Data\Transcripts\M5_T1_V1_Introduction to Feature Engineering for Text Data_1-58.mp4.txt
Processing F:\Jupyter_Notebook\NLP_Internshala\classVideos\Feature Engineering for Text Data\Feature Engineering for Text Data\M5_T2_V1_Text Feature Engineering Techniques_1-59.mp4...
Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Feature Engineering for Text Data\Feature Engineering for Text Data\Transcripts\M5_T2_V1_Text Feature Engineering Techniques_1-59.mp4.txt
Processing F:\Jupyter_Notebook\NLP_Internshala\classVideos\Feature Engineering for Text Data\Feature Engineering for Text Data\M5_T2_V2_Text Feature Engineering Implementation_1-60.mp4...
Transcri

Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Information Retrieval and Ranked Retrieval\Information Retrieval and Ranked Retrieval\Transcripts\M3_T7_V2_Loading the Dataset and Retrieving Documents using Jaccard Coefficient_1-43.mp4.txt
Processing F:\Jupyter_Notebook\NLP_Internshala\classVideos\Information Retrieval and Ranked Retrieval\Information Retrieval and Ranked Retrieval\M3_T7_V3_Ranked Retrieval using Term Frequency (TF) and Inverse Document Frequency (IDF)_1-44.mp4...
Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Information Retrieval and Ranked Retrieval\Information Retrieval and Ranked Retrieval\Transcripts\M3_T7_V3_Ranked Retrieval using Term Frequency (TF) and Inverse Document Frequency (IDF)_1-44.mp4.txt
Processing F:\Jupyter_Notebook\NLP_Internshala\classVideos\Information Retrieval and Ranked Retrieval\Information Retrieval and Ranked Retrieval\M3_T7_V4_Ranked Retrieval using TF-IDF and vector space model_1-45.mp4...
Transcript s

Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text classification\Text classification\Transcripts\M6_T6_V1_Auto tagging stack exchange questions Part 1_1-10.mp4.txt
Processing F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text classification\Text classification\M6_T6_V2_Auto tagging stack exchange questions Part 2_1-11.mp4...
Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text classification\Text classification\Transcripts\M6_T6_V2_Auto tagging stack exchange questions Part 2_1-11.mp4.txt
Processing F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text Pre-processing and Information Extraction\Text Pre-processing and Information Extraction\M2_T10_V1_Relation Extraction_1-12.mp4...
Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text Pre-processing and Information Extraction\Text Pre-processing and Information Extraction\Transcripts\M2_T10_V1_Relation Extraction_1-12.mp4.txt
Processing F:\Jupyter_Notebook\NLP_Internshala\classVid

Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text Pre-processing and Information Extraction\Text Pre-processing and Information Extraction\Transcripts\M2_T9_V1_Named Entity Recognition (NER)_1-30.mp4.txt
Processing F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text Pre-processing and Information Extraction\Text Pre-processing and Information Extraction\M2_T9_V2_NER Implementation_1-31.mp4...
Transcript saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text Pre-processing and Information Extraction\Text Pre-processing and Information Extraction\Transcripts\M2_T9_V2_NER Implementation_1-31.mp4.txt


In [15]:
import os
from pathlib import Path
from docx import Document

def merge_texts_to_word(input_folder, output_file):
    """
    Combine the .txt files of one folder into a single Word document.

    Files are taken in sorted path order. Each file contributes a level-1
    heading (its name without the final extension), one paragraph holding
    the file's text, and a separator paragraph.

    :param input_folder: Path to the folder containing text files.
    :param output_file: Path to the output Word document.
    """
    merged_doc = Document()

    text_paths = sorted(Path(input_folder).glob("*.txt"))
    for text_path in text_paths:
        # Heading text is the file name with its last extension removed.
        merged_doc.add_heading(text_path.stem, level=1)

        body = text_path.read_text(encoding="utf-8")
        merged_doc.add_paragraph(body)

        # Spacer paragraph between consecutive files.
        merged_doc.add_paragraph("\n")

    merged_doc.save(output_file)
    print(f"Word document saved: {output_file}")

# Folder containing text files (only the .txt files directly inside it are merged).
# NOTE: this rebinds input_folder, which an earlier cell set to a different
# folder; both paths are machine-specific absolute Windows paths.
input_folder = r"F:\Jupyter_Notebook\NLP_Internshala\classVideos\Getting Started with NLP (1)\Getting Started with NLP\Transcripts"

# Path for the output Word document
output_file = r"F:\Jupyter_Notebook\NLP_Internshala\classVideos\Getting Started with NLP.docx"

merge_texts_to_word(input_folder, output_file)


Word document saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Getting Started with NLP.docx


In [10]:
import os
from pathlib import Path
from docx import Document

def merge_texts_with_structure(input_folder, output_file=None):
    """
    Merge all text files in a folder into a Word document with chapters and subsections.

    File names (without the final extension) are split on "_". If the first
    part starts with "M", the rest of that part names the chapter
    ("M2" -> "Chapter 2"); otherwise the file goes under "Unknown Chapter".
    The last part is used as the subsection heading. Chapters and subsections
    are ordered number-aware, so "Chapter 2" comes before "Chapter 10".

    :param input_folder: Path to the folder containing text files.
    :param output_file: Path of the Word document to write. When omitted, the
        location this notebook originally hard-coded is used, so existing
        calls keep writing to the same file.
    """
    import re  # local import: only the sort key below needs it

    if output_file is None:
        # Backward-compatible default (machine-specific absolute path).
        output_file = r"F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text Pre-processing and Information Extraction.docx"

    def natural_key(label):
        # re.split with a capturing group alternates text, digits, text, ...
        # so odd positions are always digit runs and compare as integers.
        chunks = re.split(r"(\d+)", label)
        return [int(chunk) if index % 2 else chunk for index, chunk in enumerate(chunks)]

    # Create a new Word document
    document = Document()

    # Dictionary to group files by chapters
    chapters = {}

    # Iterate over all text files in the input folder
    for text_file in sorted(Path(input_folder).glob("*.txt")):
        # Extract parts from the filename
        parts = text_file.stem.split("_")

        # Parse chapter and subsection
        chapter = f"Chapter {parts[0][1:]}" if parts[0].startswith("M") else "Unknown Chapter"
        subsection = parts[-1] if len(parts) > 1 else "Unnamed Subsection"

        # Group by chapter
        chapters.setdefault(chapter, []).append((subsection, text_file))

    # Add chapters and subsections to the Word document
    for chapter in sorted(chapters, key=natural_key):
        # Add chapter heading
        document.add_heading(chapter, level=1)

        # Ties on the subsection label fall back to the file path, as before.
        ordered = sorted(chapters[chapter], key=lambda entry: (natural_key(entry[0]), entry[1]))
        for subsection, text_file in ordered:
            # Add subsection heading
            document.add_heading(subsection, level=2)

            # Read and add content of the text file
            with open(text_file, "r", encoding="utf-8") as f:
                content = f.read()
            document.add_paragraph(content)
            document.add_paragraph("\n")  # Add a blank line for separation

    # Save the Word document
    document.save(output_file)
    print(f"Word document saved: {output_file}")

# Folder containing text files (only the .txt files directly inside it are merged).
# NOTE: machine-specific absolute Windows path; edit it before running elsewhere.
input_folder = r"F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text Pre-processing and Information Extraction\Text Pre-processing and Information Extraction\Transcripts"

# No output path is passed here: the destination of the Word document is
# decided inside merge_texts_with_structure.
merge_texts_with_structure(input_folder)


Word document saved: F:\Jupyter_Notebook\NLP_Internshala\classVideos\Text Pre-processing and Information Extraction.docx
