In [9]:
token = "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"

In [10]:
import cv2
import os

def save_video_frames(video_path, output_folder):
    # Load the video
    cap = cv2.VideoCapture(video_path)
    
    # Check if video loaded successfully
    if not cap.isOpened():
        print("Error: Could not open video.")
        return
    
    # Create the output folder if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Get total frame count
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"Total frames: {total_frames}")

    frame_index = 0
    
    # Loop through all frames
    while True:
        ret, frame = cap.read()
        
        # Break the loop if no frame is returned (end of video)
        if not ret:
            break

        # Save frame as an image
        frame_filename = os.path.join(output_folder, f"frame_{frame_index:04d}.jpg")
        cv2.imwrite(frame_filename, frame)
        
        print(f"Saved {frame_filename}")
        
        frame_index += 1

    # Release the video capture object
    cap.release()
    print("All frames saved successfully.")

# Usage
video_path = "test_video1.mp4"  # Path to the input video
output_folder = "output_frames"  # Folder to save frames
# save_video_frames(video_path, output_folder)


In [11]:
import cv2

def calculate_pixel_similarity(img1_path, img2_path):
    # Load the images in grayscale
    img1 = cv2.imread(img1_path)
    img2 = cv2.imread(img2_path)
    
    # Check if images loaded correctly
    if img1 is None or img2 is None:
        print("Error: One or both images could not be loaded.")
        return None

    # Ensure the images are the same size
    if img1.shape != img2.shape:
        print("Error: Images must have the same dimensions.")
        return None

    # Calculate pixel-wise difference
    similarity = cv2.norm(img1, img2, cv2.NORM_L2) / (img1.size)
    similarity_score = 1 - similarity  # Normalize similarity to 0-1 scale

    return similarity

# Usage example
img1_path = r"output_frames/frame_0619.jpg"
img2_path = r"output_frames/frame_0621.jpg"
similarity_score = calculate_image_similarity(img1_path, img2_path)
print(f"Similarity score: {similarity_score:.4f}")


NameError: name 'calculate_image_similarity' is not defined

In [24]:
import cv2
from skimage.metrics import structural_similarity as ssim

def calculate_ssim_similarity(img1, img2):

    # Check if images loaded correctly
    if img1 is None or img2 is None:
        print("Error: One or both images could not be loaded.")
        return None

    # Ensure the images are the same size
    if img1.shape != img2.shape:
        print("Error: Images must have the same dimensions.")
        return None

    # Calculate SSIM between the two images
    similarity_score, _ = ssim(img1, img2, full=True)

    return similarity_score

# Usage example
img1_path = r"output_frames/frame_0619.jpg"
img2_path = r"output_frames/frame_0621.jpg"
similarity_score = calculate_ssim_similarity(img1_path, img2_path)
print(f"SSIM Similarity score: {similarity_score:.4f}")


SSIM Similarity score: 0.9758


In [27]:
import cv2

def calculate_histogram_similarity(img1_path, img2_path):
    # Load the images in color
    img1 = cv2.imread(img1_path)
    img2 = cv2.imread(img2_path)
    
    # Check if images loaded correctly
    if img1 is None or img2 is None:
        print("Error: One or both images could not be loaded.")
        return None

    # Resize images to the same size if they differ
    if img1.shape != img2.shape:
        img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

    # Convert images to HSV color space
    img1_hsv = cv2.cvtColor(img1, cv2.COLOR_BGR2HSV)
    img2_hsv = cv2.cvtColor(img2, cv2.COLOR_BGR2HSV)

    # Calculate histograms and normalize them
    hist_img1 = cv2.calcHist([img1_hsv], [0, 1], None, [50, 60], [0, 180, 0, 256])
    hist_img2 = cv2.calcHist([img2_hsv], [0, 1], None, [50, 60], [0, 180, 0, 256])
    cv2.normalize(hist_img1, hist_img1, 0, 1, cv2.NORM_MINMAX)
    cv2.normalize(hist_img2, hist_img2, 0, 1, cv2.NORM_MINMAX)

    # Compare histograms using correlation
    similarity_score = cv2.compareHist(hist_img1, hist_img2, cv2.HISTCMP_CORREL)

    return similarity_score

img1_path = r"output_frames/frame_0620.jpg"
img2_path = r"output_frames/frame_0067.jpg"
similarity_score = calculate_histogram_similarity(img1_path, img2_path)
print(f"Histogram Similarity score: {similarity_score:.4f}")


Histogram Similarity score: -0.0008


In [3]:
import cv2

def extract_video_frames(video_path, frame_interval=1):
    """
    Extracts frames from a video at the specified interval.

    Parameters:
    - video_path (str): Path to the video file.
    - frame_interval (int): Interval at which frames are saved (e.g., 1 = every frame, 2 = every other frame).
    
    Returns:
    - frames (list): List of frames extracted from the video.
    """
    # Load the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error: Could not open video.")
        return []

    # Retrieve frame rate to process frames efficiently
    frame_rate = cap.get(cv2.CAP_PROP_FPS)
    print(f"Frame rate: {frame_rate} FPS")

    frames = []
    frame_index = 0

    while True:
        ret, frame = cap.read()

        # Break the loop if there are no frames left to read
        if not ret:
            break

        # Process only every `frame_interval` frame
        if frame_index % frame_interval == 0:
            frames.append(frame)

        frame_index += 1

    cap.release()
    print(f"Total frames extracted: {len(frames)}")

    return frames

# Usage example
video_path = "test_video1.mp4"  # Path to your video file
frame_interval = 1  # Extract every second frame
frames = extract_video_frames(video_path, frame_interval)
print(f"Extracted {len(frames)} frames.")


Frame rate: 60.0 FPS
Total frames extracted: 2581
Extracted 2581 frames.


In [None]:
# import cv2
# import numpy as np

# from skimage.metrics import structural_similarity as ssim

# def find_stable_frames(frames, frame_rate, similarity_threshold=0.95, stable_duration=5):
#     """
#     Identify stable frames in a list of video frames.

#     Parameters:
#     - frames (list): List of frames extracted from a video.
#     - frame_rate (float): Frame rate of the video.
#     - similarity_threshold (float): SSIM similarity threshold for consecutive frames to be considered stable.
#     - stable_duration (float): Minimum duration in seconds for frames to be considered stable.

#     Returns:
#     - stable_frames (list): List of tuples with start and end timestamps for stable frames.
#     """
#     # Calculate the required consecutive stable frame count
#     required_stable_frames = int(stable_duration * frame_rate)
#     stable_frames = []
    
#     stable_frame_count = 0
#     start_time = None

#     start_index = None
#     end_index = None

#     # Iterate through consecutive frames and calculate SSIM
#     for i in range(1, len(frames)):
        
#         if i%100==0:
#             print("processing : ",i)
#         # Convert frames to grayscale
#         frame1_gray = cv2.cvtColor(frames[i - 1], cv2.COLOR_BGR2GRAY)
#         frame2_gray = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)
        
#         # Calculate SSIM similarity between consecutive frames
#         similarity_score, _ = ssim(frame1_gray, frame2_gray, full=True)
        
#         # Check if the similarity meets the threshold
#         if similarity_score >= similarity_threshold:
#             # Start tracking stable sequence
#             if stable_frame_count == 0:
#                 start_time = (i - 1) / frame_rate  # Start timestamp of stable sequence
#             if not start_index:
#                 start_index = i
#             stable_frame_count += 1
#         else:
#             end_index = i-1
#             # End stable sequence if similarity falls below threshold
#             if stable_frame_count >= required_stable_frames:
#                 end_time = (i - 1) / frame_rate  # End timestamp of stable sequence
#                 if end_index> start_index:
#                     stable_frames.append([frames[np.random.randint(start_index+1,end_index-1)],(start_time, end_time)])
#                 else:
#                     stable_frames.append([frames[start_index],(start_time, end_time)])
#             # Reset stable frame count
#             stable_frame_count = 0
#             start_index = None

#     # Handle case where last sequence is stable
#     if stable_frame_count >= required_stable_frames:
#         end_index = i-1
#         end_time = (len(frames) - 1) / frame_rate
#         stable_frames.append([frames[start_index],(start_time, end_time)])

#     return stable_frames

# # Example Usage
# # Assuming 'frames' is the list of frames extracted from the video
# frame_rate = 60  # Replace with the actual frame rate of the video
# stable_frames = find_stable_frames(frames, frame_rate, similarity_threshold=0.99, stable_duration=5)



In [2]:
import gc
import os
import cv2
import numpy as np
from skimage.metrics import structural_similarity as ssim
from moviepy.editor import VideoFileClip
from faster_whisper import WhisperModel
from utils.helper import create_blog_post
import ffmpeg
import soundfile as sf
import os


class VideoToBlog:

    def __init__(self,video_path,title,frame_interval=1,similarity_threshold=0.99,stable_duration_sec=5,transcription_model="large-v3",device="cpu",compute_type="int8"):
        self.video_path = video_path
        self.title=title
        self.frame_interval = frame_interval
        self.similarity_threshold = similarity_threshold
        self.stable_duration_sec = stable_duration_sec
        os.makedirs("stable_frames",exist_ok=True)
        self.__transcription_model = WhisperModel(transcription_model, device=device, compute_type=compute_type)

    def extract_video_frames(self):
        """
        Extracts frames from a video at the specified interval.

        Parameters:
        - video_path (str): Path to the video file.
        - frame_interval (int): Interval at which frames are saved (e.g., 1 = every frame, 2 = every other frame).
        
        Returns:
        - frames (list): List of frames extracted from the video.
        """
        # Load the video
        cap = cv2.VideoCapture(self.video_path)
        if not cap.isOpened():
            print("Error: Could not open video.")
            return []

        # Retrieve frame rate to process frames efficiently
        frame_rate = cap.get(cv2.CAP_PROP_FPS)
        print(f"Frame rate: {frame_rate} FPS")

        frames = []
        frame_index = 0

        while True:
            ret, frame = cap.read()

            # Break the loop if there are no frames left to read
            if not ret:
                break

            # Process only every `frame_interval` frame
            if frame_index % self.frame_interval == 0:
                frames.append(frame)

            frame_index += 1

        cap.release()
        print(f"Total frames extracted: {len(frames)}")
        return frames, frame_rate

    def find_stable_frames(self,frames, frame_rate):
        """
        Identify stable frames in a list of video frames.

        Parameters:
        - frames (list): List of frames extracted from a video.
        - frame_rate (float): Frame rate of the video.
        - similarity_threshold (float): SSIM similarity threshold for consecutive frames to be considered stable.
        - stable_duration (float): Minimum duration in seconds for frames to be considered stable.

        Returns:
        - stable_frames (list): List of tuples with start and end timestamps for stable frames.
        """
        # Calculate the required consecutive stable frame count
        required_stable_frames = int(self.stable_duration_sec * frame_rate)
        stable_frames = []
        stable_frames_count=0
        start_and_end = []
        start_ind = None
        end_ind = None
        start_time = None
        end_time = None

        current_stable_frames = []

        for i in range(1, len(frames)):

            if i%100==0:

                print(f"processing : {i}/{len(frames)}")

            # Convert frames to grayscale
            frame1_gray = cv2.cvtColor(frames[i - 1], cv2.COLOR_BGR2GRAY)

            frame2_gray = cv2.cvtColor(frames[i], cv2.COLOR_BGR2GRAY)

            similarity_score, _ = ssim(frame1_gray, frame2_gray, full=True)

            if similarity_score>=self.similarity_threshold:

                if not start_time:

                    start_ind = i-1

                    stable_frames_count +=1

                    if i==0:

                        start_time=0.0001

                    else:

                        start_time = i/frame_rate

                    current_stable_frames.append(frames[i])

                else:

                    current_stable_frames.append(frames[i])

            else:

                if len(current_stable_frames)>=required_stable_frames:

                    end_ind = i-1

                    end_time = (i-1)/frame_rate

                    print(f"Stable Frame Starts {start_time} and ends at {end_time}")

                    stable_frames.append(frames[i-1])

                    start_and_end.append((start_time,end_time))

                    cv2.imwrite(f"stable_frames/image_{stable_frames_count}.jpg",frames[i-1])

                else:

                    pass
                    
                    # print("Skipping.. :",len(current_stable_frames),"Required : ",required_stable_frames)

                start_time,end_time, start_ind, end_ind, current_stable_frames = None,None,None,None,[]

        if len(current_stable_frames)>=required_stable_frames:

            end_ind = i=i

            end_time = (i-1)/frame_rate

            stable_frames.append(frames[np.random.randint(start_ind+1,end_ind-1)])

            start_and_end.append((start_time,end_time))

        gc.collect()
        
        return stable_frames, start_and_end
    
    def extract_audio(self,audio_output_path="audio.mp3"):
        """
        Extracts audio from a video file and saves it to the specified output path.

        :param video_path: Path to the input video file.
        :param audio_output_path: Path to save the extracted audio file.
        """
        # Load the video file
        video = VideoFileClip(self.video_path)
        
        # Extract audio
        audio = video.audio
        
        # Write audio to file
        audio.write_audiofile(audio_output_path)
        
        # Close the video file
        video.close()

        print("Audio Extracted Successfully....")

    def split_audio_with_ffmpeg(self,audio_path="audio_chunk.wav", timestamps=[], output_dir="audio_segments"):
        """
        Splits an audio file based on given timestamps using FFmpeg and saves each segment.

        Parameters:
        - audio_path (str): Path to the original audio file.
        - timestamps (list of tuples): List of (start, end) timestamps in seconds for splitting.
        - output_dir (str): Directory to store split audio segments.

        Returns:
        - List of paths to the saved audio segments.
        """
        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        transcriptions = []

        for i, (_, end) in enumerate(timestamps):

            if not len(transcriptions):
                start = 0

            # Define the output path for the segment
            segment_path = os.path.join(output_dir, audio_path)

            if len(timestamps)==i:

                (
                ffmpeg
                .input(audio_path, ss=start,)
                .output(segment_path)
                .run(quiet=True, overwrite_output=True)
                )
            else:
                (
                    ffmpeg
                    .input(audio_path, ss=start, to=end)
                    .output(segment_path)
                    .run(quiet=True, overwrite_output=True)
                )

            text = [i[2] for i in self.transcribe_audio(segment_path)]

            final_text = "\n".join(text) if len(text)>1 else text[0]

            transcriptions.append(final_text)

            start = end

        return transcriptions

            
    def transcribe_audio(self,file_path="audio.mp3"):

        segments, info = self.__transcription_model.transcribe(file_path, beam_size=5, language="en", condition_on_previous_text=False)
        
        transcripton = []

        for segment in segments:
            
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

            transcripton.append((segment.start, segment.end, segment.text))

        return transcripton


    def map_frames_and_transcription(self,start_and_end_frame,transcripton):
        
        final_transcription = []

        starting_list = [i[0] for i in transcripton]
        ending_list = [i[1] for i in transcripton]
        text_list = [i[2] for i in transcripton]

        start = 0

        for _, end in start_and_end_frame:

            # Find closest index in starting_list to start
            closest_start_index = min(enumerate(starting_list), key=lambda x: abs(x[1] - start))[0]

            # Find closest index in ending_list to end
            closest_end_index = min(enumerate(ending_list), key=lambda x: abs(x[1] - end))[0]

            start = ending_list[closest_end_index]

            if not len(final_transcription):

                closest_start_index = 0

            final_transcription.append("\n".join(text_list[closest_start_index:closest_end_index+1]).strip())

        return final_transcription


    def get_frames_and_time(self):

        print("***************** Extracting Frames From the Video *****************************")
        frames,frame_rate = self.extract_video_frames()
        print(f"Extracted {len(frames)} frames.")
        print("***************** Finding The Stable Frames From The Video **********************")
        stable_frames, start_and_end_frame = self.find_stable_frames(frames, frame_rate)
        # print("***************** Transcribing The Audio ****************************")
        self.extract_audio()
        transcripton = self.transcribe_audio()
        final_text = self.map_frames_and_transcription(start_and_end_frame,transcripton)
        # transcripton = self.split_audio_with_ffmpeg("audio.mp3",start_and_end_frame)
        create_blog_post(self.title,stable_frames,final_text)
        return "Blog Created Successfully......."
        # return start_and_end_frame,transcripton
        
video_path = r"samples\videoplayback.mp4"

model = VideoToBlog(video_path,"K-Means Clustering",stable_duration_sec=3)

model.get_frames_and_time()


***************** Extracting Frames From the Video *****************************
Frame rate: 30.0 FPS
Total frames extracted: 1698
Extracted 1698 frames.
***************** Finding The Stable Frames From The Video **********************
processing : 100/1698
processing : 200/1698
processing : 300/1698
Stable Frame Starts 4.166666666666667 and ends at 12.566666666666666 253 Required :  90
processing : 400/1698
processing : 500/1698
Stable Frame Starts 12.833333333333334 and ends at 19.266666666666666 194 Required :  90
processing : 600/1698
processing : 700/1698
processing : 800/1698
processing : 900/1698
processing : 1000/1698
Stable Frame Starts 19.766666666666666 and ends at 35.9 485 Required :  90
processing : 1100/1698
processing : 1200/1698
Stable Frame Starts 36.3 and ends at 41.5 157 Required :  90
processing : 1300/1698
processing : 1400/1698
processing : 1500/1698
Stable Frame Starts 42.0 and ends at 51.5 286 Required :  90
processing : 1600/1698
MoviePy - Writing audio in audi

                                                                      

MoviePy - Done.
Audio Extracted Successfully....
[0.56s -> 3.48s]  Let's learn k-means algorithm in just one minute.
[3.48s -> 9.16s]  In step 1, we have to decide how many clusters we want, which is denoted as k in k-means
[9.16s -> 10.16s]  clustering.
[10.16s -> 13.04s]  Let's say we are expecting 3 clusters in this case.
[13.04s -> 17.04s]  Then in step 2, we have to randomly initialize 3 data points.
[17.04s -> 19.84s]  These 3 data points are called centrites.
[19.84s -> 24.96s]  And in step 3, we have to assign each data point to its nearest centroid.
[24.96s -> 30.16s]  I mean take a data point, calculate distance from all three centroid and assign the data
[30.16s -> 32.02s]  point to its nearest centroid.
[32.02s -> 36.22s]  We have to do the same thing for each and every data point.
[36.22s -> 41.48s]  And in step 4, we have to update the centroid by taking mean of each group.
[41.48s -> 46.64s]  And in step 5, we have to repeat step 3 and 4 until the difference between prev

'Blog Created Successfully.......'

In [15]:
start_and_end_frame

[(5.733333333333333, 9.733333333333333),
 (10.2, 14.0),
 (15.866666666666667, 25.933333333333334),
 (26.166666666666668, 38.3),
 (38.43333333333333, 43.03333333333333),
 (43.333333333333336, 47.1),
 (48.43333333333333, 53.3),
 (55.13333333333333, 58.7)]

In [16]:
transcripton

[" In this video, I'm going to teach you how you can select features using recursive feature\n elimination.\n Let's say here we have five features and we need only top three features.",
 ' To build a machine learning model, the idea is take all the features',
 " build a model and drop the feature which contributes very less to the model.\n But how do we know which feature contributes less to the model?\n Well, if you're building a regression model,",
 " then you can use coefficients to make decision.\n You can drop the feature which has least coefficient value.\n Or if you are building a decision tree, there you can use feature importance.\n Let's say,",
 ' In this case, feature 4 has very less feature importance, then we have',
 ' drop this feature and we have to continue the process take the',
 ' mining features build a model and again look at the feature which has least feature importance and',
 ' drop that feature we have to continue this process until we get expected number of fea

In [None]:
# from utils.video2blog import VideoToBlog

video_path = "test_video1.mp4"

model = VideoToBlog(video_path,"K-Means",stable_duration_sec=3)

# model.get_frames_and_time()

In [6]:
start_and_end_frame

[(4.166666666666667, 12.566666666666666),
 (12.833333333333334, 19.266666666666666),
 (19.766666666666666, 35.9),
 (36.3, 41.5),
 (42.0, 51.5)]