<a href="https://colab.research.google.com/github/Raghub123/code_repo/blob/main/mp4_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**MODULE 1:
The module below performs the necessary installations and sets up the Spark imports**

In [None]:
# Necessary installation steps.
# NOTE: this cell only runs inside Google Colab / IPython: it imports
# google.colab and the lines starting with "!" are shell escapes, not Python.

# Mount Google Drive at /content/drive; the later modules read and write
# their files under that mount point.
from google.colab import drive
drive.mount('/content/drive')
!apt-get update # Update apt-get repository.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
!pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

# Set environment variables
# JAVA_HOME / SPARK_HOME are set before findspark.init() is called.
# NOTE(review): SPARK_HOME assumes the tarball above was extracted while the
# working directory was /content -- confirm if the cell is run elsewhere.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
import findspark
findspark.init()
# Python / system packages for the frame-extraction and OCR modules.
# NOTE(review): keras-ocr is installed here but is not imported by any cell
# visible in this notebook.
!pip install numpy
!pip install keras-ocr
!apt-get install tesseract-ocr
!pip install pytesseract

**MODULE 2: Extracts every frame from the input mp4 (consecutive duplicate frames are skipped), stores each one in the output_images directory, and records the frame info in frame_data.txt in the format "timestamp of the frame|frame name"**

Prerequisites:
1. upload the video only in "/content/drive/My Drive/My_Folder/" and save it as sample_video.mp4


In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
import cv2
import os
import shutil
import numpy as np

# Create (or reuse) the Spark session that drives the frame-extraction job
spark = (
    SparkSession.builder
    .appName("VideoToImages")
    .getOrCreate()
)

# Frame comparison helper used to drop consecutive duplicate frames
def frames_identical(frame1, frame2):
    """Return True when both frames have the same shape and equal elements.

    The comparison is delegated to ``np.array_equal``, so frames of
    different shapes compare as not identical instead of raising.
    """
    same_content = np.array_equal(frame1, frame2)
    return same_content

# Function to convert video to images
def video_to_images(video_path, output_folder,frame_file):
    """Save the non-duplicate frames of a video as JPEGs and index them.

    Every frame that differs from the frame read immediately before it is
    written to ``output_folder`` as ``frame<N>.jpg``, where ``N`` is the
    zero-based index of the frame in the video (skipped duplicates still
    consume an index). For each saved frame one line of the form
    ``Frame Time: HH:MM:SS|frame<N>.jpg`` is appended to ``frame_file``.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the JPEG files; it is created
            if it does not exist yet.
        frame_file: Path of the text index file; lines are appended to it.

    Raises:
        ValueError: If the video cannot be opened or reports no usable
            frame rate (timestamps cannot be computed without it).
        OSError: If a frame image cannot be written.
    """
    video_capture = cv2.VideoCapture(video_path)
    try:
        if not video_capture.isOpened():
            raise ValueError(f"Could not open video: {video_path}")

        # Keep the frame rate as a float: truncating e.g. 29.97 to 29 makes
        # the computed timestamps drift further from the real position the
        # longer the video runs.
        fps = video_capture.get(cv2.CAP_PROP_FPS)
        if not fps > 0:  # also rejects NaN
            raise ValueError(f"Video reports an invalid frame rate ({fps}): {video_path}")

        # cv2.imwrite only returns False when the target directory is
        # missing, so make sure it exists before writing anything.
        os.makedirs(output_folder, exist_ok=True)

        prev_frame = None
        count = 0
        # Open the index once instead of re-opening it for every frame.
        with open(frame_file, "a") as index_file:
            success, frame = video_capture.read()
            while success:
                if prev_frame is not None and frames_identical(frame, prev_frame):
                    # Skip saving duplicate frame
                    print(f"Skipping duplicate frame {count}")
                else:
                    frame_name = "frame%d.jpg" % count
                    if not cv2.imwrite(os.path.join(output_folder, frame_name), frame):
                        raise OSError(f"Could not write {frame_name} to {output_folder}")

                    # Frame position in whole seconds, rendered as HH:MM:SS.
                    hours, remainder = divmod(int(count / fps), 3600)
                    minutes, seconds = divmod(remainder, 60)
                    frame_time = "{:02d}:{:02d}:{:02d}".format(hours, minutes, seconds)
                    index_file.write(f"Frame Time: {frame_time}|{frame_name}\n")

                # read() is called without a destination buffer, so each call
                # yields a fresh array and the previous frame needs no copy.
                prev_frame = frame

                # Read the next frame
                success, frame = video_capture.read()
                count += 1
    finally:
        # Release the capture even when extraction fails part-way.
        video_capture.release()

# Define paths
video_path = "/content/drive/My Drive/My_Folder/sample_video.mp4"
output_folder = "/content/drive/My Drive/My_Folder/output_images"
frame_file ="/content/drive/My Drive/My_Folder/frame_data.txt"

# Start from a clean slate. The folder must be (re)created whether or not it
# existed before: creating it only inside the "exists" branch leaves a first
# run without an output directory, and cv2.imwrite then fails silently.
if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
os.makedirs(output_folder)
# frame_data.txt is opened in append mode, so remove results of earlier runs.
if os.path.exists(frame_file):
    os.remove(frame_file)

try:
    # Convert video to images using Spark
    spark.sparkContext.parallelize([video_path]).foreach(lambda path: video_to_images(path, output_folder,frame_file))
finally:
    # Stop SparkSession even if the extraction job fails
    spark.stop()


**MODULE 3: Reads every frame in the output_images directory created by Module 2, extracts the text in each frame, and stores it in output.txt in the format "frame name ### text"**


In [None]:
import os
import numpy as np
import pytesseract
from PIL import Image
import gc

# Function to read images from output_images directory
def read_images_from_directory(directory):
    """Open every file in ``directory`` as a PIL image, in frame order.

    Entries are visited in ascending order of the number embedded in the
    file name (``frame2.jpg`` before ``frame10.jpg``) rather than in the
    arbitrary order ``os.listdir`` returns, so downstream output follows the
    order of the frames in the video. Entries that are not regular files
    (for example sub-directories) are skipped.

    Args:
        directory: Path of the directory holding the frame images.

    Returns:
        A tuple ``(images, file_names)`` of two equally long lists: the
        opened PIL images and their file names, index-aligned.
    """
    def frame_order(name):
        # Sort numerically on the digits of the name without its extension;
        # names without digits sort first, ties fall back to the name.
        stem = os.path.splitext(name)[0]
        digits = "".join(ch for ch in stem if "0" <= ch <= "9")
        return (int(digits) if digits else -1, name)

    images = []
    file_names = []
    for img_name in sorted(os.listdir(directory), key=frame_order):
        img_path = os.path.join(directory, img_name)
        if not os.path.isfile(img_path):
            continue
        # NOTE: Image.open is lazy -- pixel data is only decoded when the
        # image is first used, so this does not load every frame into memory.
        images.append(Image.open(img_path))
        file_names.append(img_name)  # Extract file name
    return images,file_names

# The frame-extraction module (Module 2) stores its frames in this directory
directory = '/content/drive/My Drive/My_Folder/output_images'

# Open every frame image; images and file_names are index-aligned lists
images,file_names = read_images_from_directory(directory)

def delete_file(file_path):
    """Remove ``file_path`` if it exists; do nothing when it is absent."""
    if not os.path.exists(file_path):
        return
    os.remove(file_path)

# Function to perform text recognition using Pytesseract
def recognize_text(images, file_names, output_file):
    """OCR each image and append one ``<file name> ### <text>`` line per image.

    The recognised text of an image is flattened onto a single line. The
    OCR lines are joined with a single space (blank lines dropped) rather
    than concatenated directly, so the last word of one line is not fused
    with the first word of the next.

    Args:
        images: Images accepted by ``pytesseract.image_to_string``.
        file_names: File names, index-aligned with ``images``.
        output_file: Path of the text file to append to (created if
            missing).

    Returns:
        None. The results are only written to ``output_file``.
    """
    # UTF-8 is set explicitly because OCR output can contain non-ASCII
    # characters.
    with open(output_file, 'a', encoding='utf-8') as f:
        for img, file_name in zip(images, file_names):
            # Recognize text using Pytesseract
            raw_text = pytesseract.image_to_string(img, lang='eng', config='--psm 6')
            text = " ".join(
                line.strip() for line in raw_text.splitlines() if line.strip()
            )
            # Write filename and text to output file
            f.write(f"{file_name} ### {text}\n")

# Specify the output file path
output_file = "/content/drive/My Drive/My_Folder/output.txt"

# Delete the output file if it exists: recognize_text opens it in append
# mode, so results from an earlier run would otherwise be kept.
delete_file(output_file)

# Run OCR over all frames. recognize_text has no return statement, so
# `texts` is always None; the results live in output_file.
texts = recognize_text(images, file_names, output_file)


# Strip OCR layout characters from a recognised text
def cleanse_text(text):
    """Return ``text`` without newlines, form feeds and surrounding whitespace.

    ``None`` is mapped to the empty string. Every ``'\\n'`` and ``'\\x0c'``
    character is removed (not replaced), then leading and trailing
    whitespace is stripped.
    """
    if text is None:
        return ""
    cleaned = text
    for unwanted in ('\n', '\x0c'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned.strip()

**MODULE 4: When executed, it plays the input video, prompts the user for the text to search for, fetches the frame and the timestamp at which the text occurs, and halts the video at that point**

Prerequisite:
1. The input video must be uploaded to Google Drive with the name "sample_video.mp4" under /content/drive/MyDrive/My_Folder/
2. Get the video ID manually from Google Drive and paste it into the "google_drive_video_id" variable (row 19 of the cell below)

In [None]:
from pyspark.sql import SparkSession
from IPython.display import display, HTML

# Create (or reuse) the Spark session used to query the OCR results
spark = (
    SparkSession.builder
    .appName("VideoPlayer")
    .getOrCreate()
)

# Embed the Google Drive preview player, optionally with a start offset
def open_video_link(video_id, start_time=None):
    """Display an iframe with the Drive preview of ``video_id``.

    Args:
        video_id: ID of the file on Google Drive.
        start_time: Optional offset appended to the preview URL as the
            ``#t=<start_time>`` fragment; omitted when ``None``.
    """
    video_link = (
        f"https://drive.google.com/file/d/{video_id}/preview"
        if start_time is None
        else f"https://drive.google.com/file/d/{video_id}/preview#t={start_time}"
    )
    html_code = f'<iframe src="{video_link}" width="640" height="480"></iframe>'
    player = HTML(html_code)
    display(player)

# ID of the video hosted on Google Drive (the part of the share link between
# "/file/d/" and "/view", see the example link below)
google_drive_video_id = "1iw0c9KM6kU2PxLB_13YAaTdHnc0uvZ5q"  # Example ID
#https://drive.google.com/file/d/1iw0c9KM6kU2PxLB_13YAaTdHnc0uvZ5q/view?usp=drive_link

# Embed the video player in the cell output, without a start time
open_video_link(google_drive_video_id)

# Wait until the user has started the video and pressed Enter; the value
# typed here is discarded.
input("Manually start the video by clicking the play button, then press Enter to continue...")


# Load the OCR results file as a DataFrame with one row per line,
# exposed in the single column "value"
text_file_path = "/content/drive/MyDrive/My_Folder/output.txt"  # Replace with the actual path to your file
text_df = spark.read.text(text_file_path)

# Prompt the user for the text to search for; surrounding whitespace is
# stripped and the text is converted to upper case
search_text = input("Enter the text to search for: ").strip().upper()

# Define a function to extract frame number and text from each line
def extract_frame_text(line):
    """Split a ``<frame name> ### <text>`` line into its two fields.

    Only the first ``###`` is treated as the delimiter, so recognised text
    that itself contains ``###`` is kept intact instead of the whole line
    being rejected.

    Args:
        line: One line of the OCR results file.

    Returns:
        ``(frame_name, text)`` with surrounding whitespace stripped, or
        ``(None, None)`` when the line contains no ``###`` delimiter.
    """
    frame_number, delimiter, text = line.partition("###")
    if not delimiter:
        return (None, None)
    return (frame_number.strip(), text.strip())

# Spark SQL column functions used by the search below
from pyspark.sql import functions as F

try:
    # Apply the extraction function and filter out lines without a delimiter.
    # The schema is given explicitly so that an empty result (no usable
    # lines) does not make toDF fail while trying to infer column types.
    frame_text_df = (
        text_df.rdd
        .map(lambda row: extract_frame_text(row.value))
        .filter(lambda x: x[1] is not None)
        .toDF("FrameNumber string, Text string")
    )

    # Find the earliest frame containing the search text:
    #  * the query was upper-cased when it was read, so the OCR text is
    #    upper-cased too, making the match case-insensitive;
    #  * rows are ordered by the number in the frame name so that "first"
    #    means the earliest frame, whatever the order of lines in the file.
    frame_index = F.regexp_extract(frame_text_df["FrameNumber"], r"(\d+)", 1).cast("int")
    matching_row = (
        frame_text_df
        .filter(F.upper(frame_text_df["Text"]).contains(search_text.upper()))
        .orderBy(frame_index)
        .first()
    )

    if matching_row is None:
        print(f"The text '{search_text}' was not found in the video.")
    else:
        frame_number = matching_row["FrameNumber"]
        print(f"The text '{search_text}' was found in frame number: {frame_number}")

        # Load the frame_data.txt file as a DataFrame
        data_frame_path = "/content/drive/MyDrive/My_Folder/frame_data.txt"  # Replace with the actual path to your file
        data_frame_df = spark.read.text(data_frame_path)

        # Index lines look like "Frame Time: HH:MM:SS|frameX.jpg". Match the
        # complete file name after the "|" separator: a substring test on
        # "frame1" would also match frame10.jpg, frame100.jpg, ...
        timestamp_row = data_frame_df.filter(
            data_frame_df["value"].endswith("|" + frame_number)
        ).first()

        if timestamp_row is None:
            print(f"Timestamp not found for frame number: {frame_number}")
        else:
            timestamp = timestamp_row["value"].split("|")[0].replace('Frame Time:','').strip()
            print(timestamp)
            # Split the time string into hours, minutes, and seconds
            hours, minutes, seconds = map(int, timestamp.split(":"))

            # Convert hours, minutes, and seconds to seconds and sum them up
            total_seconds = hours * 3600 + minutes * 60 + seconds

            # Re-embed the video so that it starts at the matching frame
            open_video_link(google_drive_video_id, total_seconds)
finally:
    # Stop SparkSession even when the search fails
    spark.stop()
