<a href="https://colab.research.google.com/github/OgunSerifOnargan/arkitek_notebooks/blob/main/videoMAE_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Collect Data

In [None]:
# Mount Google Drive into the Colab VM so files under /content/drive are readable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import cv2
import warnings
warnings.filterwarnings('ignore')

In [None]:
#output folder creation
def videomae_folder_order_creator(output_folder_path, class_names):
  """Rebuild the empty ``<split>/<class>`` folder tree under ``output_folder_path``.

  Anything already stored at ``output_folder_path`` is deleted first. Afterwards
  one directory exists for every combination of split ("train", "val", "test")
  and entry of ``class_names``.
  """
  # Wipe previous results; a missing folder is silently accepted.
  shutil.rmtree(output_folder_path, ignore_errors=True)

  target_dirs = (
      "/".join((output_folder_path, split_name, class_name))
      for split_name in ("train", "val", "test")
      for class_name in class_names
  )
  for target_dir in target_dirs:
    os.makedirs(target_dir, exist_ok=True)

def excel_train_val_test_splitter(df_excel, division_ratios):
  """Cut ``df_excel`` into consecutive train / val / test slices.

  Args:
    df_excel: sliceable collection of rows (e.g. a DataFrame). Rows are taken
      in their current order, so shuffle beforehand if a random split is wanted.
    division_ratios: sequence whose first two items are the train and val
      fractions of ``len(df_excel)``. A third item may be present but is not
      used: the test split is always whatever remains after train and val, so
      rounding never drops rows.

  Returns:
    Tuple ``(train_df, val_df, test_df)``.
  """
  total_rows = len(df_excel)
  len_train = int(total_rows * division_ratios[0])
  len_val = int(total_rows * division_ratios[1])
  val_end = len_train + len_val

  train_df = df_excel[:len_train]
  val_df = df_excel[len_train:val_end]
  # Remainder goes to test; the third ratio is implied by the first two.
  test_df = df_excel[val_end:]

  return train_df, val_df, test_df


def process_avideo(video_path, label, frame_interval, output_folder_path, folder):
  """Split one source video into consecutive clips and write each clip as an MP4.

  Clips are written to ``{output_folder_path}/{folder}/{label}/`` and named
  ``<source name>_<clip index>.mp4``, where the source name is the file name of
  ``video_path`` up to its first dot.

  Args:
    video_path: "/"-separated path of the source video.
    label: class name; used as the target sub-folder and stored in the result.
    frame_interval: pair ``(max_frame, min_frame)``. Every clip holds at most
      ``max_frame`` frames; a trailing clip with fewer than ``min_frame``
      frames is discarded.
    output_folder_path: root of the output tree (target folder must exist).
    folder: split name ("train", "val" or "test").

  Returns:
    DataFrame with columns ``video_name``, ``num_frames`` and ``label``, one
    row per written clip (empty when nothing could be read or written).
  """
  max_frame, min_frame = frame_interval
  video_name = video_path.split("/")[-1].split(".")[0]

  # Rows are collected in a list and turned into a DataFrame once at the end;
  # DataFrame.append no longer exists in pandas >= 2.0.
  clip_rows = []
  video_index = 0
  stream_ended = False

  cap = cv2.VideoCapture(video_path)
  try:
    fps = cap.get(cv2.CAP_PROP_FPS)

    # Decode one clip at a time so memory use is bounded by max_frame frames
    # instead of the full length of the source video.
    while not stream_ended:
      clip_frames = []
      while len(clip_frames) < max_frame:
        ret, frame = cap.read()
        if not ret:
          stream_ended = True
          break
        clip_frames.append(frame)

      # Nothing left, or the tail is too short to be a usable sample.
      if not clip_frames or len(clip_frames) < min_frame:
        break

      new_video_name = f"{video_name}_{video_index}.mp4"
      new_video_path = f"{output_folder_path}/{folder}/{label}/{new_video_name}"
      # OpenCV expects (width, height); ndarray.shape starts with (height, width).
      frame_shape = clip_frames[0].shape[:2][::-1]

      out = cv2.VideoWriter(new_video_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, frame_shape)
      try:
        for frame in clip_frames:
          out.write(frame)
      finally:
        out.release()

      clip_rows.append({"video_name": new_video_name, "num_frames": len(clip_frames), "label": label})
      video_index += 1
  finally:
    cap.release()

  return pd.DataFrame(clip_rows, columns=["video_name", "num_frames", "label"])


def control_folder_size(path):
  """Print how many entries the directory ``path`` contains."""
  # os.listdir returns every entry, so sub-directories are counted as well.
  entry_count = len(os.listdir(path))
  print(f"Number of files in the folder: {entry_count}")


In [None]:
def _balance_train_clips(train_clips_df, train_folder_path):
  """Downsample every class in the train split to the size of the smallest one.

  Clip files below ``train_folder_path`` whose name is not among the kept rows
  are deleted from disk.

  Args:
    train_clips_df: DataFrame with at least ``video_name`` and ``label`` columns.
    train_folder_path: folder that holds one sub-folder of clips per class.

  Returns:
    DataFrame with the rows that were kept. When ``train_clips_df`` is empty it
    is returned unchanged and no file is touched.
  """
  if train_clips_df.empty:
    return train_clips_df

  # Only classes that actually have clips take part, so an absent class does
  # not force every other class down to zero samples.
  min_count = int(train_clips_df["label"].value_counts().min())
  balanced_df = pd.concat(
      [rows.sample(n=min_count, random_state=42)
       for _, rows in train_clips_df.groupby("label", sort=False)],
      ignore_index=True,
  )

  # Set membership keeps the per-file check constant-time.
  kept_names = set(balanced_df["video_name"])
  for root, _dirs, files in os.walk(train_folder_path):
    for file in files:
      if file not in kept_names:
        video_path = os.path.join(root, file)
        print(f"Deleting video: {video_path}")
        os.remove(video_path)  # Delete the video file

  return balanced_df


def main_data_prep(root_video_folder_path, excel_file_path, output_folder_path, division_ratios, class_names, frame_interval, balanced=False):
  """Build a train/val/test folder dataset of short clips from labelled videos.

  Steps: read the label sheet, shuffle it with a fixed seed, split it, cut every
  video into clips via ``process_avideo`` and write one ``<split>.xlsx`` summary
  per split. Optionally the train split is balanced by deleting surplus clips.

  Args:
    root_video_folder_path: folder containing the source videos.
    excel_file_path: header-less Excel sheet; column 0 holds the video path
      relative to ``root_video_folder_path`` and column 2 a numeric label that
      is used as an index into ``class_names``.
    output_folder_path: root of the output tree; it is deleted and recreated.
    division_ratios: train/val(/test) fractions, see
      ``excel_train_val_test_splitter``.
    class_names: class name for each numeric label.
    frame_interval: ``(max_frame, min_frame)`` forwarded to ``process_avideo``.
    balanced: when true, every class of the train split is downsampled to the
      size of the smallest class and surplus clip files are removed.
  """
  #read excel file from original path
  df_excel = pd.read_excel(excel_file_path, header=None)

  #manipulate labels according to class names
  df_excel[2] = df_excel[2].replace({i : class_names[i] for i in df_excel[2].unique()})

  #output folder creation
  videomae_folder_order_creator(output_folder_path, class_names)
  df_excel = df_excel.sample(frac=1, random_state=42).reset_index(drop=True)
  #dividing data and convert it to df
  train_df, val_df, test_df = excel_train_val_test_splitter(df_excel, division_ratios)

  clip_columns = ["video_name", "num_frames", "label"]
  train_clips_df = pd.DataFrame(columns=clip_columns)

  for df_name, df in zip(["train", "val", "test"], [train_df, val_df, test_df]):
    # Gather the per-video frames and concatenate once; concatenating inside
    # the loop copies the accumulated frame on every iteration.
    clip_frames = []
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing videos"):
      video_path = root_video_folder_path + "/" + row[0]
      label = row[2]

      new_videos_df = process_avideo(video_path, label, frame_interval, output_folder_path, df_name)
      if not new_videos_df.empty:
        clip_frames.append(new_videos_df)

    if clip_frames:
      final_df = pd.concat(clip_frames, ignore_index=True)
    else:
      final_df = pd.DataFrame(columns=clip_columns)

    final_df.to_excel(f"{output_folder_path}/{df_name}.xlsx", index=False)
    if df_name == "train":
      train_clips_df = final_df

  # Report sizes for the folders this run produced, not for a fixed path.
  train_folder_path = f"{output_folder_path}/train"
  for class_name in class_names:
    control_folder_size(f"{train_folder_path}/{class_name}")

  if balanced:
    # NOTE(review): train.xlsx is left as written above, so after balancing it
    # still lists the deleted clips; confirm whether anything consumes it.
    _balance_train_clips(train_clips_df, train_folder_path)

    print("Cleanup complete. Now, you have balanced training dataset")
    for class_name in class_names:
      control_folder_size(f"{train_folder_path}/{class_name}")


In [None]:
# Drive folder holding both the source scene videos and the label sheet.
DATASET_DIR = "/content/drive/MyDrive/arkitek_fobi_analiz/Model_Datasets/video_classification/fight/training_videos_19-09/all_fight"

main_data_prep(
    root_video_folder_path=f"{DATASET_DIR}/scenes",
    excel_file_path=f"{DATASET_DIR}/all_updated_video_paths_final.xlsx",
    output_folder_path="/content/deneme",
    division_ratios=[0.7, 0.2, 0.1],
    class_names=["no_fight", "fight"],  # list position = numeric label in the sheet
    frame_interval=[30, 25],  # [max frames per clip, min frames per clip]
    balanced=True,
)

Processing videos: 100%|██████████| 1137/1137 [19:02<00:00,  1.00s/it]
Processing videos: 100%|██████████| 325/325 [05:22<00:00,  1.01it/s]
Processing videos: 100%|██████████| 163/163 [02:40<00:00,  1.01it/s]


Number of files in the folder: 795
Number of files in the folder: 934
Deleting video: /content/deneme/train/no_fight/ogun_v12_8.mp4
Deleting video: /content/deneme/train/no_fight/baran_v82_1.mp4
Deleting video: /content/deneme/train/no_fight/arif_v31_0.mp4
Deleting video: /content/deneme/train/no_fight/arif_v7_2.mp4
Deleting video: /content/deneme/train/no_fight/uygar_v12_15.mp4
Deleting video: /content/deneme/train/no_fight/ogun_v28_5.mp4
Deleting video: /content/deneme/train/no_fight/arif_v59_2.mp4
Deleting video: /content/deneme/train/no_fight/uygar_v102_1.mp4
Deleting video: /content/deneme/train/no_fight/ogun_v30_1.mp4
Deleting video: /content/deneme/train/no_fight/baran_v127_0.mp4
Deleting video: /content/deneme/train/no_fight/ogun_v2_0.mp4
Deleting video: /content/deneme/train/no_fight/efe_v267_1.mp4
Deleting video: /content/deneme/train/no_fight/uygar_v195_3.mp4
Deleting video: /content/deneme/train/no_fight/efe_v133_0.mp4
Deleting video: /content/deneme/train/no_fight/arif_v47

In [None]:
# Re-count the no_fight training folder after the balanced run above.
control_folder_size("/content/deneme/train/no_fight")

Number of files in the folder: 795
