# Prepare the dataset

In [25]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

In [26]:
# Root folder that holds one subdirectory of video clips per label.
DATA_PATH = r"W:\NLP\SLD\Video Database\videos"

# Relative folder name used for NumPy (.npy) data.
NPY_PATH = r"npy_data"

In [27]:
# Extensions treated as video files (compared case-insensitively, so ".MP4"
# is picked up as well as ".mp4").
VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mov')


def _read_video_metadata(video_path):
    """Read basic container metadata for a single video file.

    Parameters
    ----------
    video_path : str
        Path of the video file to inspect.

    Returns
    -------
    dict or None
        A dict with the keys ``duration`` (seconds), ``width``, ``height``
        and ``frame_rate`` as reported by OpenCV, or ``None`` when OpenCV
        cannot open the file.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return None
        frame_rate = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)
        height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)
    finally:
        # Always free the decoder handle; thousands of clips are opened in a row.
        cap.release()

    # A container that reports 0 FPS would otherwise raise ZeroDivisionError.
    duration = frame_count / frame_rate if frame_rate > 0 else float('nan')
    return {
        'duration': duration,
        'width': width,
        'height': height,
        'frame_rate': frame_rate,
    }


# Data structure to hold EDA results (one dict per readable video).
# Re-created on every run so re-executing the cell does not duplicate rows.
eda_data = []

# Loop through each subdirectory
for subdir, dirs, files in os.walk(DATA_PATH):
    # Files sitting directly in the root folder are not inside a label folder.
    if subdir == DATA_PATH:
        continue
    video_files = [f for f in files if f.lower().endswith(VIDEO_EXTENSIONS)]

    for video_file in video_files:
        video_path = os.path.join(subdir, video_file)
        metadata = _read_video_metadata(video_path)
        if metadata is None:
            # Report and move on instead of aborting the whole scan.
            print(f"Skipping unreadable video: {video_path}")
            continue

        eda_data.append({
            'subdir': os.path.basename(subdir),
            'video_file': video_file,
            'duration': metadata['duration'],
            'width': metadata['width'],
            'height': metadata['height'],
            'frame_rate': metadata['frame_rate'],
            'file_size': os.path.getsize(video_path)
        })
    # Progress indicator: one line per processed subdirectory.
    print(subdir)

W:\NLP\SLD\Video Database\videos\أب
W:\NLP\SLD\Video Database\videos\إمراة
W:\NLP\SLD\Video Database\videos\ابيض
W:\NLP\SLD\Video Database\videos\احمر
W:\NLP\SLD\Video Database\videos\اخضر
W:\NLP\SLD\Video Database\videos\اخوات
W:\NLP\SLD\Video Database\videos\ازرق
W:\NLP\SLD\Video Database\videos\اسرة
W:\NLP\SLD\Video Database\videos\اسف
W:\NLP\SLD\Video Database\videos\اسود
W:\NLP\SLD\Video Database\videos\اصفر
W:\NLP\SLD\Video Database\videos\الألوان
W:\NLP\SLD\Video Database\videos\الاثنين
W:\NLP\SLD\Video Database\videos\الاحد
W:\NLP\SLD\Video Database\videos\الاربعاء
W:\NLP\SLD\Video Database\videos\الثلاثاء
W:\NLP\SLD\Video Database\videos\الجمعة
W:\NLP\SLD\Video Database\videos\الحمد لله
W:\NLP\SLD\Video Database\videos\الخميس
W:\NLP\SLD\Video Database\videos\السبت
W:\NLP\SLD\Video Database\videos\السلام عليكم
W:\NLP\SLD\Video Database\videos\الموت
W:\NLP\SLD\Video Database\videos\اليوم
W:\NLP\SLD\Video Database\videos\ام
W:\NLP\SLD\Video Database\videos\امس
W:\NLP\SLD\Video Da

In [28]:
# Tabulate the collected per-video records: one row per video file.
eda_df = pd.DataFrame(data=eda_data)

# Write the table to disk (working directory), leaving out the row index.
eda_df.to_csv(path_or_buf='video_eda_results.csv', index=False)

# Descriptive statistics (count, mean, std, quartiles, ...) of the table.
summary = eda_df.describe()

print(summary)

          duration        width       height   frame_rate     file_size
count  5131.000000  5131.000000  5131.000000  5131.000000  5.131000e+03
mean      2.340423  1288.796726  1246.603001    29.525520  2.410265e+06
std       0.467290   599.631961   235.473349     3.268869  2.141002e+06
min       1.166656   576.000000  1080.000000    19.581479  2.024900e+05
25%       2.060811   720.000000  1080.000000    29.580000  9.071325e+05
50%       2.300000  1080.000000  1280.000000    30.000000  1.571511e+06
75%       2.600000  1920.000000  1280.000000    30.000000  3.356099e+06
max       4.840000  1920.000000  1920.000000   120.000000  1.669232e+07


In [29]:
# Histogram of clip durations. `duration` is frame count divided by FPS,
# i.e. seconds, so the axis is labelled the same way as in the
# file-size scatter plot.
fig_duration = px.histogram(
    eda_df,
    x='duration',
    nbins=50,
    labels={'duration': 'Duration (s)'},
    title='Distribution of Video Durations',
)
fig_duration.show()


In [30]:
# Number of videos found in each subdirectory, as a two-column table.
yy = (
    eda_df['subdir']
    .value_counts()
    .reset_index()
)
yy

Unnamed: 0,subdir,count
0,ولد,97
1,إمراة,93
2,هل انت بخير,91
3,ماذا تعمل,91
4,أب,87
...,...,...
68,كيف حالك,49
69,ربع ساعه,49
70,السلام عليكم,49
71,طفل,44


In [31]:
# Bar chart: one bar per subdirectory, bar height = number of videos in it.
fig_videos_per_subdir = px.bar(
    data_frame=eda_df['subdir'].value_counts().reset_index(),
    x='subdir',
    y='count',
    title='Number of Videos per Subdirectory',
)
fig_videos_per_subdir.show()

In [32]:
# Scatter of on-disk size against clip duration, coloured by subdirectory.
fig_file_size_vs_duration = px.scatter(
    data_frame=eda_df,
    x='file_size',
    y='duration',
    color='subdir',
    title='File Size vs. Duration',
    labels={'file_size': 'File Size (bytes)', 'duration': 'Duration (s)'},
)
fig_file_size_vs_duration.show()


In [33]:
# Add a "<width>x<height>" text column to eda_df (this mutates the frame).
# NOTE(review): width/height are stored exactly as returned by cap.get(), so
# the strings presumably look like "1920.0x1080.0" - confirm if that matters.
# The heatmap below does not read this column; it bins width/height directly.
eda_df['resolution'] = eda_df['width'].astype(str).str.cat(eda_df['height'].astype(str), sep='x')

fig_resolution_heatmap = px.density_heatmap(
    data_frame=eda_df,
    x='width',
    y='height',
    title='Heatmap of Video Resolutions',
)
fig_resolution_heatmap.show()


In [8]:
# Scratch check of integer sampling with np.linspace: requesting 120 evenly
# spaced points over [0, 60] with dtype=int truncates the fractional
# positions, so most integers appear twice in the result.
# (numpy is already imported as `np` in the notebook's import cell.)
linspace_demo = np.linspace(0, 60, 120, dtype=int)
linspace_demo

array([ 0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,
        8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16,
       17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25,
       25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, 32, 32, 33, 33,
       34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, 40, 40, 41, 41, 42,
       42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, 48, 48, 49, 49, 50, 50,
       51, 51, 52, 52, 53, 53, 54, 54, 55, 55, 56, 56, 57, 57, 58, 58, 59,
       60])