# Выполним аналитику датасета.

При предпросмотре данных, было определено, что для некоторых отрывков "start" > "end", что не является логичным. Время конца отрывка не может быбть раньше времени его начала, поэтому сначала необходимо исследовать и исправить таргеты в ```test_labels.json``` и ```train_labels.json```. Всего нам дано 80 + 45 = 125 видео, определим видео с неверными временными метками заставки и исправим их на верные. Было замечено, что у таких видео необходимо время начала уменьшить на 1 минуту.

In [1]:
#импорт базовых библиотек
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import json
import os
from datetime import datetime, timedelta

In [2]:
#алгоритм поиска неверных временных меток

#подгрузим данные
project_dir = '/kaggle/input/vk-series-data/'
test_target = os.path.join(project_dir, 'test_labels.json')
train_target = os.path.join(project_dir, 'train_labels.json')

with open(test_target) as f:
    test_target_dict = json.load(f)

with open(train_target) as f:
    train_target_dict = json.load(f)

def process_bad_times(dictionary):

    for description in dictionary.values():
        start_time = datetime.strptime(description["start"], "%H:%M:%S")
        end_time = datetime.strptime(description["end"], "%H:%M:%S")
        if start_time > end_time:
            start_time = start_time - timedelta(minutes=1)
            description["start"] = start_time.strftime("%H:%M:%S")

process_bad_times(test_target_dict)
process_bad_times(train_target_dict)

Теперь можно изучать статистические параметры времени заставок

In [3]:
# Загрузим данные в датафрейм
df = pd.concat([pd.DataFrame(test_target_dict), pd.DataFrame(train_target_dict)], axis = 1)
df = df.T
df["start"] = pd.to_datetime(df["start"], format = "%H:%M:%S").dt.time
df["end"] = pd.to_datetime(df["end"], format = "%H:%M:%S").dt.time
df

Unnamed: 0,url,name,start,end
-220020068_456249220,https://vkvideo.ru/video-220020068_456249220,"24 часа. 2 сезон, 16 серия",00:00:05,00:00:16
-220020068_456249373,https://vkvideo.ru/video-220020068_456249373,"24 часа. 8 сезон, 18 серия",00:00:05,00:00:16
-220020068_456249231,https://vkvideo.ru/video-220020068_456249231,"24 часа. 3 сезон, 1 серия",00:00:05,00:00:16
-220020068_456255339,https://vkvideo.ru/video-220020068_456255339,Анатомия скандала. 1 сезон. 2 серия.,00:00:10,00:00:20
-220020068_456249284,https://vkvideo.ru/video-220020068_456249284,"24 часа. 5 сезон, 4 серия",00:00:05,00:00:16
...,...,...,...,...
-220020068_456249719,https://vkvideo.ru/video-220020068_456249719,"Бывaeт и xyжe. 2 сезон ,6 серия",00:00:06,00:00:10
-220020068_456255400,https://vkvideo.ru/video-220020068_456255400,"Баскетс. 3 сезон, 3 серия",00:00:17,00:00:22
-220020068_456256446,https://vkvideo.ru/video-220020068_456256446,"Бойцовская ночь: Афера на миллион. 1 сезон, 4 ...",00:04:41,00:05:25
-220020068_456255401,https://vkvideo.ru/video-220020068_456255401,"Баскетс. 3 сезон, 4 серия",00:00:17,00:00:22


In [4]:
# спарсим необходимую информацию о каждом видео

input_dir = '/kaggle/input/vk-series-data/'

test_path = os.path.join(project_dir, 'data_test_short/data_test_short/')
train_path = os.path.join(project_dir, 'data_train_short/data_train_short/')

videos_info = {}

def scan_directories(root_path):
    
    subfolders = [f.name for f in os.scandir(root_path) if f.is_dir()]

    for folder in subfolders:
        for file in os.listdir(root_path + folder):
            cap = cv2.VideoCapture(root_path + folder + "/" + file)
            
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = cap.get(cv2.CAP_PROP_FPS)
            width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
            duration_sec = frame_count / fps
            duration = timedelta(seconds=int(duration_sec))

            videos_info[folder] = [frame_count, fps, width, height, duration_sec, duration]
            
scan_directories(test_path)
scan_directories(train_path)

df_videos = pd.DataFrame(videos_info).T
df_videos.columns = ["frame_count", "fps","width","height","duration_sec","duration"]

df = df.join(df_videos, how="outer")

In [5]:
df[:20]

Unnamed: 0,url,name,start,end,frame_count,fps,width,height,duration_sec,duration
-220020068_456239859,https://vkvideo.ru/video-220020068_456239859,"3вeздный пyть: Пикap. 3 сезон, 10 серия",00:00:15,00:00:28,90888,24.000034,1920,1080,3786.994589,1:03:06
-220020068_456241671,https://vkvideo.ru/video-220020068_456241671,"Амepикaнcкий вaндaл. 1 сезон, 3 серия",00:02:47,00:03:10,53203,23.975581,852,480,2219.049444,0:36:59
-220020068_456241671,https://vkvideo.ru/video-220020068_456241671,"Амepикaнcкий вaндaл. 1 сезон, 3 серия",00:02:47,00:03:10,53203,23.975581,852,480,2219.049444,0:36:59
-220020068_456241672,https://vkvideo.ru/video-220020068_456241672,"Амepикaнcкий вaндaл. 1 сезон, 4 серия",00:02:51,00:03:15,48552,23.975539,852,480,2025.064,0:33:45
-220020068_456241673,https://vkvideo.ru/video-220020068_456241673,"Амepикaнcкий вaндaл. 1 сезон, 5 серия",00:01:52,00:02:17,47472,23.975528,852,480,1980.019,0:33:00
-220020068_456241682,https://vkvideo.ru/video-220020068_456241682,"Амepикaнcкий вaндaл. 2 сезон, 6 серия",00:01:35,00:02:11,40472,23.975442,852,480,1688.060656,0:28:08
-220020068_456241755,https://vkvideo.ru/video-220020068_456241755,"Блеск. 2 сезон, 2 серия",00:03:08,00:03:12,45267,23.976562,852,480,1887.968722,0:31:27
-220020068_456241756,https://vkvideo.ru/video-220020068_456241756,"Блеск. 2 сезон, 3 серия",00:01:06,00:01:10,38314,23.976106,852,480,1598.007622,0:26:38
-220020068_456241758,https://vkvideo.ru/video-220020068_456241758,"Блеск. 2 сезон, 5 серия",00:00:58,00:01:02,44404,23.976573,852,480,1851.974422,0:30:51
-220020068_456241758,https://vkvideo.ru/video-220020068_456241758,"Блеск. 2 сезон, 5 серия",00:00:58,00:01:02,44404,23.976573,852,480,1851.974422,0:30:51


In [6]:
df['frame_count'] = df['frame_count'].astype('float')
df['fps'] = df['fps'].astype('float')
df['width'] = df['width'].astype('float')
df['height'] = df['height'].astype('float')
df['duration_sec'] = df['duration_sec'].astype('float')
df['start'] = df['start'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second)
df['end'] = df['end'].apply(lambda x: x.hour*3600 + x.minute*60 + x.second)
df['end'].iloc[60] = 487
df['frames_intro'] = df['end'] - df['start'] 

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['end'].iloc[60] = 487
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['end'].iloc[60] = 487


In [7]:
df.describe()

Unnamed: 0,start,end,frame_count,fps,width,height,duration_sec,frames_intro
count,125.0,125.0,125.0,125.0,125.0,125.0,125.0,125.0
mean,100.04,112.976,55334.256,24.217773,943.168,530.688,2287.213229,12.936
std,135.770763,143.791149,18015.071609,0.431973,197.15021,109.862701,747.21255,13.56094
min,0.0,8.0,30600.0,23.975442,480.0,360.0,1223.993322,2.0
25%,5.0,16.0,36750.0,23.976102,852.0,480.0,1510.96,5.0
50%,58.0,67.0,61499.0,23.976669,852.0,480.0,2565.055067,10.0
75%,117.0,140.0,65503.0,24.000744,852.0,480.0,2704.026278,11.0
max,708.0,749.0,97281.0,25.000777,1920.0,1080.0,4057.339522,87.0
