In [1]:
%matplotlib inline

import functools as ft
import itertools as it
import json
import math
import operator as op
import os
import pickle
import random
import sys
from datetime import datetime, timedelta

import cv2
import hdbscan
import matplotlib.pyplot as plt
import mxnet as mx
import numpy as np
from sklearn.preprocessing import StandardScaler

# Seed the three random number sources used by this notebook (stdlib, NumPy, MXNet)
# with the same fixed value.
random.seed(1)
np.random.seed(1)
mx.random.seed(1)

def show_and_wait(frame, title='tesst', wait_time=0):
    """Display ``frame`` in the window ``title`` and wait for a key press.

    ``wait_time`` is passed straight to ``cv2.waitKey``.

    Returns the pressed key as a one-character string, or ``None`` when
    ``cv2.waitKey`` reports a negative code (no key).
    """
    cv2.imshow(title, frame)
    code = cv2.waitKey(wait_time)
    if code < 0:
        return None
    return chr(code)

def read_and_wait(video, title='tesst', wait_time=0):
    """Read the next frame from ``video`` and hand it to ``show_and_wait``.

    Returns whatever ``show_and_wait`` returns, or ``None`` when the read
    fails (for example at the end of the stream).
    """
    ok, frame = video.read()
    if not ok:
        return None
    return show_and_wait(frame, title, wait_time)

def pairwise(iterable):
    """Yield overlapping neighbours: s -> (s0, s1), (s1, s2), (s2, s3), ...

    Inputs with fewer than two items produce no pairs.
    """
    leading, trailing = it.tee(iterable, 2)
    # Advance the second copy by one item so it runs one step ahead of the first.
    next(trailing, None)
    return zip(leading, trailing)

def construct_data(file_path, extent, known_file_path=None, known_frame=-1, modify_frame=None):
    """Sample cropped frames from a video and standardise them into a feature matrix.

    NOTE(review): a second ``construct_data`` with a different signature is
    defined later in this same cell and rebinds the name, so this version is
    not reachable once the whole cell has run.

    Parameters:
        file_path: video to sample; every n-th frame is kept, where n grows
            with the reported frame count (one per 2000 frames, clamped to 1..30).
        extent: ``(top, left, bottom, right)`` crop applied to every frame.
        known_file_path: optional second video supplying one extra frame that
            is appended after the sampled frames.
        known_frame: frame index to take from ``known_file_path``; a negative
            value seeks to the reported frame count instead.
        modify_frame: optional callable applied to each cropped frame.

    Returns:
        ``(frames, data)`` where ``frames`` is the list of processed frames and
        ``data`` is the ``StandardScaler`` output for the flattened frames.

    Raises:
        ValueError: if the known frame cannot be read.
    """
    start_at = datetime.now()
    top, left, bottom, right = extent

    def fn(frame):
        frame = frame[top:bottom, left:right]
        return modify_frame(frame) if modify_frame else frame

    video = cv2.VideoCapture(file_path, cv2.CAP_MSMF)
    try:
        print(video.get(cv2.CAP_PROP_FRAME_COUNT), video.get(cv2.CAP_PROP_FPS))
        n = min(max(1, round(video.get(cv2.CAP_PROP_FRAME_COUNT) / 2000)), 30)
        frames = []
        i = 0
        while True:
            result, frame = video.read()
            if not result:
                break
            if i % n == 0:
                frames.append(fn(frame))
            i += 1
    finally:
        # Release the capture even when cropping or modify_frame raises.
        video.release()

    if known_file_path:
        # Add a known positive frame.
        video = cv2.VideoCapture(known_file_path, cv2.CAP_MSMF)
        try:
            print(video.get(cv2.CAP_PROP_FRAME_COUNT), video.get(cv2.CAP_PROP_FPS))
            target = video.get(cv2.CAP_PROP_FRAME_COUNT) if known_frame < 0 else known_frame
            print(video.set(cv2.CAP_PROP_POS_FRAMES, target))
            result, frame = video.read()
        finally:
            video.release()
        if not result:
            # Fail with a clear message instead of crashing on a None frame.
            raise ValueError('cannot read known frame %r from %r' % (target, known_file_path))
        frames.append(fn(frame))

    data = StandardScaler().fit_transform(np.stack([f.reshape(-1) for f in frames]))
    print(data.shape)
    print('total time:', (datetime.now() - start_at).total_seconds(), 'seconds')
    return frames, data

def print_clusters(frames, cluster_labels, has_known_cluster=False):
    """Print a summary of the clustering and plot up to five sample frames per cluster.

    Parameters:
        frames: sequence of equally shaped image arrays, parallel to
            ``cluster_labels``; channel order is reversed for plotting.
        cluster_labels: one label per frame.
        has_known_cluster: when true, the last label is reported as the
            desired cluster (a negative label is reported as unknown).

    Each cluster becomes one row of the plotted mosaic. Rows shorter than the
    widest one are padded with black tiles so clusters with fewer samples
    (for example a small negative-label group) no longer break ``np.vstack``.
    """
    unique_labels = np.unique(cluster_labels)
    print('cluster count:', len(unique_labels))
    print(unique_labels)
    print(cluster_labels[:22])
    print(*enumerate(cluster_labels[:22]))
    print(*((k, len(list(v))) for k, v in it.groupby(np.sort(cluster_labels))))
    if has_known_cluster:
        if cluster_labels[-1] < 0:
            print('desired cluster unknown')
        else:
            print('desired cluster:', cluster_labels[-1])

    rows = []
    for label in unique_labels:
        members = (i for i, j in enumerate(cluster_labels) if j == label)
        rows.append([frames[i] for i in it.islice(members, 5)])
    if not rows:
        # Nothing to plot for an empty clustering.
        return
    widest = max(len(row) for row in rows)
    strips = []
    for row in rows:
        padding = [np.zeros_like(row[0])] * (widest - len(row))
        strips.append(np.hstack(row + padding))
    # Reverse the channel axis (BGR -> RGB) before handing the mosaic to matplotlib.
    plt.imshow(np.vstack(strips).take([2, 1, 0], axis=2))

def predict_cluster_labels(data, min_cluster_size=22):
    """Cluster ``data`` with HDBSCAN and return the label assigned to each row.

    ``min_cluster_size`` is forwarded to ``hdbscan.HDBSCAN``. The elapsed
    wall-clock time is printed before returning.
    """
    began = datetime.now()
    labels = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit_predict(data)
    elapsed = datetime.now() - began
    print('total time:', elapsed.total_seconds(), 'seconds')
    return labels

def construct_data(input_file_path, output_file_path, extent, resize_factor=None):
    """Crop (and optionally resize) every frame of a video and pickle the frames to a file.

    NOTE(review): this rebinds ``construct_data``, which is also defined with a
    different signature earlier in the same cell.

    Parameters:
        input_file_path: video to read.
        output_file_path: destination file; frames are written as consecutive
            pickles followed by a pickled ``None`` as an end marker.
        extent: ``(top, left, bottom, right)`` crop applied to every frame.
        resize_factor: optional scale passed to ``cv2.resize`` as both ``fx``
            and ``fy`` (area interpolation).

    Prints the number of frames written and the elapsed time.
    """
    start_at = datetime.now()
    top, left, bottom, right = extent
    n = 0
    video = cv2.VideoCapture(input_file_path, cv2.CAP_MSMF)
    try:
        # Rough duration hint derived from the reported frame count.
        print('approximately', video.get(cv2.CAP_PROP_FRAME_COUNT) // 1000, 'seconds')
        with open(output_file_path, 'wb') as fout:
            while True:
                result, frame = video.read()
                if not result:
                    break
                frame = frame[top:bottom, left:right]
                if resize_factor:
                    frame = cv2.resize(frame, (0, 0), fx=resize_factor, fy=resize_factor, interpolation=cv2.INTER_AREA)
                pickle.dump(frame, file=fout)
                n += 1
            pickle.dump(None, file=fout)
    finally:
        # Release the capture even if the output file cannot be opened or written.
        video.release()
    print(n, 'construction time:', (datetime.now() - start_at).total_seconds(), 'seconds')

def for_plt(image):
    """Return a copy of ``image`` with its first three channels in reversed order.

    Used to turn a BGR frame into the RGB order matplotlib expects.
    """
    reversed_channels = [2, 1, 0]
    return image.take(reversed_channels, axis=2)

In [2]:
# Choose the clip to inspect; the commented-out lines are alternative clips.
file_path = r"C:\Users\cidzerda\Documents\GitHub\strevr-data\tesst.mp4"
#file_path = r"C:\Users\cidzerda\Documents\GitHub\strevr-data\tesst\staycationyoutube1.mp4"
#file_path = r"C:\Users\cidzerda\Documents\GitHub\strevr-data\tesst\telshin1-cg.mp4"
# Open the clip with the CAP_MSMF backend; `video` stays open for the following cells.
video = cv2.VideoCapture(file_path, cv2.CAP_MSMF)

In [3]:
def fn(video, i):
    """Seek ``video`` to frame index ``i``, report the position, and show the frame.

    Prints the requested index next to the position the capture reports after
    the seek, then displays the frame and blocks until a key is pressed.
    Failures to seek or to read are reported with a printed message.
    """
    if not video.set(cv2.CAP_PROP_POS_FRAMES, i):
        print('cannot set frame index', i)
        return
    print(i, video.get(cv2.CAP_PROP_POS_FRAMES))
    result, frame = video.read()
    if not result:
        print('cannot show frame at frame index', i)
        return
    cv2.imshow('tesst', frame)
    cv2.waitKey(0)

n = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
print('frame count', n)
# Probe the first seven indices, then a window straddling the reported frame count,
# to see where seeking and reading stop working.
for i in it.chain(range(7), range(n - 3, n + 3)):
    fn(video, i)
# Rewind so later cells start from the first frame, and confirm the position.
video.set(cv2.CAP_PROP_POS_FRAMES, 0)
print(video.get(cv2.CAP_PROP_POS_FRAMES))
cv2.destroyAllWindows()

frame count 17940
0 0.0
1 3.0
2 3.0
3 3.0
4 4.0
5 5.0
6 6.0
17937 17937.0
17938 17938.0
17939 17939.0
17940 17940.0
17941 17941.0
cannot show frame at frame index 17941
cannot set frame index 17942
0.0


In [36]:
# Walk the whole clip from the start and report every frame that is
# pixel-identical to the most recent distinct frame.
n = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
print('frame count', n)
i = 0
last_frame = None
print(video.set(cv2.CAP_PROP_POS_FRAMES, i))
while True:
    result, frame = video.read()
    if not result:
        break
    # array_equal also copes with a shape mismatch, where `==` would not yield an array.
    if last_frame is not None and np.array_equal(last_frame, frame):
        print('duplicate frame at frame index', i)
    else:
        last_frame = frame
    i += 1
# Number of frames actually read, for comparison with the reported frame count.
print(i)
cv2.destroyAllWindows()

frame count 5845
True
duplicate frame at frame index 3941
duplicate frame at frame index 3944
duplicate frame at frame index 3947
duplicate frame at frame index 3951
duplicate frame at frame index 3954
duplicate frame at frame index 3957
duplicate frame at frame index 3960
duplicate frame at frame index 3963
duplicate frame at frame index 3966
duplicate frame at frame index 3970
duplicate frame at frame index 3973
duplicate frame at frame index 3976
duplicate frame at frame index 3979
duplicate frame at frame index 3982
duplicate frame at frame index 3985
duplicate frame at frame index 3989
duplicate frame at frame index 3992
duplicate frame at frame index 3995
duplicate frame at frame index 3998
duplicate frame at frame index 4001
duplicate frame at frame index 4004
duplicate frame at frame index 4008
duplicate frame at frame index 4011
duplicate frame at frame index 4014
duplicate frame at frame index 4017
duplicate frame at frame index 4020
duplicate frame at frame index 4023
duplic

In [4]:
# Release the capture held in `video` by the earlier cells.
video.release()

In [41]:
from collections import Counter

def make_labels(g, n):
    """Expand ``(index, label)`` change points into one label per position.

    Parameters:
        g: iterable of ``(index, label)`` pairs in ascending index order; each
            label applies from its index up to the next pair's index.
        n: number of positions to fill; the last label is repeated up to ``n``.

    Returns:
        The labels joined into one string. Its length is ``n`` unless the last
        index in ``g`` is larger, in which case the string is that long.

    Raises:
        ValueError: if positions must be filled before any pair has supplied a
            label (first index greater than 0, or ``g`` empty with ``n > 0``).
    """
    labels = []
    have_label = False
    next_label = None
    for index, label in g:
        if len(labels) < index and not have_label:
            raise ValueError('first label starts at index %r; nothing to fill earlier positions with' % (index,))
        while len(labels) < index:
            labels.append(next_label)
        next_label = label
        have_label = True
    if len(labels) < n and not have_label:
        raise ValueError('no labels supplied for %r positions' % (n,))
    while len(labels) < n:
        labels.append(next_label)
    return ''.join(labels)

# For every label file in the archive, expand its change points to per-frame
# labels (using the frame count of the matching .mp4) and print label counts.
directory_path = r'C:\Users\cidzerda\Documents\GitHub\strevr-data\archive\legends'
_, _, file_names = next(os.walk(directory_path), (None, None, []))
for s in (name for name in file_names if name.endswith('.txt')):
    file_path = os.path.join(directory_path, s)
    # Swap only the extension; str.replace('txt', 'mp4') would also rewrite
    # any other "txt" occurring in the directory or file name.
    video_path = os.path.splitext(file_path)[0] + '.mp4'
    video = cv2.VideoCapture(video_path, cv2.CAP_MSMF)
    n = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    video.release()
    with open(file_path, 'rt') as fin:
        # NOTE(review): eval executes whatever the label file contains; only
        # run this on trusted files (ast.literal_eval may be a safer fit).
        l = make_labels(eval(fin.read()), n)
    c = Counter(l)
    print(s, n, c)
    #print(s, n, *sorted((k, v) for k, v in c.items() if k != 'z' and v >= 2000))

anytimeshield.txt 14209 Counter({'z': 5603, 'b': 4661, 'r': 3202, 'i': 743})
mymisterfruit.txt 38339 Counter({'o': 21962, 'z': 10475, 'm': 5903})
nicewigg.txt 25019 Counter({'w': 20353, 'z': 4667})
nikolarn.txt 8842 Counter({'i': 6366, 'z': 2476})
shivfps.txt 15874 Counter({'b': 9774, 'z': 5722, 'p': 378})
staycationyoutube.txt 58860 Counter({'z': 27487, 'a': 21355, 'g': 5491, 'i': 4479, 'p': 48})
staycationyoutube1.txt 32065 Counter({'z': 20479, 'c': 11586})
tannerslays.txt 10560 Counter({'p': 10225, 'z': 335})
tannerslays1.txt 2760 Counter({'i': 2507, 'z': 253})
telshin-g.txt 13140 Counter({'g': 10021, 'z': 3119})
telshin-l.txt 11280 Counter({'l': 9916, 'z': 1364})
telshin-m.txt 27960 Counter({'m': 16522, 'z': 11438})
telshin-o.txt 23280 Counter({'o': 16764, 'z': 6516})
telshin1-c.txt 20940 Counter({'c': 16134, 'z': 4806})
telshin1-g.txt 20400 Counter({'g': 14425, 'z': 5975})
telshin1-l.txt 48300 Counter({'l': 27470, 'z': 20830})
telshin1-r.txt 18720 Counter({'r': 16724, 'z': 1996})


In [42]:
from collections import Counter

def make_labels(g, n):
    """Expand ``(index, label)`` change points into one label per position.

    Parameters:
        g: iterable of ``(index, label)`` pairs in ascending index order; each
            label applies from its index up to the next pair's index.
        n: number of positions to fill; the last label is repeated up to ``n``.

    Returns:
        The labels joined into one string. Its length is ``n`` unless the last
        index in ``g`` is larger, in which case the string is that long.

    Raises:
        ValueError: if positions must be filled before any pair has supplied a
            label (first index greater than 0, or ``g`` empty with ``n > 0``).
    """
    labels = []
    have_label = False
    next_label = None
    for index, label in g:
        if len(labels) < index and not have_label:
            raise ValueError('first label starts at index %r; nothing to fill earlier positions with' % (index,))
        while len(labels) < index:
            labels.append(next_label)
        next_label = label
        have_label = True
    if len(labels) < n and not have_label:
        raise ValueError('no labels supplied for %r positions' % (n,))
    while len(labels) < n:
        labels.append(next_label)
    return ''.join(labels)

# Expand the change-point label files of the selected clips into one label
# character per frame and write the result to the `legends` directory.
directory_path = r'C:\Users\cidzerda\Documents\GitHub\strevr-data\archive\legends'
output_directory_path = r'C:\Users\cidzerda\Documents\GitHub\strevr-data\legends'
file_names = 'staycationyoutube', 'shivfps', 'staycationyoutube1', 'telshin-l', 'mymisterfruit', 'tannerslays', 'telshin1-r', 'telshin1-w'
for s in file_names:
    file_path = os.path.join(directory_path, s + '.txt')
    # Build the video path from the stem; str.replace('txt', 'mp4') would
    # also rewrite any other "txt" occurring in the path.
    video = cv2.VideoCapture(os.path.join(directory_path, s + '.mp4'), cv2.CAP_MSMF)
    n = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    video.release()
    with open(file_path, 'rt') as fin:
        # NOTE(review): eval executes whatever the label file contains; only
        # run this on trusted files.
        l = eval(fin.read())
    l = make_labels(l, n)
    with open(os.path.join(output_directory_path, s + '.txt'), 'wt') as fout:
        print(l, file=fout)

In [45]:
# Dump the archive's legends.txt verbatim to inspect its contents.
file_path = r'C:\Users\cidzerda\Documents\GitHub\strevr-data\archive\legends.txt'
with open(file_path) as fin:
    print(fin.read())

p	thefyzu
gia	staycationyoutube
om	mymisterfruit
cb	staycationyoutube1
w	nicewigg
b	xednim
b	shivfps
rbi	anytimeshield
p	tannerslays
i	nikolarn
i	vsnz
i	tannerslays1



In [57]:
from collections import Counter
# Load the selected label string and the matching pickled images.
file_path = r'C:\Users\cidzerda\Documents\GitHub\strevr-data\legends\selected.txt'
with open(file_path) as fin:
    # Keep only lowercase label characters; this drops newlines and other separators.
    labels = [s for s in fin.read() if 'a' <= s <= 'z']
c = Counter(labels)
print(c)
# Swap only the extension; str.replace('txt', 'pickle') would also rewrite
# any other "txt" occurring in the path.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(os.path.splitext(file_path)[0] + '.pickle', 'rb') as fin:
    # One pickled image is expected per label.
    images = [pickle.load(fin) for _ in range(len(labels))]
print(len(images))

Counter({'o': 3030, 'm': 3030, 'b': 3030, 'p': 3030, 'g': 3030, 'i': 3030, 'a': 3030, 'c': 3030, 'l': 3030, 'r': 3030, 'w': 3030})
33330


In [83]:
def make_title_for_window():
    """Return the name shared by every OpenCV window this notebook opens."""
    window_title = 'tesst'
    return window_title

def show_images(images):
    """Tile equally shaped images into one mosaic and show it in the notebook's window.

    The column count starts from an estimate that makes the mosaic about
    1.375 times as wide as it is tall. Among the widths from that estimate up
    to (but excluding) twice the estimate, the smallest one that divides the
    image count exactly is used; if none does, the width leaving the largest
    remainder is used. Missing tiles in the last row are filled with black.

    Parameters:
        images: non-empty sequence of arrays sharing one shape.

    Returns:
        ``(height, width, ncolumns)``: the tile size and the number of tiles per row.
    """
    n = len(images)
    height, width, *_ = images[0].shape
    # Clamp to one column: for few, very wide tiles the estimate truncates to 0,
    # which used to leave no candidate widths at all.
    s = max(1, int(math.sqrt(1.375 * height * width * n) / width))
    candidates = sorted((n % i, i) for i in range(s, 2 * s))
    ncolumns = candidates[-1][1] if candidates[0][0] else candidates[0][1]
    print(height, width, s)
    print(n, ncolumns, n / ncolumns)
    # Repeating one shared iterator makes zip_longest take ncolumns consecutive tiles per row.
    tiles = [iter(images)] * ncolumns
    rows = it.zip_longest(*tiles, fillvalue=np.zeros_like(images[0]))
    cv2.imshow(make_title_for_window(), np.vstack([np.hstack(row) for row in rows]))
    return height, width, ncolumns

def inspect_images(images, desired_count):
    """Show the first ``desired_count`` images and let the user click tiles to discard them.

    A left click on a tile replaces that image with the last image of the
    working list, drops the last entry, and redraws the mosaic. Clicks that
    do not land on a displayed tile are ignored. The function blocks until a
    key is pressed.

    Parameters:
        images: list of images; it is copied, so the caller's list is untouched.
        desired_count: number of leading images to display.

    Returns:
        The working copy of the list after all removals.
    """
    images = images[:]
    window_name = make_title_for_window()
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL | cv2.WINDOW_GUI_EXPANDED)

    def mouse_callback(event, x, y, *_):
        nonlocal height, width, ncolumns
        if event != cv2.EVENT_LBUTTONDOWN:
            return
        column, row = x // width, y // height
        index = row * ncolumns + column
        shown = min(len(images), desired_count)
        # Ignore clicks outside the grid or on a padding tile; they used to
        # raise IndexError or, for negative coordinates, remove the wrong image.
        if not (0 <= column < ncolumns and row >= 0 and index < shown):
            return
        images[index] = images[-1]
        del images[-1]
        if images:
            # Refresh the layout: the column count can change once fewer than
            # desired_count images remain.
            height, width, ncolumns = show_images(images[:desired_count])

    cv2.setMouseCallback(window_name, mouse_callback)
    height, width, ncolumns = show_images(images[:desired_count])
    print(cv2.getWindowImageRect(window_name))
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    return images

# Review the images labelled 'b': show the first 3000, report how many remain
# after manual removal, then the size of the untouched selection.
l = [image for image, label in zip(images, labels) if label == 'b']
print(len(inspect_images(l, 3000)))
print(len(l))

30 35 59
3000 60 50.0
(-710, -1408, 2100, 1500)
30 35 59
3000 60 50.0
30 35 59
3000 60 50.0
3028
3030


In [73]:
import math
# Browse the first 3211 images one label at a time: each label's images are
# tiled into a mosaic; any key advances to the next label and 'q' stops.
x = images[:3211]
# Tile size taken from the data instead of being hard-coded.
tile_height, tile_width = images[0].shape[:2]
def mouse_callback(event, x, y, *_):
    # Print the click position and the index of the tile under it
    # (w is the column count of the mosaic currently shown).
    if event == cv2.EVENT_LBUTTONDOWN:
        print(x, y, (y // tile_height) * w + x // tile_width)
cv2.namedWindow('tesst', cv2.WINDOW_NORMAL | cv2.WINDOW_GUI_EXPANDED)
cv2.setMouseCallback('tesst', mouse_callback)
for s in c:
    l = [i for i, l in zip(x, labels) if l == s]
    if not l:
        # No image with this label in the slice; the layout below needs at least one.
        continue
    n = len(l)
    m = int(math.sqrt(n))
    # Candidate widths m..2m-1 sorted by remainder: use the smallest exact
    # divisor, otherwise the width leaving the largest remainder.
    s = sorted((n % i, i) for i in range(m, 2 * m))
    w = s[-1][1] if s[0][0] else s[0][1]
    g = [iter(l)] * w
    g = it.zip_longest(*g, fillvalue=np.zeros_like(images[0]))
    cv2.imshow('tesst', np.vstack([np.hstack(g) for g in g]))
    key = cv2.waitKey(0)
    # A negative code means no key was delivered; chr() would raise on it.
    if key < 0 or chr(key) == 'q':
        break
cv2.destroyAllWindows()

In [18]:
# Check the array shape of a single loaded image.
images[0].shape

(30, 35, 3)

In [47]:
import math
# Scratch check of the grid layout: choose a column count w for n images and
# confirm that every row produced by zip_longest holds exactly w entries.
x = images[:3029]
n = len(x)
m = int(math.sqrt(n))
# Candidate widths m..2m-1, sorted by (n % width, width).
s = sorted((n % i, i) for i in range(m, 2 * m))
# Use the smallest width dividing n exactly; otherwise the width with the largest remainder.
w = s[-1][1] if s[0][0] else s[0][1]
print(n, w, n / w)
# Repeating one shared iterator w times makes zip_longest take w consecutive images per row.
g = [iter(x)] * w
g = it.zip_longest(*g, fillvalue=np.zeros_like(images[0]))
#print(len(list(g)))
# Length of every row, including the padded last one.
g = (len(list(g)) for g in g)
print(*g)

3029 101 29.99009900990099
101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101 101


In [8]:
# Scratch comparison of ways to combine per-group (value, label) pairs.
# 1) Zip each (values, labels) pair inside the generator, then flatten group by group.
l = [([1,2,3], it.repeat('a', 9)), ([4,5,6], it.repeat('b', 9))]
print(*(it.chain.from_iterable(zip(a, b) for a, b in l)))
# 2) zip(*l) interleaves instead: each output tuple takes one pair from every group.
l = [zip([1,2,3], it.repeat('a', 9)), zip([4,5,6], it.repeat('b', 9))]
print(*zip(*l))
# 3) Rebuild the zip objects (the previous ones are exhausted) and flatten them directly.
l = [zip([1,2,3], it.repeat('a', 9)), zip([4,5,6], it.repeat('b', 9))]
print(*it.chain.from_iterable(l))

(1, 'a') (2, 'a') (3, 'a') (4, 'b') (5, 'b') (6, 'b')
((1, 'a'), (4, 'b')) ((2, 'a'), (5, 'b')) ((3, 'a'), (6, 'b'))
(1, 'a') (2, 'a') (3, 'a') (4, 'b') (5, 'b') (6, 'b')


In [6]:
# Read every pickled object in the file; EOFError marks the end of the stream.
# NOTE(review): if this file ends with a pickled None marker it will be
# included in `images` — confirm how the file was written.
images = []
with open(r"C:\Users\cidzerda\Documents\GitHub\strevr-data\idle\selected.pickle", 'rb') as fin:
    while True:
        try:
            images.append(pickle.load(fin))
        except EOFError:
            break
len(images)

3000

In [None]:
from datetime import timedelta
# Step through a clip frame by frame with keyboard navigation:
#   'n' jumps to the start of the next label run, ',' steps back one frame,
#   'q' quits, any other key advances to the next frame.
label_file_path = r"C:\Users\cidzerda\Documents\GitHub\strevr-data\idle\staycationyoutube1.txt"
video_file_path = r"C:\Users\cidzerda\Documents\GitHub\strevr-data\idle\staycationyoutube1.mp4"
with open(label_file_path, 'rt') as fin:
    labels = [s for s in fin.read() if 'a' <= s <= 'z']
# Total number of frames whose label is not 'z'.
l = [len(list(v)) for k, v in it.groupby(labels) if k != 'z']
print(sum(l))
# Run-length encode the labels: (label, run length).
l = [(k, len(list(v))) for k, v in it.groupby(labels)]
#print(l)
# Turn run lengths into cumulative end indices: (label, index just past the run).
l = list(it.accumulate(l, lambda a, b: (b[0], a[1] + b[1])))
#print(l)
l = list(zip(*l))
#print(l)
# Shift the labels by one so each end index is paired with the label of the
# run that starts there; the final end index has no successor and is dropped.
g = iter(l[0])
next(g)
l[0] = g
l = list(zip(*l))
#print(l)
g = iter(l)
# Once every run start has been visited, 'n' keeps yielding ('-', 0).
g = it.chain(g, it.repeat(('-', 0)))
video = cv2.VideoCapture(video_file_path, cv2.CAP_MSMF)
try:
    while True:
        result, frame = video.read()
        if not result:
            break
        cv2.imshow('tesst', frame)
        key = cv2.waitKey(0)
        if key < 0:
            # No key was delivered; chr() would raise on a negative code.
            break
        ch = chr(key)
        if ch == 'q':
            break
        if ch == 'n':
            s, i = next(g)
            print(s, i, timedelta(seconds=i / video.get(cv2.CAP_PROP_FPS)))
            video.set(cv2.CAP_PROP_POS_FRAMES, i)
        elif ch == ',':
            # read() already advanced past the shown frame, so go back two to show the previous one.
            video.set(cv2.CAP_PROP_POS_FRAMES, video.get(cv2.CAP_PROP_POS_FRAMES) - 2)
finally:
    # Free the capture and the window even if navigation fails part-way.
    video.release()
    cv2.destroyAllWindows()

1070
c 16870 0:09:25.033658
z 17289 0:09:39.067393
c 17299 0:09:39.402327
z 17950 0:10:01.206530
- 0 0:00:00


In [26]:
# Scratch arithmetic: each line prints a timestamp string, a number, and the
# difference of two nearby numbers.
# NOTE(review): these look like clip timestamps, frame indices and segment lengths — confirm.
print('4:50.651', 8738, 8943 - 8739)
print('4:58.002', 8959, 9119 - 8960)
print('29:07.532', 52537, 52613 - 52538)
print('29:10.460', 52625, 52672 - 52626)

4:50.651 8738 204
4:58.002 8959 159
29:07.532 52537 75
29:10.460 52625 46


In [5]:
from collections import Counter
# Count how often each lowercase label character occurs in every label file.
label_file_paths = [
    r"C:\Users\cidzerda\Documents\GitHub\strevr-data\idle\staycationyoutube1.txt",
    r"C:\Users\cidzerda\Documents\GitHub\strevr-data\idle\vsnz1.txt",
]
for label_file_path in label_file_paths:
    with open(label_file_path, 'rt') as fin:
        label_counts = Counter(filter(lambda ch: 'a' <= ch <= 'z', fin.read()))
    print(label_file_path, label_counts)

C:\Users\cidzerda\Documents\GitHub\strevr-data\idle\staycationyoutube1.txt Counter({'z': 30995, 'c': 1070})
C:\Users\cidzerda\Documents\GitHub\strevr-data\idle\vsnz1.txt Counter({'z': 58403, 'x': 7090, 'y': 569})
