# In [1]:
#------------- All the packages and paths
import pytesseract
import cv2
from object_detection.utils import ops as utils_ops
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import peakutils
import Levenshtein

from distutils.version import StrictVersion
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image

from utils import label_map_util
from utils import visualization_utils as vis_util

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import pandas as pd

import statistics as stats
# This is needed since the notebook is stored in the object_detection folder.
sys.path.append("..")


# pytesseract is only a wrapper: it shells out to a locally installed Tesseract
# executable, so its location has to be configured explicitly here.
# Tesseract Repo: https://github.com/tesseract-ocr/tessdoc
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'



# Path to the frozen detection graph (.pb). This is the actual model that is
# used for the object detection; it is parsed into `detection_graph` below.
PATH_TO_CKPT = "C:/tensorflow1/models/research/object_detection/inference_graph/frozen_inference_graph.pb"

# Sample video path. NOTE: the batch loop at the bottom of the file rebinds
# `path_to_vid` for every video it processes, so this value is only a default.
path_to_vid = "C:/tensorflow1/models/research/object_detection/own_testing/test3.mp4"

#-------------- Load a (frozen) Tensorflow model into memory.

# Build a dedicated graph and import the serialized (frozen) model into it.
# `detection_graph` is what later gets passed to detect_multiple_images().
detection_graph = tf.Graph()
with detection_graph.as_default():
    od_graph_def = tf.GraphDef()
    # The .pb file is a binary protobuf, hence the 'rb' mode.
    with tf.gfile.GFile(PATH_TO_CKPT, 'rb') as fid:
        serialized_graph = fid.read()
        od_graph_def.ParseFromString(serialized_graph)
        # name='' imports the ops without a name prefix, so tensors can be
        # fetched by their plain names (e.g. 'image_tensor:0').
        tf.import_graph_def(od_graph_def, name='')

#------------- Loading label map

# Label map, written out by hand instead of being loaded from a file.
# Keys are the 1-based class ids produced by the detector; every entry repeats
# its id next to the human-readable label name.
category_index = {
    label_id: {'id': label_id, 'name': label_name}
    for label_id, label_name in enumerate(
        ('app', 'activity', 'not_over', 'not_app',
         'pick_over', 'pick_app', 'use_over', 'week'),
        start=1)
}

#------------- Object Detection Functions



def _build_reframed_masks(tensors, height, width):
    """Build the op that maps box-relative masks onto a height x width image."""
    # The following processing is only for a single image (batch size 1).
    detection_boxes = tf.squeeze(tensors['detection_boxes'], [0])
    detection_masks = tf.squeeze(tensors['detection_masks'], [0])
    # Reframe is required to translate masks from box coordinates to image
    # coordinates and fit the image size.
    real_num_detection = tf.cast(tensors['num_detections'][0], tf.int32)
    detection_boxes = tf.slice(
        detection_boxes, [0, 0], [real_num_detection, -1])
    detection_masks = tf.slice(
        detection_masks, [0, 0, 0], [real_num_detection, -1, -1])
    detection_masks_reframed = utils_ops.reframe_box_masks_to_image_masks(
        detection_masks, detection_boxes, height, width)
    detection_masks_reframed = tf.cast(
        tf.greater(detection_masks_reframed, 0.5), tf.uint8)
    # Follow the convention by adding back the batch dimension.
    return tf.expand_dims(detection_masks_reframed, 0)


def detect_multiple_images(images, graph):
    """Run the detection graph on every image inside one TF session.

    All images share a single session, which saves the cost of initializing
    a session per image.

    Args:
        images: iterable of image arrays; each one is fed to the graph's
            'image_tensor:0' input with a batch dimension added in front.
        graph: tf.Graph that already contains the imported detection model.

    Returns:
        A list with one dict per image, in input order. Each dict holds
        'num_detections' (int), 'detection_classes' (uint8 array),
        'detection_boxes' and 'detection_scores' (first batch entry of the
        raw outputs) and, only if the graph exposes it, 'detection_masks'.
    """
    output_keys = ('num_detections', 'detection_boxes', 'detection_scores',
                   'detection_classes', 'detection_masks')
    output_dicts = []
    with graph.as_default():
        default_graph = tf.get_default_graph()
        # None of these lookups depend on the image, so do them once instead
        # of walking every op of the graph again for each frame.
        all_tensor_names = {
            output.name
            for op in default_graph.get_operations()
            for output in op.outputs}
        base_tensors = {
            key: default_graph.get_tensor_by_name(key + ':0')
            for key in output_keys
            if key + ':0' in all_tensor_names}
        image_tensor = default_graph.get_tensor_by_name('image_tensor:0')
        # Mask reframing depends on the image size. Cache the op per size so
        # the graph does not grow by a fresh set of ops for every image.
        mask_tensors_by_size = {}

        with tf.Session() as sess:
            for image in images:
                tensor_dict = dict(base_tensors)
                if 'detection_masks' in base_tensors:
                    size = (image.shape[0], image.shape[1])
                    if size not in mask_tensors_by_size:
                        mask_tensors_by_size[size] = _build_reframed_masks(
                            base_tensors, size[0], size[1])
                    tensor_dict['detection_masks'] = mask_tensors_by_size[size]

                # Run inference
                output_dict = sess.run(
                    tensor_dict,
                    feed_dict={image_tensor: np.expand_dims(image, 0)})

                # All outputs are float32 numpy arrays, so convert types as
                # appropriate and strip the batch dimension.
                output_dict['num_detections'] = int(
                    output_dict['num_detections'][0])
                output_dict['detection_classes'] = output_dict[
                    'detection_classes'][0].astype(np.uint8)
                output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
                output_dict['detection_scores'] = output_dict['detection_scores'][0]
                if 'detection_masks' in output_dict:
                    output_dict['detection_masks'] = output_dict['detection_masks'][0]
                output_dicts.append(output_dict)
    return output_dicts

def get_abs_coord(image_np, output_dict, thresh):
    """Turn the relative detection boxes of one image into absolute ones.

    Delegates to `vis_util.return_coordinates`, passing the squeezed boxes,
    classes (as int32) and scores of `output_dict` together with the global
    `category_index`. Only detections scoring at least `thresh` are meant to
    be kept.

    Returns whatever `vis_util.return_coordinates` returns; callers in this
    file index each entry as [0]:[1] (rows), [2]:[3] (columns) and [5]
    (label id) -- the exact layout is defined by that helper, verify there.
    """
    boxes = np.squeeze(output_dict['detection_boxes'])
    classes = np.squeeze(output_dict['detection_classes']).astype(np.int32)
    scores = np.squeeze(output_dict['detection_scores'])
    return vis_util.return_coordinates(
        image_np,
        boxes,
        classes,
        scores,
        category_index,
        use_normalized_coordinates=True,
        line_thickness=8,
        skip_labels=False,
        min_score_thresh=thresh)

#------------ Reading the frames

# Helper functions for key frame detection
def scale(img, xScale, yScale):
    """Resize `img` by the factors `xScale` / `yScale` (area interpolation)."""
    return cv2.resize(
        img, None, fx=xScale, fy=yScale, interpolation=cv2.INTER_AREA)

def convert_frame_to_grayscale(frame):
    """Return `(grayframe, blurred)` for a BGR frame, or `(None, None)`.

    `grayframe` is the plain grayscale image; `blurred` is the same image
    after a 9x9 Gaussian blur. A `None` frame yields `(None, None)`.
    """
    if frame is None:
        return None, None
    # Both scale() calls use a factor of 1, i.e. they keep the original size.
    gray = scale(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), 1, 1)
    grayframe = scale(gray, 1, 1)
    blurred = cv2.GaussianBlur(gray, (9, 9), 0.0)
    return grayframe, blurred

def vid_to_np(path, frame_skip):
    """Load every `frame_skip`-th frame of a video as an RGB numpy array.

    A simple sampler; `keyframeDetection` below is the more sophisticated
    alternative.

    Args:
        path: path of the video file.
        frame_skip: distance (in frames) between two sampled frames; must be
            at least 1.

    Returns:
        List of RGB frames taken at positions 0, frame_skip, 2*frame_skip, ...
        while the position stays below `frame_count - frame_skip`. The list
        is empty if the video cannot be opened or read.

    Raises:
        ValueError: if `frame_skip` is smaller than 1 (the read position
            would never advance).
    """
    if frame_skip < 1:
        raise ValueError("frame_skip must be a positive integer")

    cap = cv2.VideoCapture(path)
    frames = []
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        position = 0
        while cap.isOpened() and position < (length - frame_skip):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is decodable;
                # stop instead of handing None to cvtColor.
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            position += frame_skip
            # Jump straight to the next frame that should be sampled.
            cap.set(cv2.CAP_PROP_POS_FRAMES, position)
    finally:
        cap.release()
    return frames

def keyframeDetection(source, Thres):
    """Pick the key frames of a video based on frame-to-frame differences.

    Every frame is converted to blurred grayscale and compared with its
    predecessor; the number of differing pixels forms a signal whose peaks
    (after baseline removal) mark the key frames.

    Args:
        source: path of the video file.
        Thres: peak threshold handed to `peakutils.indexes` -- how dissimilar
            two frames have to be for the later one to be included.
            Possible value range: 0 - 1.

    Returns:
        List of RGB frames at the detected peaks; empty if the video cannot
        be opened or no frame could be read.
    """
    cap = cv2.VideoCapture(source)
    if not cap.isOpened():
        print("Error opening video file")
        cap.release()
        return []

    diff_magnitudes = []
    full_color = []
    last_blur = None
    try:
        length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # NOTE: range(1, length) reads at most length - 1 frames; kept as is
        # so the detected peaks stay the same as before.
        for _ in range(1, length):
            ret, frame = cap.read()
            if not ret:
                break
            _, blur_gray = convert_frame_to_grayscale(frame)
            full_color.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            if last_blur is None:
                # First frame: compare it with itself (difference of 0)
                # rather than relying on the reported frame position.
                last_blur = blur_gray
            # NOTE(review): cv2.subtract saturates at 0, so only pixels that
            # got brighter are counted; cv2.absdiff would count both
            # directions but would require re-tuning the thresholds.
            diff = cv2.subtract(blur_gray, last_blur)
            diff_magnitudes.append(cv2.countNonZero(diff))
            # Current frame becomes last frame
            last_blur = blur_gray
    finally:
        cap.release()

    if not diff_magnitudes:
        return []

    y = np.array(diff_magnitudes)
    # Remove the slowly varying baseline, then index the frames whose
    # difference to the previous frame meets the threshold.
    base = peakutils.baseline(y, 2)
    indices = peakutils.indexes(y - base, Thres, min_dist=1)
    final = [full_color[x] for x in indices]

    cv2.destroyAllWindows()
    return final


#---------------- OCR functions

def app_to_text(img, leftSide, blurry, trail, v_type):
    """OCR one cropped app row and return pytesseract's word-level DataFrame.

    Reads the module-level globals `ph1` (mean video brightness) and
    `black_point` (binarization threshold), which the batch loop sets.

    Args:
        img: 3-channel RGB crop of an app row.
        leftSide: number of pixel columns cut off on the left after resizing
            (removes the app logo).
        blurry: kernel size of the median blur applied before thresholding.
        trail: number of pixel columns cut off on the right after resizing
            (removes the overall time, ~150px); 0 keeps everything.
        v_type: "weekly" selects adaptive thresholding; anything else uses a
            fixed threshold at `black_point`.

    Returns:
        The DataFrame produced by `pytesseract.image_to_data`.
    """
    height, width, _ = img.shape
    # Dark videos are inverted so the text ends up dark on a light background.
    if ph1 < 120:
        img = cv2.bitwise_not(img)

    # Every crop is brought to a height of 150px, keeping the aspect ratio.
    new_height = 150
    new_width = int(width*(150/height))
    img = cv2.resize(img, (new_width, new_height),
                     interpolation=cv2.INTER_LANCZOS4)

    # Pre-processing for better OCR: drop the logo on the left and the
    # trailing part on the right, then grayscale and denoise.
    img = img[0:new_height, leftSide:(new_width - trail)]
    gray = cv2.medianBlur(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), blurry)

    if v_type == "weekly":
        binary = cv2.adaptiveThreshold(
            gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 19, 9)
        binary = cv2.medianBlur(binary, 5)
    else:
        _, binary = cv2.threshold(gray, black_point, 255, cv2.THRESH_BINARY)

    return pytesseract.image_to_data(
        binary, output_type=pytesseract.Output.DATAFRAME)

def activ_to_text(img):
    """OCR the cropped 'activity' (time) heading and return it as a string.

    Reads the module-level globals `ph1`, `black_point` and `replacements`,
    which the batch loop sets. `replacements` is a list of
    (wrong, right) pairs applied in order to the recognized text.
    """
    height, width, _ = img.shape
    # Dark videos are inverted so the text ends up dark on a light background.
    if ph1 < 120:
        img = cv2.bitwise_not(img)

    # Every crop is brought to a height of 150px, keeping the aspect ratio.
    target_size = (int(width*(150/height)), 150)
    img = cv2.resize(img, target_size, interpolation=cv2.INTER_LANCZOS4)

    # Pre-processing for better OCR: grayscale, denoise, binarize.
    gray = cv2.medianBlur(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), 3)
    _, binary = cv2.threshold(gray, black_point, 255, cv2.THRESH_BINARY)

    # Detect the text, then patch up the common OCR mistakes.
    texter = pytesseract.image_to_string(binary)
    for wrong, right in replacements:
        texter = texter.replace(wrong, right)
    return texter

def process_week_app(images):
    """OCR notification / pick-up app rows into unique [name, count] pairs.

    For each image the app name is assembled from the words of the first
    block and line; the value is the text of the last OCR row. An image is
    skipped when it yields fewer than two rows, no name words, a mean name
    confidence below 90, or a value below 60 in the second-to-last column of
    the last row (the OCR confidence in pytesseract's standard column
    layout -- verify). Apps whose name is more than 75% similar to one
    already collected are dropped as duplicates.

    Reads the module-level global `video_type`.
    """
    week_apps = []
    for image in images:
        s = app_to_text(image, 130, 3, 0, video_type).dropna()
        if len(s) < 2:
            continue

        # The name sits in the first block and line of the OCR output.
        name_rows = [row for _, row in s.iterrows()
                     if row["block_num"] == 1 and row["line_num"] == 1]
        # If there's nothing, go to the next one
        if not name_rows:
            continue

        # If unsure about the name or about the last row, go to the next one
        name_conf = stats.mean([row["conf"] for row in name_rows])
        if name_conf < 90 or s.iloc[-1, -2] < 60:
            continue

        app_name = " ".join([row["text"] for row in name_rows])
        # The number is taken from the last row of the df
        app_time = s.iloc[-1, -1]

        # Only keep the app if no sufficiently similar name is in the list
        already_seen = any(
            fuzz.ratio(app_name, known[0]) > 75 for known in week_apps)
        if not already_seen:
            week_apps.append([app_name, app_time])
    return week_apps

def process_app(app_img):
    """OCR one app row and split it into the app name and its usage times.

    Words from the first block and line form the name; everything else is
    treated as time by exclusion. Importantly, the mean recognition
    confidence of the name words is returned alongside the name.

    Reads the module-level globals `video_type`, `digit_repl` and
    `app_replacements`; the two tables are lists of (wrong, right) pairs.

    Returns:
        `[(name, conf), [time, time2 if exists]]`, where the time list is
        the joined time text split on "-" (on-screen vs. background time).
        If no name words were recognized, `[("", 0), 0]` is returned.
    """
    # We get a pandas dataframe with one row per recognized word
    s = app_to_text(app_img, 140, 5, 150, video_type)
    s = s.dropna()

    app_name = []
    app_name_conf = []
    app_time = []
    # Iterate over the rows of the df, to get the name and the time
    for _, row in s.iterrows():
        text = row["text"]
        # For the name, it's the first block and line
        if row["block_num"] == 1 and row["line_num"] == 1:
            app_name.append(text)
            # Also getting OCR confidence for the name
            app_name_conf.append(row["conf"])
            continue
        # Everything else is time by exclusion. Common digit recognition
        # mistakes are only fixed in short words, which are probably
        # time indicators.
        if len(text) < 4:
            # Chain the replacements: each one works on the result of the
            # previous one. (Restarting from the raw text every time would
            # keep only the last replacement.)
            for wrong, right in digit_repl:
                text = text.replace(wrong, right)
        app_time.append(text)

    if not app_name_conf:
        return [("", 0), 0]

    app_name_conf = stats.mean(app_name_conf)
    app_name = " ".join(app_name)
    app_time = " ".join(app_time)

    for wrong, right in app_replacements:
        # Replacing the common mistakes
        app_time = app_time.replace(wrong, right)
    # Splitting the on-screen and bg times
    app_time = app_time.split("-")
    return [(app_name, app_name_conf), app_time]

def apps_this_hour(hour):
    """Return the unique, trustworthy apps of one hour list.

    `hour[0]` is the 'activity' entry and is skipped; every later entry
    carries the app image at index 1. An app is kept when its name
    confidence is at least 90, its name is not blank, its first time string
    is at most 13 characters long, and no app with a more than 75% similar
    name was kept before.
    """
    kept = []
    for entry in hour[1:]:
        app = process_app(entry[1])
        (name, conf), times = app
        # Only consider apps with confidence of 90 and above
        if conf < 90:
            continue
        # Empty name, or something is wrong with the times
        if not name.strip() or len(times[0]) > 13:
            continue
        # Skip the app if a similar name is already in the list
        if any(fuzz.ratio(name, other[0][0]) > 75 for other in kept):
            continue
        kept.append(app)
    return kept

def photos_to_dicts(photos, coordinates):
    """Group the detected app crops by the hour heading they appear under.

    Uses the extracted key frames and the coordinates of their bounding
    boxes. Output layout:
      first layer  - hours
      second layer - first element is the time entry, all subsequent - apps
      third layer  - first element is the label id, second the heading text
                     (time entry) or the app image (app entries)
    i.e. output[i][0][1] is the time text and output[i][k][1] (k >= 1) an
    app image.
    """
    hours = []
    # Take each screen and the respective box coordinates
    for idx, photo in enumerate(photos):
        for box in coordinates[idx]:
            label = box[5]
            if label == 2:
                # 'activity' box: read the heading and check that it looks
                # like it should
                heading = activ_to_text(photo[box[0]:box[1], box[2]:box[3]])
                best_match = max(
                    fuzz.ratio(heading, "ACTIVITY (00:00-00:00)"),
                    fuzz.ratio(heading, "ACTIVITY BY APP (00:00-00:00)"))
                if best_match <= 80:
                    continue
                # Open a new hour the first time a heading is encountered,
                # or when the heading differs from the latest one
                if not hours or fuzz.ratio(heading, hours[-1][0][1]) < 99:
                    hours.append([[label, heading]])
            elif label == 1 and hours:
                # 'app' box: attach the crop to the latest 'activity'
                crop = photo[box[0]:box[1], box[2]:box[3]]
                hours[-1].append([label, crop])
    return hours

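# NOTE: the six lines below are not part of this program. They look like
# stderr residue from the notebook export (source lines quoted by NumPy
# FutureWarnings) and are kept as comments so the file stays valid Python.
#   _np_qint8 = np.dtype([("qint8", np.int8, 1)])
#   _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
#   _np_qint16 = np.dtype([("qint16", np.int16, 1)])
#   _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
#   _np_qint32 = np.dtype([("qint32", np.int32, 1)])
#   np_resource = np.dtype([("resource", np.ubyte, 1)])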


# Applying the Code

# In [None]:
# Batch driver: runs key-frame extraction, object detection and OCR on every
# video in the input folder and stores one result per video in `outputs`
# (a dict of statistics for weekly videos, a DataFrame for daily ones).

# Common OCR mistakes and their fix. These tables are read as globals by
# activ_to_text() and process_app(); they never change, so they are defined
# once instead of once per video.
replacements = [("-—", "-"), ("—-", "-"), ("--", "-"), ("—", "-"),
                ("{", "("), ("}", ")"), (";", ":")]

digit_repl = [("I", "1"), ("i", "1"), ("e", "3")]

app_replacements = [("Im ", "1m "), ("im ", "1m"),
                    ("-—", "-"), ("—-", "-"), ("--", "-"), ("—", "-")]

all_dirs = os.listdir("D:/SinSu/all_videos")
outputs = {}
for index, direct in enumerate(all_dirs):
    path_to_vid = "D:/SinSu/all_videos/" + direct

    # Loading a sample of frames to determine the brightness of the image
    check_frames = vid_to_np(path_to_vid, 60)
    if not check_frames:
        # Unreadable or too short: the mean below would fail on no data.
        print("could not read any frames, skipping")
        continue
    # `ph1` (mean brightness) is read as a global by the OCR functions.
    ph1 = stats.mean([np.mean(x) for x in check_frames])

    # Based on the brightness, make some adjustments.
    # Needed since different image processing happens for light vs. dark
    # videos: key-frame threshold 0.21 / black point 215 for dark,
    # 0.31 / 228 for light.
    if ph1 < 120:
        keyframe_thresh, black_point, brightness_msg = 0.21, 215, "dark video"
    else:
        keyframe_thresh, black_point, brightness_msg = 0.31, 228, "light video"
    photos = keyframeDetection(path_to_vid, keyframe_thresh)
    print("frames nr.: {}".format(len(photos)))
    print(brightness_msg)

    print("detecting items")
    dicts = detect_multiple_images(photos, detection_graph)

    # Now we have bounding boxes, we can get the (coordinates of)
    # the parts of the frame that correspond to them.
    # We also keep the label of the box and the confidence level.
    print("detection successful")
    coordinates = [get_abs_coord(photo, output_dict, 0.99)
                   for photo, output_dict in zip(photos, dicts)]

    # Count the occurrences of each label to decide what kind of video it is
    labels = [part[5] for coord in coordinates for part in coord]
    d = {x: labels.count(x) for x in set(labels)}

    if len(d) < 2 or len(labels) < 200:
        print("failed to detect sufficient info, exiting")
        continue

    if 2 not in d or 1 not in d:
        video_type = "weekly"
    elif 3 not in d or 5 not in d:
        video_type = "daily"
    elif 7 not in d:
        video_type = "daily"
    else:
        # Labels 4 and 6 are not guaranteed to be present at this point,
        # so count a missing label as 0 instead of raising KeyError.
        if (d[1] + d[2]) < (d.get(4, 0) + d.get(6, 0)):
            video_type = "weekly"
        else:
            video_type = "daily"

    print("type of video: {}".format(video_type))

    # OCR
    ## Weekly Videos first
    if video_type == "weekly":
        pickup_over = []

        # Separate the detected boxes by label:
        # 3 not_over, 4 not_app, 5 pick_over, 6 pick_app, 7 use_over
        crops_by_label = {3: [], 4: [], 5: [], 6: [], 7: []}
        for photo, boxes in zip(photos, coordinates):
            for part in boxes:
                if part[5] in crops_by_label:
                    crops_by_label[part[5]].append(
                        photo[part[0]:part[1], part[2]:part[3]])
        not_over = crops_by_label[3]
        not_app = crops_by_label[4]
        pick_over = crops_by_label[5]
        pick_app = crops_by_label[6]
        week_over = crops_by_label[7]

        print("getting pick ups")
        # Since they are very repetitive, only use the first 100 occurrences.
        ## Saves time when processing and sorting
        pickups = process_week_app(pick_app[0:100])
        print("getting notifications")
        notes = process_week_app(not_app[0:100])

        notification = []
        for image in not_over:
            s = app_to_text(image, 0, 5, 0, video_type)
            s = s.dropna()
            if len(s) == 0:
                continue
            # Keep the last word only if the value in the second-to-last
            # column (presumably the OCR confidence) is high enough.
            if s.iloc[-1, -2] > 80:
                notification.append(s.iloc[-1, -1])
        notification = list(set(notification))

        for image in pick_over[0:15]:
            s = app_to_text(image, 0, 5, 0, video_type)
            s = s.dropna()
            # The last three rows are read below; shorter (or empty) OCR
            # results would crash the mean / the indexing, so skip them.
            if len(s) < 3:
                continue
            if stats.mean(s["conf"]) > 95:
                day = (s.iloc[-3, -1], s.iloc[-2, -1])
                total = s.iloc[-1, -1]
                pickup_over.append(day)
                pickup_over.append(total)
                # The first confident reading is enough
                break

        # Save the results
        weekly_stats = {"app_notes": notes, "pick_ups": pickups,
                        "overall_notes": notification, "overall_pick_ups": pickup_over}

        outputs[direct] = weekly_stats

    else:
        ## Get the apps into hourly lists
        print("sorting the apps by hour")
        sorted_dicts = photos_to_dicts(photos, coordinates)

        ## App Processing: hour heading text -> unique apps of that hour
        apps = {}
        for hour in sorted_dicts:
            this_hour = apps_this_hour(hour)
            apps[hour[0][1]] = this_hour

        ## Dataframe Output
        print("making a data frame")
        time_list = []
        app_list = []
        screen_list = []
        backg_list = []
        for key, values in apps.items():
            for value in values:
                times = value[-1]
                time_list.append(key)
                app_list.append(value[0][0])
                screen_list.append(times[0])
                # A second entry only exists if a background time was found
                backg_list.append(times[1] if len(times) > 1 else "NA")

        ss = pd.DataFrame({"time": time_list, "app": app_list,
                           "screen": screen_list, "backg": backg_list})
        ss = ss[['time', 'app', 'screen', 'backg']]

        outputs[direct] = ss

    print(index)
    print("---------------")