In [None]:
# Mount Google Drive at /content/drive (this import is only available on Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Alternative install commands for pafy / youtube-dl, currently disabled:
# !pip install git+https://github.com/Cupcakus/pafy
# !pip install youtube-dl

# Install pafy and youtube-dl (youtube-dl is pinned to 2020.12.2)
!pip install pafy
!pip install youtube-dl==2020.12.2

In [None]:
# Work from the project folder on the mounted Drive so that relative paths
# (the last cell writes ./dataedu.csv) resolve inside it.
%cd /content/drive/MyDrive/python

import pafy

In [None]:
# Install the easyocr package
!pip install easyocr

# Install the py-hanspell library straight from its GitHub repository
!pip install git+https://github.com/ssut/py-hanspell.git

# Install imutils, OpenCV and scikit-image
!pip install imutils opencv-python scikit-image

import pafy
import easyocr
import cv2
import math
from hanspell import spell_checker
import pandas as pd
import numpy as np
import re
from skimage.metrics import structural_similarity as ssim

import matplotlib.pyplot as plt

In [None]:
## Extract video timestamps and on-screen text
def video_to_text(url, crop_rows=(65, 600), ssim_threshold=0.975):
    """Sample one frame per second from a video and OCR every distinct frame.

    Steps:
      1. Ask pafy for the best mp4 stream of ``url`` and read one frame per
         second of video time with OpenCV, keeping only the rows
         ``crop_rows[0]:crop_rows[1]`` of each frame.
      2. Turn the recorded capture positions into ``m:ss`` labels.
      3. Drop every frame whose SSIM score against the frame sampled just
         before it is at least ``ssim_threshold``.
      4. Run EasyOCR (Korean + English) on the frames that are left.

    Parameters
    ----------
    url : str
        Video URL handed to ``pafy.new``.
    crop_rows : tuple of int, optional
        ``(start, stop)`` row slice kept from every frame. Defaults to the
        previously hard-coded ``(65, 600)``.
    ssim_threshold : float, optional
        A frame scoring at or above this value against its predecessor is
        treated as a duplicate. Defaults to the previously hard-coded 0.975.

    Returns
    -------
    dict
        Maps each ``m:ss`` label to what
        ``reader.readtext(frame, detail=0, paragraph=True)`` returned for
        that frame.
    """
    video = pafy.new(url)
    video_path = video.getbest(preftype="mp4")

    row_start, row_stop = crop_rows

    frame_images = []
    vidcap = cv2.VideoCapture(video_path.url)
    # The position is recorded once before the loop and once after every
    # successful read, so `timestamps` ends up one entry longer than
    # `frame_images`; entry i is paired with frame i further down.
    # NOTE(review): that pairing assumes the position reported after reading
    # the frame at second i-1 rounds up to second i -- confirm for the
    # capture backend in use.
    timestamps = [vidcap.get(cv2.CAP_PROP_POS_MSEC)]

    count = 0

    # 1) Frame extraction: seek to each whole second and read one frame.
    try:
        while vidcap.isOpened():
            vidcap.set(cv2.CAP_PROP_POS_MSEC, count * 1000)

            # Keep the frame that read() actually returned and validated,
            # instead of a retrieve() result that was never preceded by a
            # grab() and was never checked for success.
            frame_exists, curr_frame = vidcap.read()
            if not frame_exists:
                break

            timestamps.append(vidcap.get(cv2.CAP_PROP_POS_MSEC))

            # Store the cropped frame.
            frame_images.append(curr_frame[row_start:row_stop, :])

            # Advance to the next second.
            count += 1
    finally:
        # Release the capture even if decoding or cropping raised.
        vidcap.release()

    # 2) Timestamp labels. Round up to whole seconds first and only then
    #    split into minutes/seconds, so 59.03 s becomes "1:00", not "0:60".
    times = []
    for t in timestamps:
        total_seconds = math.ceil(t / 1000)
        times.append('{}:{:02d}'.format(*divmod(total_seconds, 60)))

    # 3) Similarity between consecutive sampled frames (every frame is now
    #    included; the last one used to be left out of the comparison).
    image_index = []
    previous_gray = None
    for i, frame in enumerate(frame_images):
        current_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        if previous_gray is not None:
            # ssim: compares luminance, contrast and structure of two images
            score = ssim(previous_gray, current_gray)
            if score >= ssim_threshold:
                image_index.append(i)
        previous_gray = current_gray

    # Delete from the back so the remaining indices stay valid.
    for i in sorted(image_index, reverse=True):
        del frame_images[i]
        del times[i]

    # 4) Text extraction with EasyOCR
    reader = easyocr.Reader(['ko', 'en'], gpu=True)

    result = {}
    for label, frame in zip(times, frame_images):
        result[label] = reader.readtext(frame, detail=0, paragraph=True, batch_size=10)

    return result

In [None]:
# Crop check: display rows 65-599 of an image to verify the crop region.
# NOTE(review): nothing above this cell assigns `result`, and the later cells
# pass `result` to duplicated(), which calls result.items(). This cell
# presumably relied on an image array left over from an earlier session --
# TODO confirm, or feed it a real frame.
image_crop = result[65:600,:]
plt.imshow(image_crop)
plt.show()

In [None]:
# Keep the return value: the following cells read it as `result`.
result = video_to_text('https://www.youtube.com/watch?v=IHd-v4zYxj0')

In [None]:
## Duplicate removal, pass 1
def duplicated(result, threshold=0.5):
    """Drop entries whose words mostly reappear in the entry right after them.

    Every ``(key, value)`` pair of ``result`` becomes one row with the columns
    ``timestamps`` (the key) and ``original`` (the value). Row ``i`` is then
    compared with row ``i + 1`` in the insertion order of ``result``::

        score = len(set(words_i) & set(words_next)) / len(words_i)

    where the words come from joining the row's strings with spaces and
    splitting on whitespace. A row whose rounded score reaches ``threshold``
    is dropped, so of two near-identical neighbours the later one survives.
    A row without any words is never dropped by this pass.

    Parameters
    ----------
    result : dict
        Maps a timestamp label to an iterable of strings.
    threshold : float, optional
        Minimum score at which a row counts as a duplicate. Defaults to the
        previously hard-coded 0.5.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamps`` and ``original``, sorted by index in
        descending order, without the dropped rows. Inputs with zero or one
        entry are returned as-is (they used to raise ``UnboundLocalError``).
    """
    # Build the frame once instead of rebuilding it on every iteration.
    records = [{'timestamps': key, 'original': val} for key, val in result.items()]
    df = pd.DataFrame(records, columns=['timestamps', 'original'])
    df_sort = df.sort_index(ascending=False)

    # Positions in this list equal the index labels of `df`.
    originals = df['original'].tolist()

    row_index = []
    for i in range(len(originals) - 1):
        words_now = ' '.join(originals[i]).split()
        words_next = ' '.join(originals[i + 1]).split()

        # An empty row has no score (this was a swallowed ZeroDivisionError).
        if not words_now:
            continue

        shared = set(words_now) & set(words_next)
        score = round(len(shared) / len(words_now), 2)

        # Tune `threshold` to make the duplicate removal stricter or looser.
        if score >= threshold:
            row_index.append(i)

    return df_sort.drop(index=row_index)

In [None]:
# Build the de-duplicated table from `result` and display it.
df_dup = duplicated(result)
df_dup

In [None]:
## Spell checking
def spell_check(df_dup):
    """Add a spell-checked ``checked`` column and drop rows that end up empty.

    For every row the strings in ``original`` are concatenated (no separator),
    stripped of the special characters listed in ``strip_pattern`` and sent to
    ``spell_checker.check``. When element 0 of the returned value is truthy,
    element 2 is stored as the checked text; otherwise the stripped text
    itself is stored.
    NOTE(review): elements 0 and 2 are presumably the success flag and the
    corrected text of the hanspell result -- confirm against the library.

    The input frame is not modified; the work happens on a re-indexed copy.

    Parameters
    ----------
    df_dup : pandas.DataFrame
        Must contain an ``original`` column holding iterables of strings.

    Returns
    -------
    pandas.DataFrame
        The re-indexed rows plus ``checked``, sorted by index in descending
        order, without rows whose ``checked`` text is the empty string.
    """
    # Same character class as before, written as a raw string so the regex
    # escapes (\?, \(, \[ ...) are no longer invalid Python string escapes.
    strip_pattern = r'''[-=+,#/\?:^$.@*"※~&%ㆍ!』\‘|\(\)\[\]\<\>`'…》]'''

    # Remove special characters
    def cleanText(readData):
        return re.sub(strip_pattern, '', readData)

    # reset_index returns a new frame, so the caller's frame stays untouched.
    df = df_dup.reset_index(drop=True)

    checked = []
    for pieces in df['original']:
        cont = cleanText(''.join(pieces))

        # Spell check
        spell_ck = spell_checker.check(cont)

        if spell_ck[0]:
            checked.append(spell_ck[2])
        else:
            # Fall back to the cleaned text. The raw `original` list used to
            # be passed to the regex here, which raised TypeError.
            checked.append(cont)

    df['checked'] = checked

    df_check = df.sort_index(ascending=False)

    # Final clean-up: drop rows with no text left.
    return df_check[df_check['checked'] != '']

In [None]:
# Spell-check the de-duplicated rows and display the resulting table.
df_check = spell_check(df_dup)
df_check

In [None]:
# Write the checked table to dataedu.csv in the current working directory.
df_check.to_csv("./dataedu.csv")