In [14]:
import os

# Select the GPU *before* torch is imported, so the CUDA runtime is guaranteed
# to see these settings whenever it gets initialised.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # enumerate devices by PCI bus id
os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # expose only device index 1 to this process

import torch

In [15]:
# Load the saved checkpoint so its stored entries can be inspected below.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
t = torch.load('models/deepspeech_horovod_final.pth')

In [16]:
# Print every checkpoint entry except the ones named in `skipped_keys`.
skipped_keys = ('state_dict', 'labels', 'optim_dict')
for key in t:
    if key not in skipped_keys:
        print(key, ':', t[key])

version : 0.0.1
hidden_size : 800
hidden_layers : 5
rnn_type : gru
audio_conf : {'sample_rate': 16000, 'window_size': 0.02, 'window_stride': 0.01, 'window': 'hamming', 'noise_dir': None, 'noise_prob': 0.4, 'noise_levels': (0.0, 0.5)}
bidirectional : True
epoch : 18
loss_results : tensor([86.8623, 39.5394, 32.1159,  ...,  0.0000,  0.0000,  0.0000])
cer_results : tensor([38.0488, 28.6474, 25.3659,  ...,  0.0000,  0.0000,  0.0000])
wer_results : tensor([73.4151, 57.0552, 52.1472,  ...,  0.0000,  0.0000,  0.0000])


In [25]:
# First ten values stored under the checkpoint's 'wer_results' entry.
t['wer_results'][:10]

tensor([73.4151, 57.0552, 52.1472, 47.7505, 40.7674, 36.9872, 34.9124, 33.3775,
        31.3122, 30.9237])

In [27]:
# Average of the first ten 'wer_results' values (sum of the slice divided by 10).
t['wer_results'][:10].sum()/10

tensor(43.8648)

In [5]:
# Read the test manifest into `test`, one CSV row (string) per list element.
# The path gets its own name so `test` only ever refers to the list of rows.
test_manifest_path = 'data/test_kspon.csv'
with open(test_manifest_path, 'r') as f:
    # Skip blank rows (for instance the empty string a trailing newline leaves
    # behind) so every element can be split into columns safely.
    test = [row for row in f.read().split('\n') if row.strip()]

In [11]:
# Scan the test manifest for the longest transcript.
# Results:
#   max_l - length in characters of the longest transcript found
#   max_t - path of that transcript file
#   max_d - its full text
# The loop uses its own variable names so it does not overwrite the
# notebook-level checkpoint variable `t`.
excluded_transcripts = {
    '/data1/seunghwan/Speech_Recognition/data/wav_KsponSpeech/KsponSpeech_343170.txt',
}

max_l = 0
max_t = None
max_d = None
for manifest_row in test:
    columns = manifest_row.split(',')
    if len(columns) < 2:
        # Blank or malformed row (e.g. produced by a trailing newline): nothing to open.
        continue

    # The second CSV column is the transcript file path.
    transcript_path = columns[1]
    if transcript_path in excluded_transcripts:
        continue

    with open(transcript_path, 'r') as f:
        transcript = f.read()

    if len(transcript) > max_l:
        max_l = len(transcript)
        max_t = transcript_path
        max_d = transcript

In [12]:
# Text of the longest transcript found by the scan above.
max_d

'그니까 그때 그때는 나름 그니까 원래 존재는 알고 있었다는 그거잖아 존재는 알고 있었던 사이잖아 그래서 약간 걔가 막 아 그래 막 계속 그냥 근데 그때도 해서 그냥 계속 그냥 할 얘기가 그냥 공부 얘기 밖에 없었어 그래서 막 걔도 막 아 막 아 힘 내고 막 힘 내고 막 이래서 그냥 맨날 맨날 똑같은 얘기 했잖아 아 뭐 끝나면 뭐 뭐 하자 같이 이런 거 있잖아 그냥 끝나면 현우랑 출사 셋 셋이서 술 한 번 먹자 이랬고 아 좋지 좋지 이랬다고 그러고 끝 끝났어'

In [13]:
# Path of the file holding the longest transcript.
max_t

'/data1/seunghwan/Speech_Recognition/data/wav_KsponSpeech/KsponSpeech_142329.txt'

# Stream

In [2]:
from streamlink.session import Streamlink
from threading import Thread
import numpy as np
import cv2
import time
import datetime
import noisereduce as nr
import librosa
import urllib
import re

# Hide the warnings raised while reading audio
import warnings
warnings.filterwarnings('ignore')

# Requirements
# pip install noisereduce
# pip install librosa

class Stream:
    """Keep pulling the latest segment of a live stream and expose its frame and audio.

    When the stream can be resolved, ``__init__`` starts a daemon thread running
    :meth:`fetch`, which repeatedly downloads a ``.ts`` segment to ``self.ts_name``
    and refreshes these attributes:

    * ``frame``      - first video frame of the segment, or ``None`` if it could not be read
    * ``audio``      - audio of the segment, loaded at ``sr``
    * ``reduced``    - ``audio`` after noise reduction against ``lol_noise.wav``
    * ``loud``       - result of :meth:`is_loud` for the segment
    * ``last_error`` - most recent exception swallowed by the worker thread (``None`` if none)

    WAV snapshots are written as ``lol/<i>_temp.wav``, ``lol/<i>_reduced.wav`` and
    ``lol/<i>_loud.wav``; the ``lol/`` directory is assumed to exist already.
    """

    # Seconds to wait before retrying after a failed playlist/segment download.
    _RETRY_DELAY = 1.0

    def __init__(self, url, v_size=5, sr=16000):
        """Resolve the stream behind `url` and start the background fetch thread.

        Args:
            url: page URL of the stream.
            v_size: value stored as ``f_size`` (size of the video frame list).
            sr: sampling rate used for every audio load and save.
        """
        self.stream = self.fetch_stream(url)
        self.url = url
        self.sr = sr  # sampling rate

        # Give every URL its own .ts file name so streams don't overwrite each other.
        # NOTE(review): url[22:] assumes the 22-character prefix 'https://www.twitch.tv/'.
        self.ts_name = '_'.join(url[22:].split('/')) + '.ts'

        self.audio = None
        self.reduced = None  # defined up front so readers never hit AttributeError
        self.frame = None
        self.loud = False
        self.last_error = None
        self.f_list = []  # video frame list
        self.f_size = v_size  # video frame list size

        # Noise profile used by the noise-reduction step.
        self.lol_noise, _ = librosa.load('lol_noise.wav', sr=self.sr)
        self.i = 0  # index used in the names of the saved WAV files

        # self.stream is None when the stream could not be resolved (wrong URL, ...);
        # in that case no worker thread is started.
        if self.stream is not None:
            self.v_thread = Thread(target=self.fetch, args=())
            self.v_thread.daemon = True
            self.v_thread.start()

    def fetch_stream(self, url, **params):
        """Return the 'best' quality stream for `url`, or None if it cannot be resolved.

        `params` are forwarded to ``Streamlink.streams``.
        """
        session = Streamlink()
        session.set_plugin_option("twitch", 'client_id', 'kimne78kx3ncx6brgo4mv6wki5h1ko')
        try:
            # Resolving happens inside the try block: a wrong address can already
            # fail here, not only at the 'best' lookup.
            streams = session.streams(url, **params)
            return streams['best']
        except Exception:
            return None

    def fetch(self):
        """Worker loop: download a segment, then refresh frame, audio and loudness."""
        while True:
            if not self._download_segment():
                # Nothing new on disk: wait instead of busy-spinning and
                # re-processing the previous segment.
                time.sleep(self._RETRY_DELAY)
                continue

            self._update_frame()
            self._update_audio()
            self.i += 1

    def _download_segment(self):
        """Download the second-to-last playlist segment to ``self.ts_name``; True on success."""
        # `import urllib` alone does not guarantee the `request` submodule is loaded.
        import urllib.request

        try:
            with urllib.request.urlopen(self.stream.url) as response:
                playlist = response.read().decode('utf-8')
            ts_links = re.findall(r'https://.*\.ts', playlist)

            # Second-to-last entry (presumably because the newest one may still be
            # incomplete); raises IndexError when fewer than two segments are listed.
            segment_url = ts_links[-2]

            with urllib.request.urlopen(segment_url) as response:
                data = response.read()
            # Write only after the download succeeded, so a failed request does
            # not leave a truncated file behind.
            with open(self.ts_name, 'wb') as out_file:
                out_file.write(data)
        except Exception as exc:
            self.last_error = exc
            return False
        return True

    def _update_frame(self):
        """Store the first video frame of the downloaded segment in ``self.frame``."""
        capture = None
        try:
            capture = cv2.VideoCapture(self.ts_name)
            success, frame = capture.read()
            self.frame = frame if success else None
        except Exception as exc:
            self.last_error = exc
        finally:
            # Release the capture every iteration; otherwise handles pile up.
            if capture is not None:
                capture.release()

    def _update_audio(self):
        """Load the segment audio, write the WAV snapshots and update ``self.loud``."""
        try:
            self.audio, _ = librosa.load(self.ts_name, sr=self.sr)

            if self.audio is not None and len(self.audio) > 0:
                self.save_audio('lol/%d_temp.wav' % self.i)

                # noise reduce
                self.reduced = nr.reduce_noise(audio_clip=self.audio, noise_clip=self.lol_noise, verbose=False)
                self.save_audio('lol/%d_reduced.wav' % self.i, audio=self.reduced)

                self.is_loud()
                if self.loud:
                    self.save_audio('lol/%d_loud.wav' % self.i)
            else:
                # No audio in this segment: don't keep the previous segment's flag.
                self.loud = False
        except Exception as exc:
            self.last_error = exc

    # Save audio
    def save_audio(self, name, audio=None):
        """Write `audio` (default: ``self.audio``) to the WAV file `name` at ``self.sr``."""
        if audio is None:
            audio = self.audio
        # NOTE(review): librosa.output was removed in librosa 0.8, so this call
        # needs an older librosa release.
        librosa.output.write_wav(name, audio, sr=self.sr, norm=False)

    # Save video frame
    def save_frame(self, name):
        """Write the current frame (``self.frame``) to the image file `name`."""
        cv2.imwrite(name, self.frame)

    # Loudness check
    def is_loud(self):
        """Update and return ``self.loud`` for the current audio.

        The audio counts as loud when at least `min_num` samples exceed
        `minimum` dB. Missing or empty audio is never loud.
        """
        minimum = 90   # dB threshold a sample must exceed
        min_num = 150  # number of such samples required

        if self.audio is None or len(self.audio) == 0:
            self.loud = False
            return self.loud

        # ref=2e-5 is presumably the 20 micropascal reference pressure - TODO confirm.
        dbs = librosa.core.amplitude_to_db(self.audio, ref=2e-5)
        loud_samples = (dbs > minimum).sum()

        self.loud = bool(loud_samples >= min_num)
        return self.loud

In [57]:
# Resample the noise clip to 16 kHz from the rate it is stored at.
# The clip is loaded here rather than resampling an existing `lol_noise` in place,
# so the cell works on a fresh kernel and gives the same result when re-run.
lol_noise_native, lol_noise_native_sr = librosa.load('lol_noise.wav', sr=None)
lol_noise = librosa.core.resample(lol_noise_native, orig_sr=lol_noise_native_sr, target_sr=16000)

In [40]:
# Load the noise clip at 16 kHz; `sr` receives the sampling rate returned by librosa.load.
lol_noise, sr = librosa.load('lol_noise.wav',sr=16000)

In [13]:
# Number of samples in the noise clip.
len(lol_noise)

48000

In [14]:
import IPython.display as ipd
ipd.Audio(lol_noise,rate=sr) # play the in-memory noise clip at sampling rate `sr`

In [3]:
# Create a Stream for this channel; its constructor starts the background
# fetch thread when the stream can be resolved.
url = 'https://www.twitch.tv/lol_ambition'
s = Stream(url)

In [205]:
# NOTE(review): this loop breaks on its first iteration and does no work -
# it looks like a leftover scratch cell; consider removing it.
while True:
    try:
        
        break
    except (EOFError):
        None    

In [4]:
import IPython.display as ipd

In [27]:
# Index of the saved clip ('lol/<i>_*.wav') to inspect next.
i = 0

In [37]:
# Load the noise-reduced clip number `i`, then advance `i`.
# NOTE(review): re-running this cell moves on to the next clip each time.
audio, _ = librosa.load('lol/%d_reduced.wav'%i,sr=16000)
i += 1

In [38]:
ipd.Audio(audio,rate=16000) # play the clip currently held in `audio`

In [30]:
# Show the current clip index.
print(i)

1


In [31]:
# Step `i` back by one and listen to the unprocessed ('_temp') clip with that index.
# NOTE(review): re-running this cell decrements `i` again.
i -= 1
audio, _ = librosa.load('lol/%d_temp.wav'%i,sr=16000)
ipd.Audio(audio,rate=16000) # play the clip just loaded

In [39]:
# Load a separate recording at 16 kHz and listen to it.
audio, _ = librosa.load('AT-cm_648484034.wav',sr=16000)
ipd.Audio(audio,rate=16000) # play the clip just loaded

In [41]:
# Noise-reduce `audio`, using `lol_noise` as the noise clip.
reduced = nr.reduce_noise(audio_clip=audio, noise_clip=lol_noise, verbose=False)

In [44]:
# NOTE(review): 'AT_reduced.wav' is written by the librosa.output.write_wav cell
# further down - that cell has to run before this one.
ipd.Audio('AT_reduced.wav',rate=16000) # load a local WAV file

In [43]:
# Save the noise-reduced clip `reduced` as a 16 kHz WAV, without normalisation.
librosa.output.write_wav('AT_reduced.wav', reduced, sr=16000, norm=False)

In [18]:
# Move on to the next clip index.
i += 1

# STT engine

In [20]:
import torch
from opts import add_decoder_args, add_inference_args
from utils import load_model

from data.data_loader import SpectrogramParser

class STT:
    """Speech-to-text engine: holds the model loaded from a checkpoint file."""

    def __init__(self, device='cuda', model_path='models/deepspeech_horovod_final.pth'):
        """Load the checkpoint at `model_path` through `load_model`.

        Args:
            device: device argument handed to `load_model`.
            model_path: checkpoint file to load.
        """
        # Half precision is explicitly switched off.
        loaded_model = load_model(device, model_path, use_half=False)
        self.model = loaded_model

In [21]:
# Build the engine with its default device and checkpoint path.
stt = STT()