In [5]:
import gdown

In [8]:
# !gdown "https://drive.google.com/uc?id=1DiAXSK1DQf2beiN0-aJ3nDPK_f_5-S4C"
# !gdown "https://drive.google.com/uc?id=1GRDrZjjOTfO5WohDX6Q83PgI5VMOyNlm"
# !gdown "https://drive.google.com/uc?id=1FJhT_U4FHZaz5bosEAlfy-77dYn0qir5"
# !gdown "https://drive.google.com/uc?id=1qEsOFMWxFZ4mSvEul2RuaY8JRq075bjZ"
# !gdown "https://drive.google.com/uc?id=1Ko5WdTvaxME7XP2k6pNjttoc7dgL4I-x"

# !gdown "https://drive.google.com/uc?id=1GRDrZjjOTfO5WohDX6Q83PgI5VMOyNlm"

^C


In [77]:
import re
import torch
from string import printable, punctuation
from tqdm import tqdm
import warnings

In [78]:
# !pip install!!!
class Normalizer:
    def __init__(self,
                 device='cpu',
                 jit_model='jit_s2s.pt'):
        super(Normalizer, self).__init__()

        self.device = torch.device(device)

        self.init_vocabs()

        self.model = torch.jit.load(jit_model, map_location=device)
        self.model.eval()
        self.max_len = 150

    def init_vocabs(self):
        # Initializes source and target vocabularies

        # vocabs
        rus_letters = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'
        spec_symbols = '¼³№¾⅞½⅔⅓⅛⅜²'
        # numbers + eng + punctuation + space + rus
        self.src_vocab = {token: i + 5 for i, token in enumerate(printable[:-5] + rus_letters + '«»—' + spec_symbols)}
        # punctuation + space + rus
        self.tgt_vocab = {token: i + 5 for i, token in enumerate(punctuation + rus_letters + ' ' + '«»—')}

        unk = '#UNK#'
        pad = '#PAD#'
        sos = '#SOS#'
        eos = '#EOS#'
        tfo = '#TFO#'
        for i, token in enumerate([unk, pad, sos, eos, tfo]):
            self.src_vocab[token] = i
            self.tgt_vocab[token] = i

        for i, token_name in enumerate(['unk', 'pad', 'sos', 'eos', 'tfo']):
            setattr(self, '{}_index'.format(token_name), i)

        inv_src_vocab = {v: k for k, v in self.src_vocab.items()}
        self.src2tgt = {src_i: self.tgt_vocab.get(src_symb, -1) for src_i, src_symb in inv_src_vocab.items()}

    def keep_unknown(self, string):
        reg = re.compile(r'[^{}]+'.format(''.join(self.src_vocab.keys())))
        unk_list = re.findall(reg, string)

        unk_ids = [range(m.start() + 1, m.end()) for m in re.finditer(reg, string) if m.end() - m.start() > 1]
        flat_unk_ids = [i for sublist in unk_ids for i in sublist]

        upd_string = ''.join([s for i, s in enumerate(string) if i not in flat_unk_ids])
        return upd_string, unk_list

    def _norm_string(self, string):
        # Normalizes chunk

        if len(string) == 0:
            return string
        string, unk_list = self.keep_unknown(string)

        token_src_list = [self.src_vocab.get(s, self.unk_index) for s in list(string)]
        src = token_src_list + [self.eos_index] + [self.pad_index]

        src2tgt = [self.src2tgt[s] for s in src]
        src2tgt = torch.LongTensor(src2tgt).to(self.device)

        src = torch.LongTensor(src).unsqueeze(0).to(self.device)
        with torch.no_grad():
            out = self.model(src, src2tgt)
        pred_words = self.decode_words(out, unk_list)
        if len(pred_words) > 199:
            warnings.warn("Sentence {} is too long".format(string), Warning)
        return pred_words

    def norm_text(self, text):
        # Normalizes text

        # Splits sentences to small chunks with weighted length <= max_len:
        # * weighted length - estimated length of normalized sentence
        #
        # 1. Full text is splitted by "ending" symbols (\n\t?!.) to sentences;
        # 2. Long sentences additionally splitted to chunks: by spaces or just dividing too long words

        splitters = '\n\t?!'
        parts = [p for p in re.split(r'({})'.format('|\\'.join(splitters)), text) if p != '']
        norm_parts = []
        for part in tqdm(parts):
            if part in splitters:
                norm_parts.append(part)
            else:
                weighted_string = [7 if symb.isdigit() else 1 for symb in part]
                if sum(weighted_string) <= self.max_len:
                    norm_parts.append(self._norm_string(part))
                else:
                    spaces = [m.start() for m in re.finditer(' ', part)]
                    start_point = 0
                    end_point = 0
                    curr_point = 0

                    while start_point < len(part):
                        if curr_point in spaces:
                            if sum(weighted_string[start_point:curr_point]) < self.max_len:
                                end_point = curr_point + 1
                            else:
                                norm_parts.append(self._norm_string(part[start_point:end_point]))
                                start_point = end_point

                        elif sum(weighted_string[end_point:curr_point]) >= self.max_len:
                            if end_point > start_point:
                                norm_parts.append(self._norm_string(part[start_point:end_point]))
                                start_point = end_point
                            end_point = curr_point - 1
                            norm_parts.append(self._norm_string(part[start_point:end_point]))
                            start_point = end_point
                        elif curr_point == len(part):
                            norm_parts.append(self._norm_string(part[start_point:]))
                            start_point = len(part)

                        curr_point += 1
        return ''.join(norm_parts)

    def decode_words(self, pred, unk_list=None):
        if unk_list is None:
            unk_list = []
        pred = pred.cpu().numpy()
        pred_words = "".join(self.lookup_words(x=pred,
                                               vocab={i: w for w, i in self.tgt_vocab.items()},
                                               unk_list=unk_list))
        return pred_words

    def lookup_words(self, x, vocab, unk_list=None):
        if unk_list is None:
            unk_list = []
        result = []
        for i in x:
            if i == self.unk_index:
                if len(unk_list) > 0:
                    result.append(unk_list.pop(0))
                else:
                    continue
            else:
                result.append(vocab[i])
        return [str(t) for t in result]

In [79]:
# from normalizer import Normalizer
# https://github.com/snakers4/russian_stt_text_normalization
# https://towardsdatascience.com/russian-text-normalization-for-stt-and-tts-a6d8f03aaeb9

text = 'я уже 2012 раз видел белок'

norm = Normalizer()
result = norm.norm_text(text)
print(result)

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.44it/s]

я уже две тысячи двенадцатый раз видел белок





In [87]:
import csv, os
import IPython.display as ipd
import pandas as pd


In [94]:
paths = []

for i in os.listdir():
    os.chdir(i)
    paths.append(i+'/transcription.csv')
#     print(i)
    os.chdir("..")

In [103]:
# paths = []
paths = os.listdir()
paths

['Radio1_50h', 'Radio2_25h', 'Radio3_25h', 'Radio5_25h']

In [102]:
def symboltt():
# path_trans = "Radio2_25h/Radio2_25h"
    os.chdir("dataset")
    paths = os.listdir()
    name = "transcription.csv"

    for i in range(len(paths)):
        metadata = pd.read_csv(paths[i]+'/'+name)
        data_to_text = paths[i]+'/'+"data.csv"

        with open(data_to_text, 'w') as f:
            for index, row in metadata.iterrows():
                f.write(str(row['duration']) + ',' + str(row['filepath']) + ',' + str(row['text']) + '\n')
        print('end')
    os.chdir("..")

In [112]:

data = 'data.csv'
c = 0 
for i in range(len(paths)):
    with open(os.listdir()[i]+ '/' + data, 'r') as f:
        for row in f:
            c += 1
            temp = row[0:-1].split(',')
#             print(temp)
            dur =  temp[0]
            filename = temp[1]
            trans = temp[2]
#             if c == 1395:
#                 break
#             print(filename)
            nums = '0123456789'
            for i in nums:
                if i in trans:
                    print(trans)
#     ipd.Audio(filename)


In [25]:
import librosa 

In [113]:
def get_mfccs(path):
    y, sr = librosa.load(path)
    mfcc = librosa.feature.mfcc(y, sr, n_mfcc=40)
    return mfcc

# def extense(features, frames_max):
#     extensed = []
#     for i in range(len(features)):
#         if (len(features[0]) < frames_max):
#             total = frames_max - len(features[0])
#             left = total//2
#             right = total-left
#             features[i] = np.pad(features[i], pad_width=((0,0), (left, right)), mode='constant')
#         extensed.append(features[i])
#     return extensed

In [114]:
mel = get_mfccs(filename)


In [115]:
librosa.display.specshow(mel)

AttributeError: module 'librosa' has no attribute 'display'

In [55]:
#import librosa.display
from datetime import datetime
datetime.now()

datetime.datetime(2021, 10, 27, 17, 59, 37, 728338)

In [65]:
from multiprocessing import Process
def b():
    s = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
    for i in range(len(s)):
        print(datetime.now())
        print(i)

def c():
    s = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя"
    for i in range(len(s)):
        print(datetime.now())
        print(s[i])


thread1 = Process(target = b)
thread2 = Process(target = c)
thread1.start()
thread2.start()
thread1.join()
thread2.join()
        

In [58]:
from threading import Thread

def prescript(thefile, num):
    with open(thefile, 'w') as f:
        for i in range(num):
            if num > 500:
                f.write('МногоБукв\n')
            else:
                f.write('МалоБукв\n')

thread1 = Thread(target=prescript, args=('f1.txt', 200,))
thread2 = Thread(target=prescript, args=('f2.txt', 1000,))

thread1.start()
thread2.start()
thread1.join()
thread2.join()

In [68]:
from multiprocessing import Process
from time import sleep

class A:
    def call(self, count=10, sleep_time=0.5):
        for i in range(count):
            print('Working class A, i=%s' % i)
            sleep(sleep_time)


class B:
    def call(self, count=10, sleep_time=0.5):
        for i in range(count):
            print('Working class B, i=%s' % i)
            sleep(sleep_time)


if __name__ == '__main__':
    a = A().call()
    b = B().call()

    p1 = Process(target=a, kwargs={'sleep_time': 0.7})
    p2 = Process(target=b, args=(12,))
    p1.start()
    p2.start()

    p1.join()
    p2.join()

Working class A, i=0
Working class A, i=1
Working class A, i=2
Working class A, i=3
Working class A, i=4
Working class A, i=5
Working class A, i=6
Working class A, i=7
Working class A, i=8
Working class A, i=9
Working class B, i=0
Working class B, i=1
Working class B, i=2
Working class B, i=3
Working class B, i=4
Working class B, i=5
Working class B, i=6
Working class B, i=7
Working class B, i=8
Working class B, i=9


In [72]:
import threading
import os
from time import sleep

class A:
    def __call__(self, count=10, sleep_time=0.5):
        for i in range(count):
            print('Working class A, i=%s' % i)
            sleep(sleep_time)


class B:
    def __call__(self, count=10, sleep_time=0.2):
        for i in range(count):
            x = 10
#             for j in range(10):
                       # какая-то долгая операция
            print('Working class B, i=%s' % i)
if __name__ == '__main__':
    a = A()
    b = B()

    t1 = threading.Thread(target=a, kwargs={'sleep_time': 0.1})
    t2 = threading.Thread(target=b, args=(12,))
    t1.start()
    t2.start()

    t1.join()
    t2.join()

Working class A, i=0
Working class B, i=0
Working class B, i=1
Working class B, i=2
Working class B, i=3
Working class B, i=4
Working class B, i=5
Working class B, i=6
Working class B, i=7
Working class B, i=8
Working class B, i=9
Working class B, i=10
Working class B, i=11
Working class A, i=1
Working class A, i=2
Working class A, i=3
Working class A, i=4
Working class A, i=5
Working class A, i=6
Working class A, i=7
Working class A, i=8
Working class A, i=9


In [76]:
!pip3 install pyaudio

ERROR: Could not find a version that satisfies the requirement pyaudio (from versions: none)
ERROR: No matching distribution found for pyaudio
