In [None]:
from google.colab import drive
import os

# Mount Google Drive and move into the project directory.
drive.mount('/content/drive')
# Use the absolute path under the mount point: a relative chdir only works on
# the first run of this cell, because the working directory has already moved
# the second time around.
# NOTE(review): assumes the relative path was resolved from Colab's default
# working directory (/content) - confirm.
os.chdir("/content/drive/MyDrive/project/nlp")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


음성 파일을 텍스트로 변환하기 위해 SpeechRecognition 라이브러리를 설치한다.

In [None]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 1.6 MB/s 
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.8.1


In [None]:
import speech_recognition as sr
import os
from time import time, sleep
from tqdm import tqdm

무료 STT API인 구글 Web Speech API를 사용한다. 해당 API는 긴 음성 파일에 대해서 Bad Request를 응답하기 때문에, 음성을 여러 구간으로 쪼개서 보내야 한다.

음성 파일을 쪼개는 방법은 여러 가지가 있을 수 있지만, 이번 예제는 5분 길이의 음성 파일이기 때문에 1분씩 5번 나누어 보내는 방식을 사용한다. 이를 위해 offset 인자와 duration 인자를 사용한다.

In [None]:
def download(url, fname):
    """Extract the audio of `url` with yt-dlp and save it as WAV under ./data.

    The command is passed as an argument list (no shell), so characters such
    as the parentheses in the ``%(ext)s`` output template or ``&`` in a URL
    are handed to yt-dlp verbatim instead of being interpreted by a shell.

    Args:
        url: Address of the media to download.
        fname: Output file name without extension; yt-dlp fills in the
            extension through the ``%(ext)s`` template.

    Returns:
        None. A failed download is reported with a warning rather than an
        exception, so a previously downloaded file can still be used.
    """
    import subprocess
    import warnings

    cmd = [
        "yt-dlp",
        "-P", "./data",
        "-o", f"{fname}.%(ext)s",
        "-x", "--audio-format", "wav",
        url,
    ]
    try:
        completed = subprocess.run(cmd)
    except FileNotFoundError:
        # The yt-dlp executable itself is missing from PATH.
        warnings.warn("yt-dlp executable not found; nothing was downloaded")
        return
    if completed.returncode != 0:
        warnings.warn(f"yt-dlp exited with status {completed.returncode}")


def to_text(fname, adjust_for_noise=True, offset=0, duration=60):
    """Transcribe one window of an audio file with `recognize_google`.

    Args:
        fname: Path of the audio file handed to `sr.AudioFile`.
        adjust_for_noise: When True, run `adjust_for_ambient_noise` on the
            file before recording.
        offset: Start of the window, in seconds from the beginning of the file.
        duration: Length of the window in seconds.

    Returns:
        The text returned by `recognize_google` for the recorded window.

    Raises:
        Whatever `recognize_google` raises is propagated unchanged
        (per the SpeechRecognition docs: UnknownValueError / RequestError).
    """
    r = sr.Recognizer()
    data = sr.AudioFile(fname)

    if adjust_for_noise:
        # Calibrate in a separate pass over the file. adjust_for_ambient_noise()
        # reads audio from the open stream, so calling it in the same `with`
        # block as record() would shift the recorded window later than `offset`.
        with data as source:
            r.adjust_for_ambient_noise(source)

    # Re-entering the context reopens the file, so `offset` is measured from
    # the true start of the audio.
    with data as source:
        audio = r.record(source, offset=offset, duration=duration)

    return r.recognize_google(audio)


if __name__ == "__main__":
    # Download the clip, then transcribe it twice (without and with
    # ambient-noise adjustment), writing one text file per variant.
    fname = "challenge"
    download("https://youtu.be/0AavxeP5vgE", fname)

    wav_path = f"./data/{fname}.wav"
    n_chunks = 5        # the clip is about 5 minutes long
    chunk_seconds = 60  # one request per minute of audio

    # (output suffix, adjust_for_noise, pause between requests in seconds)
    variants = (
        ("no_adjust", False, 2),
        ("adjust", True, 3),
    )
    for suffix, adjust_for_noise, pause in variants:
        chunks = []
        for i in tqdm(range(n_chunks)):
            chunks.append(
                to_text(
                    wav_path,
                    adjust_for_noise=adjust_for_noise,
                    offset=i * chunk_seconds,
                    duration=chunk_seconds,
                )
            )
            sleep(pause)  # space out the API requests

        # Each chunk is prefixed with a single space, as in the original output.
        txt = "".join(" " + chunk for chunk in chunks)
        # Open the output only once every chunk has been transcribed, so a
        # failed request does not leave an empty or truncated file behind.
        with open(f"./data/{fname}_{suffix}.txt", "w") as f:
            f.write(txt)


100%|██████████| 5/5 [01:25<00:00, 17.17s/it]
100%|██████████| 5/5 [01:38<00:00, 19.63s/it]


KeyBERT를 이용하여 키워드를 추출한다.

In [None]:
!pip install keybert

Collecting keybert
  Downloading keybert-0.5.1.tar.gz (19 kB)
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 8.5 MB/s 
Collecting rich>=10.4.0
  Downloading rich-12.4.1-py3-none-any.whl (231 kB)
[K     |████████████████████████████████| 231 kB 47.2 MB/s 
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 7.9 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 58.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 49.1 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB

In [None]:
from keybert import KeyBERT

In [None]:
kw_model = KeyBERT()

# Read both transcripts; the context managers close the files right away.
with open("./data/challenge_no_adjust.txt", "r") as f:
    txt = f.read()
with open("./data/challenge_adjust.txt", "r") as f:
    txt_adjust = f.read()

# Word counts of the two transcripts, for a quick comparison.
len(txt.split()), len(txt_adjust.split())

(713, 718)

음성 인식이 어느 정도 잘 되었음을 확인할 수 있다.

In [None]:
# Preview the first 20 words of each transcript (plain run, then noise-adjusted run).
tuple(" ".join(t.split()[:20]) + "..." for t in (txt, txt_adjust))

('from using your iPhone is a scanner to stacking your favorite home screen widgets here are 10 helpful iPhone tips...',
 'from using your iPhone is a scanner to stacking your favorite home screen widgets ur10 helpful iPhone 6 you should...')

옵션에 따라 추출되는 키워드가 많이 달라지는 것을 확인할 수 있다.

In [None]:
# Single-word keywords from the plain transcript, selected with use_maxsum
# out of 20 candidates; the top 5 are returned.
kw_model.extract_keywords(
    txt,
    keyphrase_ngram_range=(1, 1),
    use_maxsum=True,
    nr_candidates=20,
    top_n=5,
)

[('messages', 0.2403),
 ('digits', 0.2945),
 ('calculator', 0.3044),
 ('tap', 0.3344),
 ('widgets', 0.4339)]

In [None]:
# Same use_maxsum settings, applied to the noise-adjusted transcript.
kw_model.extract_keywords(
    txt_adjust,
    keyphrase_ngram_range=(1, 1),
    use_maxsum=True,
    nr_candidates=20,
    top_n=5,
)

[('messages', 0.2643),
 ('digits', 0.3394),
 ('tap', 0.344),
 ('calculator', 0.3459),
 ('widgets', 0.4407)]

In [None]:
# Single-word keywords from the plain transcript, selected with use_mmr
# and diversity=0.7.
kw_model.extract_keywords(
    txt,
    keyphrase_ngram_range=(1, 1),
    use_mmr=True,
    diversity=0.7,
)

[('iphone', 0.5145),
 ('paste', 0.2396),
 ('widgets', 0.4339),
 ('10', 0.1571),
 ('tip', 0.2082)]

In [None]:
# Same use_mmr settings, applied to the noise-adjusted transcript.
kw_model.extract_keywords(
    txt_adjust,
    keyphrase_ngram_range=(1, 1),
    use_mmr=True,
    diversity=0.7,
)

[('iphone', 0.5005),
 ('paste', 0.234),
 ('widgets', 0.4407),
 ('10', 0.1959),
 ('discovering', 0.1449)]

In [None]:
!pip install faiss-cpu
!pip install -U sentence-transformers

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 8.7 MB/s 
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.2
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 4.8 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 15.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 46.0 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37

In [None]:
import numpy as np
import os
import faiss
import time
from sentence_transformers import SentenceTransformer

In [None]:
# Reload both transcripts for the semantic-search section; the context
# managers close the files right after reading.
with open("./data/challenge_no_adjust.txt", "r") as f:
    txt = f.read()
with open("./data/challenge_adjust.txt", "r") as f:
    txt_adjust = f.read()

In [None]:
# Show only the beginning of the transcript instead of dumping all of it
# into the cell output.
txt[:300] + "..."

" from using your iPhone is a scanner to stacking your favorite home screen widgets here are 10 helpful iPhone tips you should know did you accidentally enter a wrong digit into the calculator to no problem just swipe left or right at the top of the screen to erase it bonus tip you can copy numbers by touching and holding the digits and then tapping copy starting an iOS 15 when someone sends you things like web links or Apple news articles and messages you can pin the contents so it's easier to find when you need it in a message thread touch and hold the item you want to pin and then tap pin pin content will be elevated in messages search the details view of a conversation and apps that support shared with you organize your home screen by stacking your widgets touch and hold an empty space on your home screen to edit it then drag One widget on top of another of the same size you can add up to 10 widgets when creating a stack tap done in the upper right corner when you're done and the w

In [None]:
# Cut the transcript into consecutive 10-word chunks (the last one may be
# shorter); these chunks are the documents to be indexed.
words = txt.split()
chunk_starts = range(0, len(words), 10)
data = [" ".join(words[start:start + 10]) for start in chunk_starts]
data[:5]

['from using your iPhone is a scanner to stacking your',
 'favorite home screen widgets here are 10 helpful iPhone tips',
 'you should know did you accidentally enter a wrong digit',
 'into the calculator to no problem just swipe left or',
 'right at the top of the screen to erase it']

In [None]:
# Number of word chunks held in `data`.
len(data)

72

In [None]:
# Load the sentence-embedding model and embed every chunk in `data`.
MODEL_NAME = "distilbert-base-nli-mean-tokens"
model = SentenceTransformer(MODEL_NAME)
encoded_data = model.encode(data)

# One embedding per chunk.
len(encoded_data)

72

In [None]:
# Build a flat inner-product index, wrapped in IndexIDMap so each vector is
# stored under an explicit id (its position in `data`).
# Take the dimension from the embeddings rather than hard-coding it, so the
# cell keeps working if the embedding model changes.
dim = encoded_data.shape[1]
index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))

# faiss expects 64-bit integer ids; be explicit instead of relying on the
# platform's default integer width.
ids = np.arange(len(data), dtype=np.int64)
index.add_with_ids(encoded_data, ids)
# NOTE(review): the embeddings are not L2-normalised here, so inner product is
# not cosine similarity - confirm this is the intended metric.

faiss.write_index(index, 'abc_news')

In [None]:
def search(query, k=5):
    """Return the chunks of `data` whose embeddings best match `query`.

    Args:
        query: Text to search for; it is encoded with the global `model`.
        k: Maximum number of hits to return (default 5, as before).

    Returns:
        A list of up to `k` entries of `data`, in the order the index
        returned them.
    """
    # Imported locally so the timing does not depend on whether the
    # notebook-level name `time` is currently the module or the function.
    from time import perf_counter

    started = perf_counter()
    query_vector = model.encode([query])
    _, ids = index.search(query_vector, k)
    print('total time: {}'.format(perf_counter() - started))
    # faiss fills missing hits with id -1 when fewer than k vectors match;
    # skip those, otherwise data[-1] would silently return the last chunk.
    return [data[_id] for _id in ids.tolist()[0] if _id >= 0]

In [None]:
# Run a sample query against the index and list the matching chunks.
query = "iphone widget"
results = search(query)

print('results :')
for hit in results:
    print('\t', hit)

total time: 0.1313340663909912
results :
	 the view by tapping the top edge of your iPhone
	 about how to use your iPhone subscribe to the Apple
	 from using your iPhone is a scanner to stacking your
	 a Notes app on your iPhone in a note tap
	 of what your iPhone has to offer to learn more
