In [1]:
import glob
import json
import os
import pickle
import re
import sys
import unicodedata

import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [2]:
# Put the parent directory on the import path (presumably where config.py lives).
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

In [3]:
from config import OCR_PATH

In [4]:
# Folder containing the OCR JSON files, resolved one level above the working directory.
ocr_dir = f"../{OCR_PATH}"

In [6]:
def preprocess_text(text: str) -> str:
    """Normalise a raw text snippet before it is fed to the TF-IDF vectorizer.

    Steps, in order:
      1. Compose the text to Unicode NFC so that letters written as
         "base letter + combining mark" become single precomposed code points.
      2. Lower-case it.
      3. Delete every character that is not an ASCII letter/digit, underscore,
         a whitelisted Vietnamese letter, or whitespace.
      4. Collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased, single-spaced string (possibly empty).
    """
    # The whitelist below only contains precomposed letters. Without NFC
    # composition, decomposed input (e.g. 'a' + U+0300) would keep the base
    # letter but silently lose its tone mark.
    text = unicodedata.normalize('NFC', text).lower()
    # The upper-case entries can never match because the text is already
    # lower-cased; the pattern is kept unchanged on purpose.
    reg_pattern = r'[^a-z0-9A-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễếệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸÝửữựỳỵỷỹý\s]'
    output = re.sub(reg_pattern, '', text)
    # split() with no argument already discards leading/trailing whitespace,
    # so a separate strip() is unnecessary.
    return " ".join(output.split())

In [10]:
# Build the OCR corpus: one cleaned document per entry of every JSON file.
# NOTE(review): documents are appended in os.listdir() order, which is arbitrary,
# so the row order of the resulting matrix depends on the filesystem.
context = []
for ocr_name in tqdm(os.listdir(ocr_dir)):
    ocr_path = os.path.join(ocr_dir, ocr_name)
    with open(ocr_path, 'r', encoding='utf-8') as fh:
        entries = json.load(fh)

    # Each entry is an iterable of strings; join the pieces with single spaces
    # and clean the result.
    context.extend([preprocess_text(' '.join(parts)) for parts in entries])

100%|██████████| 726/726 [00:02<00:00, 299.87it/s]


In [14]:
# TF-IDF over 1- to 3-grams. The custom token pattern accepts tokens of a single
# word character as well as longer ones.
tfidf_transform = TfidfVectorizer(
    input='content',
    ngram_range=(1, 3),
    token_pattern=r"(?u)\b[\w\d]+\b",
)

In [15]:
# Fit the vectorizer on the OCR documents and vectorise them in the same call,
# then make sure the result is stored in CSR layout.
context_matrix = tfidf_transform.fit_transform(context)
context_matrix = context_matrix.tocsr()

In [18]:
# Persist the OCR artefacts into the current working directory:
#   - the fitted vectorizer, pickled
#   - the document-term matrix, in SciPy's .npz sparse format
with open('tfidf_transform_ocr.pkl', mode='wb') as pkl_out:
    pickle.dump(tfidf_transform, pkl_out)

scipy.sparse.save_npz('sparse_context_matrix_ocr.npz', context_matrix)

In [3]:
from config import AUDIO_PATH

In [4]:
# Folder containing the audio JSON files, resolved one level above the working directory.
audio_dir = f"../{AUDIO_PATH}"

In [6]:
# Build the audio corpus from scratch (this replaces the OCR document list):
# every item of every JSON file becomes one document.
# NOTE(review): unlike the OCR documents, these items are not passed through
# preprocess_text -- confirm that this is intentional.
context = []
for audio_name in tqdm(os.listdir(audio_dir)):
    audio_path = os.path.join(audio_dir, audio_name)
    with open(audio_path, 'r', encoding='utf-8') as fh:
        items = json.load(fh)

    context.extend(items)

100%|██████████| 726/726 [00:02<00:00, 268.79it/s]


In [9]:
# Fresh vectorizer for the audio corpus, configured like the OCR one:
# 1- to 3-grams, with a token pattern that also accepts single-character tokens.
tfidf_transform = TfidfVectorizer(
    input='content',
    ngram_range=(1, 3),
    token_pattern=r"(?u)\b[\w\d]+\b",
)

In [10]:
# Fit the vectorizer on the audio documents and vectorise them in the same call,
# then make sure the result is stored in CSR layout.
context_matrix = tfidf_transform.fit_transform(context)
context_matrix = context_matrix.tocsr()

In [11]:
# Persist the audio artefacts into the current working directory:
#   - the fitted vectorizer, pickled
#   - the document-term matrix, in SciPy's .npz sparse format
with open('tfidf_transform_audio.pkl', mode='wb') as pkl_out:
    pickle.dump(tfidf_transform, pkl_out)

scipy.sparse.save_npz('sparse_context_matrix_audio.npz', context_matrix)