In [19]:
import glob, os
import numpy as np
import pandas as pd
import re
from pyvi import ViTokenizer

In [20]:
def get_stopwords_list(stop_file_path):
    """load stop words """
    
    with open(stop_file_path, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return list(frozenset(stop_set))
    
stopwords = get_stopwords_list('vietnamese_stopwords_dash.txt')
stopwords[:10]

['hết_nói',
 'tính_phỏng',
 'sang_năm',
 'phỏng_theo',
 'phía_bạn',
 'chu_cha',
 'lúc_ấy',
 'quá',
 'căn',
 'thương_ôi']

In [21]:
lookup_dist = {
    'k': ' ngàn_đồng',
    'ko': 'không',
    'h': ' giờ',
    'j': 'gì',
    'bt': 'bình thường',
    'đc': 'được',
    'ng': 'người',
    'qá': 'quá',
    'kiu': 'kêu',
    'nv': 'nhân viên',
    'vs': 'với',
    'cf': 'cà_phê',
    'sp': 'sản phẩm'
}

def text_standardization(text):
    standard_text = []
    for word in text.split():
        standard_text.append(lookup_dist[word] if word in lookup_dist else word)
    return " ".join(standard_text)

In [22]:
def preprocessing_pipeline(text):
    # 1. lowercasing & punctuation removal
    text = re.sub('[^\w\s]', '', text.lower())
    text = re.sub('[/\r?\n|\r/]', ' ', text)
    # phát hiện từ ghép
    text = ViTokenizer.tokenize(text)
    # 2. stop words removal
    text =  " ".join(x for x in text.split() if x not in stopwords)
    # 3. text standardization
    text = text_standardization(text)
    # 4. lemmatization (TO_DO)
    
    # 5. tokenization
    # token = nltk.word_tokenize(text)
    # 6. exploring text data
    # freq_dist = nltk.FreqDist(token)
    # freq_dist.plot(30)

    return text

In [23]:
def news_preprocessing(csv_path, save_path):
    raw = pd.read_csv(csv_path)
    raw['Content'] = raw['Content'].apply(preprocessing_pipeline)
    raw['Summary'] = raw['Summary'].apply(preprocessing_pipeline)
    raw.to_csv(save_path, index=False)

In [24]:
folder = './dataset/04-08'
save_folder = './clean_dataset/04-08'
for path in glob.glob(os.path.join(folder, "*.csv")):
    save_path = os.path.join(save_folder, path.split('\\')[-1])
    print(save_path)
    news_preprocessing(path, save_path)

./clean_dataset/04-08\tuoitre_congnghe.csv
./clean_dataset/04-08\tuoitre_dulich.csv
./clean_dataset/04-08\tuoitre_giaitri.csv
./clean_dataset/04-08\tuoitre_giaoduc.csv
./clean_dataset/04-08\tuoitre_kinhdoanh.csv
./clean_dataset/04-08\tuoitre_thethao.csv
./clean_dataset/04-08\tuoitre_vanhoa.csv
