In [65]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

### Load data

In [66]:
# Prefix prepended to every file name by load_data6359. It is concatenated
# as-is (no separator is inserted), hence the trailing '/'.
path = './data-20220511T090048Z-001/data/'
# Number of documents to read: files news00001.txt through news06359.txt.
num_files = 6359

def load_data6359(load_path, n_files=None, encoding='utf-8'):
    """
    Read the numbered news files into a list of strings.

    Files are expected to be named ``news00001.txt``, ``news00002.txt``, ...
    (1-based index, zero-padded to 5 digits).

    Input:
        load_path: prefix prepended verbatim to each file name; it must
            already end with a path separator.
        n_files: how many files to read. When omitted, the module-level
            ``num_files`` is used, which matches the original behaviour.
        encoding: text encoding of the files. It is set explicitly so the
            result does not depend on the platform's default locale.
    Output: A list of strings, one per file, in file-number order.
    Raises: FileNotFoundError (or another OSError) if a file is missing or
        unreadable; UnicodeDecodeError if a file is not valid ``encoding``.
    """
    if n_files is None:
        # Fall back to the notebook-level setting for existing callers.
        n_files = num_files

    corpus = []
    for i in range(1, n_files + 1):
        file_name = load_path + 'news' + str(i).zfill(5) + '.txt'
        with open(file_name, encoding=encoding) as f:
            corpus.append(f.read())
    return corpus

def tf_idf(corpus):
    """
    Fit a default TfidfVectorizer on `corpus` and return the transformed data.

    Input: An iterable of document strings
    Output: The result of TfidfVectorizer().fit_transform(corpus).
            The fitted vectorizer itself is not returned.
    """
    return TfidfVectorizer().fit_transform(corpus)

def vec(load_path):
    """
    Load the corpus found under `load_path` and return its TF-IDF features.

    Input: Load path of data input (passed to load_data6359)
    Output: Whatever tf_idf returns for the loaded corpus
    """
    return tf_idf(load_data6359(load_path))

# Same two steps as vec(path), but the raw texts are kept under `corpus`.
corpus = load_data6359(path)
# TF-IDF features returned by tf_idf for the loaded corpus.
feature_vector = tf_idf(corpus)

### Preprocessing Functions
Tokenizing, removing stop words and stemming

In [68]:
import nltk
from nltk.corpus import stopwords

def get_tokenized_list(doc_text):
    """
    Split a text into tokens with nltk.word_tokenize and return them as a list.
    """
    return nltk.word_tokenize(doc_text)

def word_stemmer(token_list):
    """
    Return a new list holding the Porter stem of every token, in order.

    NOTE(review): the Porter algorithm targets English; confirm it is the
    intended stemmer for this corpus.
    """
    porter = nltk.stem.PorterStemmer()
    return [porter.stem(token) for token in token_list]

def get_stopwords_vietnamesse(path, encoding='utf-8'):
    """
    Read a stop-word file and return its lines as a list of strings.

    Input:
        path: location of a text file with one stop word per line.
        encoding: text encoding of the file. It is set explicitly so that
            Vietnamese characters decode the same way on every platform
            instead of depending on the locale default.
    Output: A list with one entry per line, line terminators removed.
    """
    with open(path, encoding=encoding) as f:
        return f.read().splitlines()

def remove_stopwords(doc_text, stopwords_vn=None):
    """
    Return a list of the words in `doc_text` that are not stop words.

    Input:
        doc_text: iterable of tokens (e.g. the output of get_tokenized_list).
        stopwords_vn: optional iterable of stop words. Pass a pre-loaded
            list or set when filtering many documents, so the stop-word
            file is not re-read on every call. When omitted, the words are
            loaded from 'stopword_vn.txt' as before.
    Output: A new list with the surviving tokens in their original order.
    """
    if stopwords_vn is None:
        stopwords_vn = get_stopwords_vietnamesse('stopword_vn.txt')
    # A set gives O(1) membership tests instead of scanning a list per token.
    stopword_set = set(stopwords_vn)
    return [word for word in doc_text if word not in stopword_set]

