In [29]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

### Load data

In [30]:
# Location of the raw news articles. File names are appended to this string
# directly, so it must end with a path separator.
path = './data-20220511T090048Z-001/data/'
# Number of article files to read; files are numbered from 1 up to this value.
num_files = 6359

def load_data6359(load_path, count=None):
    """
    Read the numbered news files found under ``load_path`` into a list.

    Files are expected to be named ``news00001.txt``, ``news00002.txt``, ...
    (1-based, zero-padded to five digits) and are read as UTF-8.

    Input:
        load_path: Path prefix of the data files. The file name is appended
            to it directly, so a directory must end with a path separator.
        count: Number of files to read. Defaults to the module-level
            ``num_files`` when omitted.
    Output: A list of strings, one per file, in file-number order.
    """
    if count is None:
        # Fall back to the notebook-wide setting so existing calls keep working.
        count = num_files
    corpus = []
    for i in range(1, count + 1):
        file_name = load_path + 'news' + str(i).zfill(5) + '.txt'
        with open(file_name, encoding="utf8") as f:
            corpus.append(f.read())
    return corpus


def tf_idf(corpus):
    """
    Fit a fresh TfidfVectorizer on ``corpus`` and return the transformed documents.

    The vectorizer itself is discarded; only the result of ``fit_transform``
    is handed back to the caller.
    """
    return TfidfVectorizer().fit_transform(corpus)

def cosine_similarity(vec1 , vec2):
    """
    Cosine similarity: dot product of the inputs divided by the product of
    their norms.

    If either input has zero norm the ratio is undefined (0/0). In that case
    zero is returned instead of NaN, so an all-zero vector simply counts as
    "not similar" and no divide-by-zero warning is raised.
    """
    dot = np.dot(vec1, vec2)
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        # Multiply rather than return a bare 0.0 so the result keeps whatever
        # shape np.dot produced for these inputs.
        return dot * 0.0
    return dot / denom
    
# Raw article texts in file-number order: corpus[i] holds the content of file number i + 1.
corpus = load_data6359(path)


### Preprocessing Functions
Tokenizing, stemming, and removing stop words (applied in that order by `clean_word`).

In [31]:
import nltk

def get_tokenized_list(doc_text):
    """
    Split ``doc_text`` into tokens with ``nltk.word_tokenize`` and return
    the resulting list.
    """
    return nltk.word_tokenize(doc_text)

def word_stemmer(token_list):
    """
    Run NLTK's Porter stemmer over every token and return the stems as a
    list, in the same order as the input.
    """
    stemmer = nltk.stem.PorterStemmer()
    return [stemmer.stem(token) for token in token_list]

def get_stopwords_vietnamesse(path):
    """
    Read the UTF-8 stopword file at ``path`` and return its lines, without
    line endings, as a list of strings.
    """
    with open(path, encoding="utf8") as stopword_file:
        raw_text = stopword_file.read()
    return raw_text.splitlines()

def remove_stopwords(doc_text, stopwords=None):
    """
    Return the words of ``doc_text`` that are not stopwords, in original order.

    Input:
        doc_text: Iterable of words (tokens) to filter.
        stopwords: Optional iterable of stopwords to drop. When omitted, the
            list is read from 'stopword_vn.txt' via get_stopwords_vietnamesse.
            Passing it in lets a caller load the file once and reuse it.
    Output: A list of the words that are not in the stopword collection.
    """
    if stopwords is None:
        stopwords = get_stopwords_vietnamesse('stopword_vn.txt')
    # A set gives constant-time membership tests instead of scanning the
    # whole stopword list once per word.
    stopword_set = set(stopwords)
    return [word for word in doc_text if word not in stopword_set]

def clean_word(text):
    """
    Tokenize ``text``, stem the tokens, drop the stopwords, and return the
    surviving words joined by single spaces.
    """
    kept_words = remove_stopwords(word_stemmer(get_tokenized_list(text)))
    return ' '.join(kept_words)


In [32]:
"""
Clean and save doc data

"""
# One-off preprocessing step, kept commented out: it runs clean_word over every
# raw document in `corpus` and writes each result to
# ./clean_doc_data/newsNNNNN.txt (same numbering as the input files).
# NOTE(review): presumably disabled because the cleaned files already exist on
# disk - confirm. On a fresh checkout it has to be run once before anything
# reads from ./clean_doc_data/.
# Files are written inside `with` and as UTF-8 so every handle is closed and
# the encoding matches what load_data6359 reads back.
#
# for doc_number, text in enumerate(corpus, start=1):
#     cleaned = clean_word(text)
#     out_path = "./clean_doc_data/news" + str(doc_number).zfill(5) + ".txt"
#     with open(out_path, "w", encoding="utf8") as file_out:
#         file_out.write(cleaned)
    


'\nClean and save doc data\n\n'

In [33]:
# Load the pre-cleaned documents (same file naming and count as the raw data).
data = load_data6359("./clean_doc_data/")

# Fit TF-IDF on the cleaned documents. The fitted vectorizer is kept at module
# level so other text can later be transformed with the same vocabulary.
vectorizer = TfidfVectorizer()
# Dense array with one row per document.
# NOTE(review): .toarray() materialises the full matrix; this may be
# memory-heavy for a large vocabulary.
docs_vector = vectorizer.fit_transform(data).toarray()




In [36]:
# Load and clean query
with open('query.txt', encoding="utf8") as f:
    query = f.read()
cleaned_query = [clean_word(query)]
query_vector = vectorizer.transform(cleaned_query).toarray()

# Calculate cosine similarity between query and documents

# query_vector has a single row (one query). Flatten it so each similarity is
# a plain scalar instead of a 1-element array; this avoids relying on implicit
# array-to-float conversion when the structured array is built below.
query_flat = query_vector.ravel()

dtype = [('id', int), ('similarity', float)]
id_rel = []
for doc_index, doc_vector in enumerate(docs_vector):
    similarity = cosine_similarity(query_flat, doc_vector)
    # Keep only documents with strictly positive similarity (a NaN score
    # also fails this comparison and is skipped).
    if similarity > 0:
        id_rel.append((doc_index, similarity))
id_rel = np.array(id_rel, dtype=dtype)


# Respond for query
num_responds = 10

# Ascending sort on the score, then reversed: highest similarity first.
id_rel_sorted = np.sort(id_rel, order='similarity')[::-1]

# `with` guarantees the report is flushed and closed, even if a write fails.
with open("respond.txt", "w", encoding="utf8") as respond:
    respond.write(f"Top {num_responds} kết quả tốt nhất: \n\n")
    for rank, hit in enumerate(id_rel_sorted[:num_responds], start=1):
        doc_index = hit['id']
        score = hit['similarity']
        # Data files use a five-digit zero-padded number (e.g. news00012.txt),
        # so the reported name is padded the same way.
        doc_name = 'news' + str(doc_index + 1).zfill(5) + '.txt'
        respond.write(f"Rank: {rank} - Score: {score} - Doc's name: {doc_name} \n {corpus[doc_index]}\n")