In [19]:
from nltk.stem import PorterStemmer
import numpy as np
import re
import os

In [20]:
def preprocessing(document):
    tokens = document.split() 
    lowercase_tokens = [token.lower() for token in tokens] 

    stopwords = set()
    with open('./stopwords.txt', 'r', encoding='utf-8') as stopword_file:
        stopwords = set(stopword_file.read().splitlines())
    filtered_tokens = [token for token in lowercase_tokens if token not in stopwords]

    filtered_tokens_without_endings = [re.sub(r'[,.!?"@()%`\':;{}$&*-]+', '', token) for token in filtered_tokens]
    filtered_tokens_without_endings = [token for token in filtered_tokens_without_endings if token != '']

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens_without_endings]
    stemmed_tokens = [token for token in stemmed_tokens if token not in stopwords]

    tokens = [token for token in stemmed_tokens if not token.isdigit() and len(token) > 1]
    
    return tokens

In [21]:
def tokenize(documents):
    dictionary_tokens = dict()
    for document in documents:
        stemmed_tokens = preprocessing(document)

        stemmed_tokens = list(set(stemmed_tokens))

        # count df
        for word in stemmed_tokens:
            if word in dictionary_tokens:
                dictionary_tokens[word] += 1
            else:
                dictionary_tokens[word] = 1
    
    sorted_dictionary = {k: v for k, v in sorted(dictionary_tokens.items())}

    with open('dictionary.txt', 'w', encoding='utf-8') as file:
        file.write("{:<10} {:<20} {}\n".format("t_index", "term", "df"))
        for index, token in enumerate(sorted_dictionary):
            file.write("{:<10} {:<20} {}\n".format(index + 1, token, sorted_dictionary[token]))

    return sorted_dictionary

In [22]:
def calculate_tf_idf(document, dictionary, i):
    tokens = preprocessing(document)
    tf = dict()
    for token in tokens:
        if token in tf:
            tf[token] += 1
        else:
            tf[token] = 1
    
    # calculate tf-idf
    tf_idf = dict()
    for token in tf:
        tf_idf[token] = tf[token] * np.log10(1095/dictionary[token])

    # calculate unit vector
    tf_idf_length = np.linalg.norm(list(tf_idf.values()))
    tf_idf_unit_vector = {token: tf_idf_value / tf_idf_length for token, tf_idf_value in tf_idf.items()}

    if not os.path.exists('./output'):
        os.makedirs('./output')

    with open(f"./output/{i}.txt", 'w', encoding='utf-8') as file:
        file.write(str(len(tf_idf_unit_vector)) + '\n')
        file.write("{:<10} {:<20} \n".format("t_index", "tf-idf"))
        for index, token in enumerate(dictionary):
            if token in tf_idf:
                file.write("{:<10} {:<20} \n".format(index + 1, tf_idf_unit_vector[token]))

    return tf_idf_unit_vector

In [23]:
def cosine(tf_idf_unit_vector_1, tf_idf_unit_vector_2):
    # calculate cosine similarity
    dot_product = sum(tf_idf_unit_vector_1[token] * tf_idf_unit_vector_2[token] for token in tf_idf_unit_vector_1 if token in tf_idf_unit_vector_2)
    
    norm_x = np.linalg.norm(list(tf_idf_unit_vector_1.values()))
    norm_y = np.linalg.norm(list(tf_idf_unit_vector_2.values()))
    
    similarity = dot_product / (norm_x * norm_y)
    
    return similarity

In [24]:
documents = []

# read all documents
for i in range(1, 1096):
    with open(f"./data/{i}.txt", "r", encoding="utf-8") as file:
        text = file.read()
        documents.append(text)

# tokenize all documents
dictionary = tokenize(documents)

# calculate tf-idf for all documents
tf_idf_matrix = []
for i in range(1, 1096):
    tf_idf_matrix.append(calculate_tf_idf(documents[i - 1], dictionary, i))

# calculate cosine similarity between document 1 and 2
consine_result = cosine(tf_idf_matrix[0], tf_idf_matrix[1])
print(consine_result)

0.19759255157392572
