In [22]:
import numpy as np
from nltk.stem import PorterStemmer
import re
import os

In [23]:
# Stopword sets already read from disk, keyed by file path. Re-running this
# cell resets the cache, so an edited stopword file is picked up again.
_STOPWORDS_CACHE = {}


def _load_stopwords(path):
    """Return the stopwords listed one per line in `path`, reading the file once per path."""
    if path not in _STOPWORDS_CACHE:
        with open(path, 'r', encoding='utf-8') as stopword_file:
            _STOPWORDS_CACHE[path] = frozenset(stopword_file.read().splitlines())
    return _STOPWORDS_CACHE[path]


def preprocessing(document, stopwords_path='./stopwords.txt'):
    """Turn raw text into a list of normalized, stemmed terms.

    Steps, in order: split on whitespace, lowercase, drop stopwords, strip the
    punctuation characters listed in the regex below, drop tokens that became
    empty, Porter-stem, drop stems that are stopwords, and finally drop
    all-digit tokens and tokens of length 1.

    Parameters
    ----------
    document : str
        Raw document text.
    stopwords_path : str, optional
        Path of a UTF-8 file with one stopword per line. The file is read only
        on the first call for a given path and cached afterwards.

    Returns
    -------
    list of str
        Terms in document order; duplicates are kept.

    Raises
    ------
    FileNotFoundError
        If the stopword file does not exist (raised by `open`).
    """
    stopwords = _load_stopwords(stopwords_path)

    lowercase_tokens = [token.lower() for token in document.split()]
    filtered_tokens = [token for token in lowercase_tokens if token not in stopwords]

    # Punctuation is removed anywhere in the token, not only at its ends.
    stripped_tokens = [re.sub(r'[,.!?"@()%`\':;{}$&*-]+', '', token) for token in filtered_tokens]
    stripped_tokens = [token for token in stripped_tokens if token != '']

    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in stripped_tokens]

    # Stopwords are checked a second time because stripping punctuation and
    # stemming can turn a token into a stopword (e.g. "the," -> "the").
    return [
        token
        for token in stemmed_tokens
        if token not in stopwords and not token.isdigit() and len(token) > 1
    ]

In [24]:
def tokenize(documents):
    """Build the document-frequency table of a collection of texts.

    Each document is run through `preprocessing`; every distinct term it
    yields adds one to that term's count, so a term is counted at most once
    per document.

    Parameters
    ----------
    documents : iterable of str
        Raw document texts.

    Returns
    -------
    dict
        Maps each term to the number of documents containing it, with keys
        inserted in sorted order.
    """
    document_frequency = {}
    for text in documents:
        # The set collapses repeats inside one document: this is df, not tf.
        for term in set(preprocessing(text)):
            document_frequency[term] = document_frequency.get(term, 0) + 1

    return dict(sorted(document_frequency.items()))

In [25]:
def calculate_tf_idf(document, dictionary, i, total_documents=1095):
    """Compute the unit-length tf-idf vector of one document and write it to disk.

    The weight of a term is ``tf * log10(total_documents / df)``, where tf is
    the raw count of the term in `document` (after `preprocessing`) and df is
    looked up in `dictionary`. The weights are then divided by their Euclidean
    norm.

    Parameters
    ----------
    document : str
        Raw document text.
    dictionary : dict
        Maps term -> document frequency. Its iteration order defines the
        1-based term index written to the output file.
    i : int or str
        Identifier used for the output file name ``./output/{i}.txt``.
    total_documents : int, optional
        Corpus size used in the idf formula. Defaults to 1095, the value that
        was previously hard-coded.

    Returns
    -------
    dict
        Maps each term of the document to its normalized tf-idf weight. If
        every weight is zero (or the document has no terms) the weights are
        returned as 0.0 instead of NaN.

    Raises
    ------
    KeyError
        If the document contains a term that is missing from `dictionary`.
    """
    tf = dict()
    for token in preprocessing(document):
        tf[token] = tf.get(token, 0) + 1

    # calculate tf-idf
    tf_idf = {
        token: count * np.log10(total_documents / dictionary[token])
        for token, count in tf.items()
    }

    # calculate unit vector; a zero norm would otherwise produce 0/0 = NaN
    tf_idf_length = np.linalg.norm(list(tf_idf.values()))
    if tf_idf_length > 0:
        tf_idf_unit_vector = {token: weight / tf_idf_length for token, weight in tf_idf.items()}
    else:
        tf_idf_unit_vector = {token: 0.0 for token in tf_idf}

    os.makedirs('./output', exist_ok=True)

    with open(f"./output/{i}.txt", 'w', encoding='utf-8') as file:
        file.write(str(len(tf_idf_unit_vector)) + '\n')
        file.write("{:<10} {:<20} \n".format("t_index", "tf-idf"))
        # Walk the dictionary (not the document) so rows come out in term-index order.
        for index, token in enumerate(dictionary):
            if token in tf_idf_unit_vector:
                file.write("{:<10} {:<20} \n".format(index + 1, tf_idf_unit_vector[token]))

    return tf_idf_unit_vector

In [26]:
def consine_similarity(d1, d2, tf_idf_matrix):
    """Cosine similarity between two sparse term-weight vectors.

    Parameters
    ----------
    d1, d2 : int
        Indices into `tf_idf_matrix` of the two documents to compare.
    tf_idf_matrix : sequence of dict
        Each entry maps term -> weight for one document.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``. Returns 0.0 when either vector is
        empty or has zero norm, instead of dividing by zero and yielding NaN.
    """
    unit_vector_d1 = tf_idf_matrix[d1]
    unit_vector_d2 = tf_idf_matrix[d2]

    norm_x = np.linalg.norm(list(unit_vector_d1.values()))
    norm_y = np.linalg.norm(list(unit_vector_d2.values()))
    denominator = norm_x * norm_y
    # `not (x > 0)` is also true for NaN, so degenerate vectors never reach the division.
    if not denominator > 0:
        return 0.0

    # Only shared terms contribute to the dot product, so scan the smaller vector.
    if len(unit_vector_d1) <= len(unit_vector_d2):
        smaller, larger = unit_vector_d1, unit_vector_d2
    else:
        smaller, larger = unit_vector_d2, unit_vector_d1
    dot_product = sum(weight * larger[token] for token, weight in smaller.items() if token in larger)

    return dot_product / denominator

In [27]:
def hac_single_link(C, K):
    """Single-link hierarchical agglomerative clustering down to K clusters.

    Repeatedly merges the two remaining clusters with the highest similarity.
    After a merge, the similarity of the merged cluster to any other cluster
    is the maximum of the two former similarities (single link).

    Parameters
    ----------
    C : array-like, shape (n + 1, n + 1)
        Pairwise similarity matrix. Row/column 0 is ignored; items are
        numbered 1..n. The matrix is NOT modified: the algorithm works on a
        private copy, so the same `C` can be reused for several values of K.
    K : int
        Number of clusters to stop at.

    Returns
    -------
    dict
        Maps the id of each surviving cluster to the list of item ids in it.

    Raises
    ------
    ValueError
        If more than K clusters remain but no pair can be merged (fewer than
        two clusters are left, or the similarities are NaN).
    """
    # The merge step overwrites similarities; doing that on the caller's
    # matrix would corrupt every later call that reuses it.
    similarity = np.array(C, dtype=float)

    clusters = {i: [i] for i in range(1, len(similarity))}
    remaining_clusters = set(clusters.keys())

    while len(remaining_clusters) > K:
        max_similarity = float('-inf')
        to_merge = None

        for i in remaining_clusters:
            for j in remaining_clusters:
                if i != j and similarity[i, j] > max_similarity:
                    max_similarity = similarity[i, j]
                    to_merge = (i, j)

        if to_merge is None:
            raise ValueError(
                "cannot merge further: fewer than two clusters remain or "
                "the similarities are not comparable (NaN)"
            )

        i, j = to_merge
        clusters[i].extend(clusters[j])
        del clusters[j]
        remaining_clusters.remove(j)

        for k in remaining_clusters:
            if k != i:
                # single link: the merged cluster is as close as its closest member
                similarity[i, k] = similarity[k, i] = max(similarity[i, k], similarity[j, k])
                similarity[j, k] = similarity[k, j] = -1

    return clusters

In [28]:
# Number of documents in ./data, stored as 1.txt ... {document_number}.txt.
# calculate_tf_idf computes idf against a corpus size of 1095, so keep this
# value in sync with it.
document_number = 1095

documents = []

# read all documents
for i in range(1, document_number + 1):
    with open(f"./data/{i}.txt", "r", encoding="utf-8") as file:
        documents.append(file.read())

# tokenize all documents
dictionary = tokenize(documents)

# calculate tf-idf for all documents
# index 0 is a placeholder so that tf_idf_matrix[i] belongs to document i
tf_idf_matrix = [{}]
for i in range(1, document_number + 1):
    tf_idf_matrix.append(calculate_tf_idf(documents[i - 1], dictionary, i))

# pairwise similarity matrix; row/column 0 is unused padding for 1-based ids
C = np.zeros((document_number + 1, document_number + 1))

# cosine similarity is symmetric, so compute each pair once and mirror it
for i in range(1, document_number + 1):
    for j in range(i, document_number + 1):
        C[i][j] = C[j][i] = consine_similarity(i, j, tf_idf_matrix)

K_values = [8, 13, 20]
clusters_results = {}

for K in K_values:
    # Each run gets its own copy so that every K starts from the
    # original similarities even if the clustering modifies its input.
    clusters_results = hac_single_link(C.copy(), K)
    with open(f'./{K}.txt', 'w') as file:
        # one document id per line, blank line between clusters
        for cluster in clusters_results.values():
            for document in cluster:
                file.write(f'{document} \n')
            file.write('\n')