In [1]:
import nltk
from nltk.corpus import stopwords
import re
from striprtf.striprtf import rtf_to_text
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import os
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt

In [2]:
tagged_data = []
directory = "Dataset/Q1"

for filename in os.listdir(directory):
    if filename.endswith(".rtf"):
        with open(os.path.join(directory, filename), "rb") as file:
            rtf_text = file.read().decode("utf-8")
            text = rtf_to_text(rtf_text)
            text = re.sub(r'\[[0-9]*\]',' ',text)
            text = re.sub(r'\s+',' ',text)
            text = re.sub(r'\d+',' ',text)
            text = re.sub(r'\s+',' ',text)
            text = text.lower()
            tokens = nltk.word_tokenize(text)
            words = [word for word in tokens if word.isalpha()]
            words = [word for word in words if word not in stopwords.words("english")]
            cleaned_text = " ".join(words)

            tagged_data.append(TaggedDocument(words=cleaned_text.split(), tags=[str(len(tagged_data))]))

In [3]:
model = Doc2Vec(vector_size=10, window=5, min_count=1, dm=0, epochs=10)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [4]:
inferred_vectors = []
for vector in tagged_data:
    tokenized_doc = vector.words
    doc_vec = model.infer_vector(tokenized_doc)
    inferred_vectors.append(doc_vec)

In [None]:
linked = linkage(inferred_vectors, method='ward', metric='euclidean')

plt.figure(figsize=(10, 5))
dendrogram(linked, orientation='bottom', distance_sort='descending', show_leaf_counts=True)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Document Index')
plt.ylabel('Distance')
plt.show()

In [64]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.0999, min_samples=3)
dbscan.fit(inferred_vectors)

cluster_labels = dbscan.labels_
n_clusters_ = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)

for cluster_id in range(n_clusters_):
    print(f"Cluster {cluster_id}:")
    cluster_indices = [i for i, label in enumerate(cluster_labels) if label == cluster_id]
    print(cluster_indices)
    
outlier_count = 0
for n in cluster_labels:
    if(n==-1):
        outlier_count = outlier_count+1 

Cluster 0:
[5, 23, 54, 63, 84, 85, 91, 95, 100]
Cluster 1:
[7, 32, 33, 34, 36, 37, 38, 39, 41, 42, 49, 50, 52, 55, 56, 58, 59, 61, 62, 66, 67, 69, 70, 74, 75, 76, 77, 79, 80, 81, 82, 83]
Cluster 2:
[10, 11, 12, 13, 14, 90, 103, 104]
Cluster 3:
[18, 22, 24, 25]
Cluster 4:
[1, 28, 30]
Cluster 5:
[29, 35, 51, 57, 73]
Cluster 6:
[43, 47, 53, 98]
Cluster 7:
[48, 60, 99]
Cluster 8:
[15, 92, 94, 101]


In [63]:
outlier_count

33