In [48]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.cluster import contingency_matrix
import numpy as np

In [2]:
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Reading Dataset

In [3]:
import os
directory = 'Doc50/'
files = os.listdir(directory)
print(files)

['101725', '102616', '103117', '103118', '103119', '103120', '103121', '103122', '103123', '103124', '20361', '20362', '20363', '20364', '20365', '20487', '20488', '20489', '20490', '20491', '52550', '52551', '52552', '52553', '52554', '52555', '52556', '52557', '52558', '52559', '57110', '58043', '58044', '58045', '58046', '58047', '58048', '58049', '58050', '58051', '64830', '64831', '66189', '66322', '66398', '66399', '66400', '66401', '66402', '66403']


# PreProcessing

In [18]:
REMOVABLE_PREFIXES = ["Path:","From:","Newsgroups:","Message-ID:","Date:","Sender:","Lines:","Approved:"
                     ,"Expires:","References","In article","--","Xref:","X-Newsreader:"
                     ,"Nntp-Posting-Host:","Distribution:","Reply-To:","Article-I.D.:","Followup-To:","Supersedes:"
                     ,"Posted-Date:","Posted:","Re:","In article","In-reply-to:","X-Newsreader:","NNTP-Posting-Host:","Originator:"
                     ,"Distribution:","Lines:","Nntp-Posting-Host:","X-Newsreader:","Internet:","UCCP:"
                     ,"Article-ID:","Bitnet:","NewsFlash:"]
REPLACEABLE_TEXT = ["\n","\t",">","[","]",".","|","_","\"","\"","\'","!",")","(","=","?","\\","/","~","*",":","-",",","#","&"
                  ,"^",";","{","}","<",">","$","+","Subject:","Re:","Organization:","Summary:","Keywords:","-","_","."]
EMAIL_PATTERN = r'\S*@\S*'
lemmatizer = WordNetLemmatizer()

In [27]:
dataset = []
for file in files:
    current = []
    with open(directory + file, 'r') as f:
        for line in f:
            prefixed=False
            for text in REMOVABLE_PREFIXES:
                if re.search(text,line):
                    prefixed=True
                    break
            for text in REPLACEABLE_TEXT:
                line = line.replace(text,"  ")
            for text in stopWords:
                line = line.replace(" "+text+" "," ")
            line = re.sub(EMAIL_PATTERN, '  ', line)
            line = re.sub(r'\d+',' ', line)
            if not prefixed:
                if line.strip() != '':
                    line = line.strip()
                    current.append(" ".join(line.split()))
    dataset.append(''.join(current))
tokenizedDataset=[]
for line in dataset:
    tokens = word_tokenize(line)
    lemmatizedTokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    tokenizedDataset.append(lemmatizedTokens)

# Baseline 1 => Term Frequency Features

In [34]:
vectorizer = CountVectorizer(max_features=1000)
TFMatrix = vectorizer.fit_transform(dataset)
TFfeatureNames = vectorizer.get_feature_names()
TFMatrixArray = TFMatrix.toarray()

In [35]:
TFkmeans = KMeans(n_clusters=5, random_state=42)
TFkmeans.fit(TFMatrixArray)
TFlabels = TFkmeans.labels_
print(TFlabels)

[0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 2 0 4 0 0 0 0 0 0]


# Baseline 2 => TF-IDF Features

In [24]:
tfidf = TfidfVectorizer(min_df=0.1, max_df=0.8)
tfidfMatrix = tfidf.fit_transform(dataset).toarray()
ifidfFeatureNames = tfidf.get_feature_names()
print(tfidfMatrix.shape)

(50, 277)


In [38]:
tfidfkmeans = KMeans(n_clusters=5, random_state=42)
tfidfkmeans.fit(tfidfMatrix)
tfidfLabels = tfidfkmeans.labels_
print(tfidfLabels)

[3 3 3 1 1 3 3 1 1 3 3 4 3 4 4 3 3 2 3 3 0 3 2 2 1 3 1 4 2 3 0 3 3 2 4 0 4
 0 0 3 3 3 3 3 3 1 3 0 0 2]


In [44]:
model = Word2Vec(tokenizedDataset)
# obtain document embeddings
doc_embeddings = []
for doc in dataset:
    word_embeddings = [model.wv[word] for word in doc if word in model.wv.key_to_index]
    if word_embeddings:
        doc_embedding = np.mean(word_embeddings, axis=0)
    else:
        doc_embedding = np.zeros(model.vector_size)
    doc_embeddings.append(doc_embedding)
word2Veckmeans = KMeans(n_clusters=5, random_state=42)
word2VecClusters = word2Veckmeans.fit_predict(doc_embeddings)
word2VecLabels = word2Veckmeans.labels_
print(word2VecLabels)

[0 3 3 3 0 2 3 2 3 3 4 0 3 3 2 2 3 3 0 3 3 3 2 4 1 2 3 1 0 4 3 2 4 3 4 3 3
 3 0 0 0 0 0 0 2 0 0 4 3 4]


In [54]:
GTdirectory = 'GT/'
gtLabels = {}

# iterate over files in GT directory
for className in os.listdir(GTdirectory):
    classDir = os.path.join(GTdirectory, className)
    if os.path.isdir(classDir):
        for filename in os.listdir(classDir):
            sampleID = filename
            gtLabels[sampleID] = className

tfClusterLabels = {}
tfidfClusterLabels = {}
word2VecClusterLabels = {}
for i in range(len(TFlabels)):
    tfClusterLabels[files[i]] = "C"+str(TFlabels[i] + 1)
for i in range(len(tfidfLabels)):
    tfidfClusterLabels[files[i]] = "C"+str(tfidfLabels[i] + 1)
for i in range(len(word2VecLabels)):
    word2VecClusterLabels[files[i]] = "C"+str(word2VecLabels[i] + 1)




In [53]:
def contingency(labels_true,labels_pred):
    cm = contingency_matrix(labels_true, labels_pred)
    n_samples = len(labels_true)
    purity = 0
    for i in range(cm.shape[0]):
        purity += max(cm[i,:])
    purity /= n_samples
    return purity

In [52]:
labels_true = [gtLabels[sample_id] for sample_id in tfClusterLabels.keys()]
labels_pred = list(tfClusterLabels.values())
tfClusterPurity = contingency(labels_true,labels_pred)
labels_true = [gtLabels[sample_id] for sample_id in tfidfClusterLabels.keys()]
labels_pred = list(tfidfClusterLabels.values())
tfidfClusterPurity = contingency(labels_true,labels_pred)
labels_true = [gtLabels[sample_id] for sample_id in word2VecClusterLabels.keys()]
labels_pred = list(word2VecClusterLabels.values())
word2VecClusterPurity = contingency(labels_true,labels_pred)
print('Purity of TF Baseline = {:.3f}'.format(tfClusterPurity))
print('Purity of TF-IDF Baseline = {:.3f}'.format(tfidfClusterPurity))
print('Purity of Word2Vec Baseline = {:.3f}'.format(word2VecClusterPurity))


Purity of TF Baseline = 0.920
Purity of TF-IDF Baseline = 0.500
Purity of Word2Vec Baseline = 0.500
