# Coding Assignment 08: Your task is to cluster reports into 6 clusters using the k-means clustering algorithm with k = 6.

In [1]:
"""
Necessary libraries for the task
"""
import warnings
warnings.filterwarnings("ignore")

import os
import re
import sys
import time
import nltk
import spacy
import numpy as np
import pandas as pd
nltk.download('omw-1.4')
nltk.download('wordnet')
from tqdm.auto import tqdm
from nltk.corpus import wordnet
import textacy.preprocessing as tprep
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/sharminsultana/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sharminsultana/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
"""
Read the dataset
"""
file = os.getcwd() + '/JDT_Bugs_sm.csv' #location of the dataset
df = pd.read_csv(file)
df = df[['bug report','Component']].dropna() #Get rid of the irrelavant attributes
df

Unnamed: 0,bug report,Component
0,Exception thrown during reconcile Got the foll...,APT
1,GWT Bug Getting the error when trying to insta...,APT
2,Unable to load factory names from container [t...,APT
3,JdtApt headless build fails for Integrated Ext...,APT
4,[compiler][apt] Error type detection is too co...,APT
...,...,...
2638,NPE trying to externalize strings [refactoring...,UI
2639,Java Search API package has no package.html [s...,UI
2640,Rename refactoring of inner type does not upda...,UI
2641,Disable DND operation for logical packages [dn...,UI


# Phase 1: 
Data Preparation

In [3]:
"""
Clean data
"""
# ### Standard cleaning function
def clean(text):
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....)
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # sequences of white spaces
    text = re.sub(r'\s+', ' ', text)
    # removing mentions 
    text = re.sub("@\S+", "", text)
    text = re.sub('[-%!@#$]', '', text)
    text = re.sub("@[A-Za-z0-9]+","",text)
    #Removing numerical data
    text = re.sub(r'\d+','',text)
    #Removing currencies 
    text = re.sub(r'[\$\d+\d+\$]','',text)
    #Handling all date formats
    text = re.sub(r'\d+[\.\/-]\d+[\.\/-]\d+', '', text)
    #Removing a hyperlink
    text = re.sub(r'https?:\/\/.*[\r\n]*', '',text)
    #Extracting the main domain name of a URL
#     text = re.search(r'[\.\/]+(.*)\.',text)
#     #Removing all punctuation
    text = re.sub(r'[^a-z0-9A-Z_]',' ',text)
    return text.strip()

df['bug report'] = df['bug report'].apply(clean)

In [4]:
"""
Normalize data using textacy
"""
def normalize(text):
        text = tprep.normalize.hyphenated_words(text)
        text = tprep.normalize.quotation_marks(text)
        text = tprep.normalize.unicode(text)
        text = tprep.remove.accents(text)
        return text
df['bug report'] = df['bug report'].apply(normalize)

In [5]:
"""
Use spaCy for lemmatization and also check if token is a legitimate English word 
"""
nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "attribute_ruler","ner"])

for i, row in tqdm(df.iterrows(), total=len(df)):
    doc = nlp(str(row["bug report"])) 
    df.at[i, "lemma"] = " ".join(token.lemma_ for token in doc if wordnet.synsets(str(token)))
    ###implement wanted_words by using wordnet from nltk and this check legitimate English word
    

  0%|          | 0/2643 [00:00<?, ?it/s]

In [6]:
df

Unnamed: 0,bug report,Component,lemma
0,Exception thrown during reconcile Got the foll...,APT,exception thrown reconcile got following excep...
1,GWT Bug Getting the error when trying to insta...,APT,bug getting error trying install installing ne...
2,Unable to load factory names from container I ...,APT,unable load factory names container i have an ...
3,JdtApt headless build fails for Integrated Ext...,APT,headless build fails integrated external tool ...
4,Error type detection is too conservative Compi...,APT,error type detection is too conservative compi...
...,...,...,...
2638,NPE trying to externalize strings I got the fo...,UI,trying externalize strings i got following i t...
2639,Java Search API package has no package html I ...,UI,java search package has no package html i expo...
2640,Rename refactoring of inner type does not upda...,UI,rename inner type does not update constructor ...
2641,Disable DND operation for logical packages in ...,UI,disable operation logical packages in i i m no...


# Phase 2: 
Feature engineering (text vectorization)

In [9]:
"""
Represent each report as a vector of tf-idf values.
"""
tfidf = TfidfVectorizer(stop_words=stopwords, use_idf=True)

###TFIDF for lemmas
tfidf_lemmas = tfidf.fit_transform(df["lemma"])

X = tfidf_lemmas.toarray()


# Phase 3:
K-means implimentation: Implement k-means using cosine similarity on vector representations.

In [10]:
def get_initial_centroids(data, k, seed=None):
    '''Randomly choose k data points as initial centroids'''
    if seed is not None: # useful for obtaining consistent results
        np.random.seed(20)
    n = data.shape[0] # number of data points
        
    # Pick K indices from range [0, N).
    rand_indices = np.random.randint(0, n, k)
    centroids = data[rand_indices,:]
    
    return centroids


In [11]:
"""
Assign clusters based on cosine distance
"""
def assign_clusters(data, centroids):
    
    # Compute cosine distances between each data point and the set of centroids:
    distances_from_centroids = pairwise_distances(data, centroids, metric='cosine')
    
    # Compute number of cluster assignments for each data point:
    cluster_assignment = np.argmin(distances_from_centroids, axis = 1)
    
    return cluster_assignment

In [21]:
"""
Assign new centers for each epoch
"""
def revise_centroids(data, k, cluster_assignment):
    new_centroids = []
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[cluster_assignment == i]
        # Compute the mean of the data points. 
        centroid = member_data_points.mean(axis = 0)
        centroid = centroid
        new_centroids.append(centroid)
    new_centroids = np.array(new_centroids)
    
    return new_centroids

In [24]:
'''This function runs k-means on given data and initial set of centroids.
   maxiter: maximum number of iterations to run.
'''
def kmeans(data, k, initial_centroids, maxiter):
    centroids = initial_centroids[:]
    prev_cluster_assignment = None

    for itr in range(maxiter):        

        # 1. Make cluster assignments using nearest centroids
        cluster_assignment = assign_clusters(data, centroids)
            
        # 2. Compute a new centroid for each of the k clusters, averaging all data points assigned to that cluster.
        centroids = revise_centroids(data, k, cluster_assignment)
            
        # Check for convergence: if none of the assignments changed, stop
        if prev_cluster_assignment is not None and (prev_cluster_assignment==cluster_assignment).all():
            break
        
        # Print number of new assignments 
        if prev_cluster_assignment is not None:
            num_changed = sum(abs(prev_cluster_assignment-cluster_assignment))

        prev_cluster_assignment = cluster_assignment[:]
        
    return centroids, cluster_assignment


# Phase 4.1: 
Run k-means on the tf-idf vectors of reports for 10 epochs 

In [25]:
k = 6 ##Randomly choose 6 data points from 6 different target

for e in range (0,10): ###10 epochs 
    print ("Epoch: ",e)
    initial_centroids = get_initial_centroids(X, k)
    
    centroids, cluster_assignment = kmeans(X, k, initial_centroids, maxiter=1000)

    print("Cluster Assignments: ",np.bincount(cluster_assignment))


Epoch:  0
Cluster Assignments:  [397 417 452 440 735 202]
Epoch:  1
Cluster Assignments:  [257 601 704 426 461 194]
Epoch:  2
Cluster Assignments:  [336 439 478 422 431 537]
Epoch:  3
Cluster Assignments:  [444 515 267 426 353 638]
Epoch:  4
Cluster Assignments:  [422 357 462 556 503 343]
Epoch:  5
Cluster Assignments:  [410 347 427 683 617 159]
Epoch:  6
Cluster Assignments:  [500 543 254 428 447 471]
Epoch:  7
Cluster Assignments:  [223 451 418 349 666 536]
Epoch:  8
Cluster Assignments:  [471 452 267 428 408 617]
Epoch:  9
Cluster Assignments:  [487 394 237 355 640 530]


# Phase 4.2: 
Is the resulted clustering reasonably consistent with the  component labels? Justify your finding.

In the resultant clusters, index[0] represents # of clusters in APT, index[1] represents clusters in Core, index [2] is for Debug, index [3] presents num of clusters in Doc. Similarly index [4] and [5] represents the number of clusters in Text and UI. In the original given dataset, there are 404 bug reports for APT, 654 in Core, 120 in Debug, 188 in Doc, 404 in Text and 873 reports in UI. It is observed from the results that number of cluster assigments is not that much satisfactory. So we can say that Cosine similarity alone is not a sufficiently good comparison function for good text clustering. And K-means clustering is not guaranteed to give the same answer every time.