## Import Libraries

In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import silhouette_samples, silhouette_score
import pandas as pd

## Define files

In [2]:
tr_file = 'Data/train-data.dat'
tr_lbl_file = 'Data/train-label.dat'
t_file = 'Data/test-data.dat'
t_lbl_file = 'Data/test-label.dat'
labels_file = 'Data/labels.txt'
vocab_file = 'Data/vocabs.txt'

## Functions to use

In [3]:
def read_data_per_sentence(txt_file, data_file):
    
    dictionary = {}
    data = []
    labels = []
    
    with open(txt_file, 'r') as txtfile:
        lines = txtfile.read().splitlines()
        for line in lines:
            # splits the line into two values, the one before the comma (token) and the one after the comma (indx)           
            token, indx = line.split(', ')
            # create a dictionary of the form {indx: token}
            dictionary[int(indx)] = token
    
    with open(data_file, 'r') as txtfile:
        lines = txtfile.read().splitlines()
        for line in lines:
            line_data = line.split(' ')
            # takes the first element of line_data by removing <> and taking only the element inside the brackets 
            # e.g. <215> results in 215 because 2,1,5 are the line_data[0][1], line_data[0][2], and line_data[0][3] respectively
            num_doc_sentences = int(line_data[0][1:-1])  
            doc_index = 1
            doc_sentences = []  # Store sentences of the document
            
            # Loop through the sentences of the document
            for i in range(num_doc_sentences):
                # takes the second element of line_data by removing <> and taking only the element inside the brackets
                num_sentence_tokens = int(line_data[doc_index][1:-1])
                doc_index += 1
                sentence = []
                
                for j in range(doc_index, num_sentence_tokens + doc_index):
                    # Take the indexes of the tokens of the sentence and find the corresponding words from the vocabulary
                    sentence.append(dictionary[int(line_data[j])])
                
                doc_index += num_sentence_tokens
                data.append(' '.join(sentence))
            

    return dictionary, data

def multilabel_to_binary(labels):
    class_frequencies = np.sum(labels, axis=0)
    most_frequent_class = np.argmax(class_frequencies)

    binary_labels = np.zeros(labels.shape[0], dtype=int)
    binary_labels[labels[:, most_frequent_class] == 1] = 1

    return most_frequent_class, binary_labels


def number_of_clusters(input_data):
    sse = {}
    
    for k in range(3, 21):
        kmeans = KMeans(n_clusters=k, random_state=42)
        kmeans.fit(input_data)
        sse[k] = kmeans.inertia_
        
    # Plot the sse values
    plt.plot(range(3, 21), list(sse.values()), marker='o')
    plt.title('Elbow Method')
    plt.xlabel('Number of Clusters')
    plt.ylabel('SSE')
    plt.show()    
    
    kn = KneeLocator(x=list(sse.keys()), 
                 y=list(sse.values()), 
                 curve='convex', 
                 direction='decreasing')
    k = kn.knee 
    return k


def k_means_clustering(k, input_data):

    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(input_data)
    
    cluster_labels=kmeans.labels_
    
    return cluster_labels


def read_data_per_doc(vocab, data_file, labels_file, cluster_sentences):
    data = []
    assigned_data = []
    labels = []
    
    with open(data_file, 'r') as txtfile:
        lines = txtfile.read().splitlines()
        for line in lines:
            line_data = line.split(' ')
            num_doc_sentences = int(line_data[0][1:-1])  
            doc_index = 1
            doc = ''
            sent_in_doc = ''
            cluster = []
            # loop through the sentences of the document
            for i in range(num_doc_sentences):
                # takes the second element of line_data by removing <> and taking only the element inside the brackets
                num_sentence_tokens = int(line_data[doc_index][1:-1])           
                doc_index += 1
                sent_in_doc = ''
                for i in range(doc_index, num_sentence_tokens + doc_index):
                    sent_in_doc += vocab[int(line_data[i])] + ' '
                    doc += vocab[int(line_data[i])] + ' '
                doc_index += num_sentence_tokens
                cluster.append(cluster_sentences[sent_in_doc.rstrip()])
            data.append(doc)
            assigned_data.append(cluster)
            
    with open(labels_file, 'r') as txtfile:
        lines = txtfile.read().splitlines()
        for line in lines:
            doc_labels = [int(output) for output in line.split(' ')]
            labels.append(doc_labels)            
            
    return data, assigned_data, np.int8(labels)


def docs_per_cluster(num_clusters, data):

    # Determine the total number of columns
    num_columns = num_clusters
    
    # Create an empty DataFrame with the appropriate number of columns
    column_names = [f'cluster_{i}' for i in range(num_columns)] 
    df = pd.DataFrame(columns=column_names)
    
    # Iterate over the documents and populate the DataFrame
    for doc in data:
        row = [0] * num_columns  # Initialize a row with all zeros
        for group in doc:
            row[group] += 1  # Increment the count for the corresponding group
        df.loc[len(df)] = row  # Append the row to the DataFrame
        
    return df

## Read data from files - Sentence level

In [5]:
vocab, train_sent = read_data_per_sentence(vocab_file, tr_file)
_, test_sent = read_data_per_sentence(vocab_file, t_file)

In [6]:
print('\nFirst 3 train sentences\n')
for i in range(3):
    print('{}:\t{}'.format(i, train_sent[i]))
print('\nExtracted {} sentences'.format(len(train_sent)))


First 3 train sentences

0:	rubi rail helper demo more info auto complet
1:	see new helper action
2:	null length substr locat

Extracted 149925 sentences


In [7]:
print('\nExtracted {} sentences'.format(len(test_sent)))


Extracted 73363 sentences


## Remove duplicate sentences

In [8]:
train_sent = list(set(train_sent))
test_sent = list(set(test_sent))

## Turn sentences into vectors

In [9]:
tfidf_vectorizer = TfidfVectorizer(vocabulary=None)
train_inputs = tfidf_vectorizer.fit_transform(train_sent)
test_inputs = TfidfVectorizer(vocabulary=tfidf_vectorizer.get_feature_names_out()).fit_transform(test_sent)

train_inputs.shape, test_inputs.shape

((141153, 8520), (69436, 8520))

## Cluster training sentences - kmeans

In [10]:
#k = number_of_clusters(train_inputs)
cluster_labels = k_means_clustering(17, train_inputs)

In [11]:
from collections import Counter
cluster_counts = Counter(cluster_labels)
for cluster, count in cluster_counts.items():
    print(f"Cluster {cluster}: {count} sentences")

Cluster 13: 3593 sentences
Cluster 3: 73126 sentences
Cluster 7: 2681 sentences
Cluster 0: 13932 sentences
Cluster 11: 2068 sentences
Cluster 16: 8130 sentences
Cluster 8: 7552 sentences
Cluster 15: 2985 sentences
Cluster 1: 7060 sentences
Cluster 12: 3358 sentences
Cluster 10: 2897 sentences
Cluster 9: 1579 sentences
Cluster 2: 2573 sentences
Cluster 5: 3983 sentences
Cluster 4: 2052 sentences
Cluster 6: 2257 sentences
Cluster 14: 1327 sentences


## Create a dictionary to store the sentences and its cluster

In [12]:
cluster_sentences = {}

# Iterate over each sentence and its corresponding cluster label
for sentence, cluster_label in zip(train_sent, cluster_labels):
    cluster_sentences[sentence] = cluster_label

In [13]:
cluster_sentences

{'cite work use name address titl book titl paper': 13,
 'bore wander randomli again': 3,
 'continu third word top line word ear': 3,
 'take look post june good': 7,
 'visit masterpiec steel glass design bank fox river': 0,
 'discuss mail list user discuss': 11,
 'board avail com use filter': 16,
 'offset use top bottom left right anyth': 16,
 'virginia leav fund behind': 3,
 'hitler bush repeat lie enough come accept truth': 3,
 'select design under sea': 0,
 'make code work better': 13,
 'spent five year prove easi': 3,
 'instead spirit guidelin user primari consider': 3,
 'fundament compon univers answer emin psychologist lawrenc': 3,
 'discov formerli power edit tool photo manag': 3,
 'recogn limit sens read shakespear': 3,
 'turn out symmetr key better transmit lot inform approach take': 0,
 'revis use intern identifi specif version file repositori': 16,
 'build bargain box gift certif ship info order price match': 3,
 'wider format do better one reason actual take fewer line': 8,

## Read data from files - Document level
### Represent document as a vector of clusters

In [14]:
train_doc, train_doc_clustered, train_labels = read_data_per_doc(vocab, tr_file,tr_lbl_file, cluster_sentences)

In [19]:
print('\nFirst 5 train documents\n')
for i in range(5):
    print('{}:\t{}'.format(i, train_doc[i]))
print('\nExtracted {} documents'.format(len(train_doc)))


First 5 train documents

0:	rubi rail helper demo more info auto complet see new helper action 
1:	null length substr locat exec messag messag pleas edit remov follow word content roll stone com news song previou next page good vibrat beach boy smell teen spirit nirvana want hold hand beatl hound dog elvi god know beach boy walk line johnni cash heaven led zeppelin sympathi devil roll stone river deep mountain high turner woman cri bob day buddi holli georgia mind ray charl heartbreak hotel elvi bridg over troubl water simon track tear robinson miracl messag flash five man love woman long tall salli littl richard whole jerri lee lewi california girl beach boy brand new bag jame brown whole love led zeppelin strawberri field forev beatl mysteri train elvi feel good jame brown 
2:	substr remov addit valu return function result fals progress through version final present adopt wide rang industri project spring focus around provid way manag busi object spring both comprehens modular maxim

In [20]:
print('\nFirst 5 train documents\n')
for i in range(5):
    print('{}:\t{}'.format(i, train_doc_clustered[i]))
print('\nExtracted {} documents'.format(len(train_doc_clustered)))


First 5 train documents

0:	[8, 5]
1:	[3, 3, 15, 3, 3, 3, 3, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3]
2:	[3, 3, 2, 3, 3, 3, 0, 3, 3, 8, 8, 0, 3, 8, 2, 8, 16, 3, 3, 0, 16, 0, 3, 16, 16, 3, 16, 2, 5, 16, 3]
3:	[1, 3]
4:	[2, 1, 1, 16, 2, 8, 0]

Extracted 8251 documents


## Convert our multi-label problem to a binary classification problem

In [15]:
train_labels

array([[1, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 1, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int8)

In [18]:
most_frequent_class, train_binary_labels = multilabel_to_binary(train_labels)

print('Most frequent class:',format(most_frequent_class))
print()
for i in range(10):
    print('Original: {}\tBinary: {}'.format(train_labels[i], train_binary_labels[i]))

Most frequent class: 2

Original: [1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]	Binary: 1
Original: [0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0]	Binary: 1
Original: [1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]	Binary: 0
Original: [1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0]	Binary: 0
Original: [1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]	Binary: 0
Original: [0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0]	Binary: 1
Original: [1 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0]	Binary: 0
Original: [0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]	Binary: 0
Original: [0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0]	Binary: 1
Original: [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0]	Binary: 0


## Modify the training dataset. Columns are now cluster numbers

In [22]:
train_doc_clustered = docs_per_cluster(17, train_doc_clustered)

In [23]:
train_doc_clustered

Unnamed: 0,cluster_0,cluster_1,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16
0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
1,0,1,0,25,0,0,1,0,0,0,0,0,0,0,0,1,0
2,4,0,3,13,0,1,0,0,4,0,0,0,0,0,0,0,6
3,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,2,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8246,0,0,0,7,0,0,0,0,0,0,2,0,0,0,0,0,0
8247,3,0,0,21,0,2,1,0,0,0,0,0,0,0,0,0,0
8248,3,3,0,17,0,0,0,0,0,1,3,0,0,3,0,0,0
8249,0,0,0,4,0,0,0,0,0,0,0,0,0,1,0,0,0


In [24]:
train_binary_labels.shape

(8251,)

In [26]:
train_binary_labels.shape

(8251,)