**Task 1: Working with Bag of Words Dataset**

Importing important and necessary libraries 

In [1]:
#Importing important and necessary libraries

import os
import random

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

%matplotlib inline



The UCI Bag of Words dataset is a collection of text documents that have been preprocessed and represented as a bag-of-words model. It is commonly used in natural language processing and machine learning tasks, such as text classification and information retrieval.

The dataset consists of a text corpus where each document is represented as a sparse vector of word frequencies. The bag-of-words model represents a document by counting the frequency of each word that appears in it, ignoring the order and structure of the text. This representation allows for efficient and straightforward analysis of text data.

The UCI Bag of Words dataset includes two files:

* vocab.txt: This file contains the vocabulary of the dataset, listing all the unique words found in the corpus. Each word is assigned a unique identifier or index.

* docword.txt: This file represents the bag-of-words representation of the documents. It contains three columns: docID, wordID, and count. Each row corresponds to an occurrence of a word in a document. The docID identifies the document, the wordID represents the index of the word in the vocabulary, and the count indicates the frequency of that word in the document.

I combined three datasets into a common corpus. Therefore, two functions called **get_bow_file** and **get_vocab_file** are created to write these datasets into pandas dataframe one-by-one.

In [46]:
# Define paths to datasets
data_paths = {
    'enron': {
        'docword': '../input/uci-bag-of-words/docword.enron.txt',
        'vocab': '../input/uci-bag-of-words/vocab.enron.txt'
    },
    'kos': {
        'docword': '../input/uci-bag-of-words/docword.kos.txt',
        'vocab': '../input/uci-bag-of-words/vocab.kos.txt'
    },
    'nips': {
        'docword': '../input/uci-bag-of-words/docword.nips.txt',
        'vocab': '../input/uci-bag-of-words/vocab.nips.txt'
    }
}

In [47]:
# Load dataset into pandas DataFrame
def load_dataset(data_paths, dataset_name, doc_limit=None):
    docword_path = data_paths[dataset_name]['docword']
    vocab_path = data_paths[dataset_name]['vocab']
    
    docword = pd.read_csv(docword_path, header=None, names=['docID', 'wordID', 'count'], skiprows=3, sep=' ')
    vocab = pd.read_csv(vocab_path, header=None, names=['word']).fillna('null')
    vocab['wordID'] = vocab.index + 1
    
    if doc_limit:
        doc_ids = random.sample(list(set(docword['docID'])), k=doc_limit)
        docword = docword[docword['docID'].isin(doc_ids)].reset_index(drop=True)
    
    return docword, vocab

In [48]:
# Load datasets
enron, enron_vocab = load_dataset(data_paths, 'enron', doc_limit=6000)
kos, kos_vocab = load_dataset(data_paths, 'kos')
nips, nips_vocab = load_dataset(data_paths, 'nips')

In [50]:
# Combine datasets into a common corpus
dfs = []
offset = 0
for df, vocab in [(enron, enron_vocab), (kos, kos_vocab), (nips, nips_vocab)]:
    ids = df['docID'] + offset
    df['new_id'] = ids
    offset = ids.max()
    df = df.merge(vocab)[['new_id', 'word', 'count']]
    dfs.append(df)

In [51]:
merged = pd.concat(dfs, ignore_index=True).rename(columns={'new_id': 'docID'})
merged_vocab = pd.DataFrame({'word': merged['word'].unique()}).reset_index().rename(columns={'index': 'wordID'})
merged = merged.merge(merged_vocab, how='left')
merged = merged[['docID', 'wordID', 'count']].sort_values(['docID', 'wordID']).reset_index(drop=True)

# Create word-document matrix
wdm = merged.pivot(index='wordID', columns='docID', values='count').fillna(0.0).astype(pd.SparseDtype("float", 0.0))

In [55]:
# Truncated SVD
svd = TruncatedSVD(n_components=100, n_iter=10, random_state=42)
y_svd = svd.fit_transform(wdm)

In [56]:
# Top dimensions analysis
wordsSVD = pd.DataFrame(y_svd, index=merged_vocab['word'])

for index in range(10):
    thrsh = wordsSVD.loc[:, index].quantile(0.9)
    print(f"Dimension: {index}")
    words = list(wordsSVD[wordsSVD[index] > thrsh].sort_values([index], ascending=False).index)[:25]
    print(words)
    print()

Dimension: 0
['network', 'model', 'learning', 'input', 'function', 'neural', 'set', 'unit', 'data', 'training', 'algorithm', 'system', 'output', 'weight', 'error', 'problem', 'result', 'number', 'method', 'parameter', 'pattern', 'neuron', 'vector', 'point', 'layer']

Dimension: 1
['company', 'power', 'energy', 'california', 'electricity', 'market', 'billion', 'davis', 'business', 'plan', 'companies', 'firm', 'stock', 'prices', 'price', 'customer', 'plant', 'cost', 'states', 'bill', 'financial', 'month', 'utility', 'investor', 'group']

Dimension: 2
['network', 'unit', 'input', 'neural', 'output', 'weight', 'layer', 'hidden', 'net', 'training', 'company', 'pattern', 'connection', 'recurrent', 'neuron', 'architecture', 'activation', 'trained', 'firm', 'propagation', 'chip', 'delay', 'fund', 'analog', 'threshold']

Dimension: 3
['model', 'neuron', 'cell', 'input', 'visual', 'system', 'response', 'signal', 'object', 'field', 'motion', 'activity', 'firing', 'direction', 'synaptic', 'stimulu

In [59]:
# Cosine similarity analysis
def calculate_cosine_similarity(data_paths, dataset_name, trimmed_doc_ids=None):
    docword_path = data_paths[dataset_name]['docword']
    docword = pd.read_csv(docword_path, header=None, names=['docID', 'wordID', 'count'], skiprows=3, sep=' ')
    
    if trimmed_doc_ids:
        docword = docword[docword['docID'].isin(trimmed_doc_ids)].reset_index(drop=True)
    
    wdm = docword.pivot(index='docID', columns='wordID', values='count').fillna(0).astype(pd.SparseDtype("int16", 0))
    sims = cosine_similarity(wdm, dense_output=False)
    return sims.mean()

average_cosine_similarities = []
average_cosine_similarities.append(("enron", calculate_cosine_similarity(data_paths, 'enron', trimmed_doc_ids=set(enron['docID']))))
average_cosine_similarities.append(("kos", calculate_cosine_similarity(data_paths, 'kos')))
average_cosine_similarities.append(("nips", calculate_cosine_similarity(data_paths, 'nips')))

doc_term_mat = merged.pivot(index='docID', columns='wordID', values='count').fillna(0).astype(pd.SparseDtype("int16", 0))
sims = cosine_similarity(doc_term_mat, dense_output=False)
average_cosine_similarities.append(("all", sims.mean()))

cosine_sims_df = pd.DataFrame(average_cosine_similarities, columns=['corpus', "average_cosine_similarity"])
cosine_sims_df

Unnamed: 0,corpus,average_cosine_similarity
0,enron,0.0256043
1,kos,0.0805451
2,nips,0.1777671
3,all,0.0269295


In [58]:
# LSA for clustering
dtm = wdm.T

lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
x_lsa = lsa.fit_transform(dtm)
explained_variance = lsa[0].explained_variance_ratio_.sum()
print(f"Explained variance (LSA): {explained_variance * 100:.2f}%")

Explained variance (LSA): 56.62%


In [60]:
#using kmeans model for the clustering 
kmeans = KMeans(n_clusters=10, max_iter=100, n_init=1)
kmeans.fit(x_lsa)
labels = kmeans.labels_

dtm['label'] = labels
for cluster in range(10):
    print(f"Cluster: {cluster}")
    cluster_docs = dtm[dtm['label'] == cluster]
    top_ten_words = set(cluster_docs.sum().sort_values(ascending=False).head(10).index)
    print(merged_vocab[merged_vocab['wordID'].isin(top_ten_words)]['word'].tolist())
    print()

Cluster: 0
['data', 'set', 'word', 'hit', 'training', 'target', 'false', 'speech', 'alarm', 'fom']

Cluster: 1
['house', 'party', 'kerry', 'dean', 'senate', 'campaign', 'democratic', 'poll', 'state']

Cluster: 2
['california', 'energy', 'prices', 'company', 'market', 'power', 'cost', 'electricity', 'customer']

Cluster: 3
['team', 'meeting', 'think', 'going', 'game', 'play', 'free', 'season', 'texas']

Cluster: 4
['model', 'unit', 'function', 'input', 'network', 'output', 'learning', 'neural', 'system', 'neuron']

Cluster: 5
['company', 'market', 'business', 'firm', 'stock', 'management', 'services', 'group', 'companies']

Cluster: 6
['problem', 'model', 'data', 'set', 'function', 'network', 'training', 'error', 'learning', 'algorithm']

Cluster: 7
['november', 'house', 'governor', 'kerry', 'senate', 'bush', 'poll', 'polls', 'republicans']

Cluster: 8
['attached', 'contract', 'point', 'data', 'price', 'order', 'number', 'mid', 'page']

Cluster: 9
['general', 'president', 'kerry', 'bush

In [61]:
# PCA for clustering
tdm = merged.pivot(index='wordID', columns='docID', values='count').fillna(0.0)
pca = PCA(n_components=100)
y_pca = pca.fit_transform(tdm)

word_pca = pd.DataFrame(y_pca, index=merged_vocab['word'])

explained_variance_pca = pca.explained_variance_ratio_.sum()
print(f"Explained variance (PCA): {explained_variance_pca * 100:.2f}%")

Explained variance (PCA): 57.55%


In [63]:
for index in range(10):
    thrsh = word_pca.loc[:, index].quantile(0.9)
    print(f"Dimension: {index}")
    words = list(word_pca[word_pca[index] > thrsh].sort_values([index], ascending=False).index)[:25]
    print(words)
    print()

Dimension: 0
['network', 'model', 'learning', 'input', 'function', 'neural', 'set', 'unit', 'data', 'training', 'algorithm', 'system', 'output', 'weight', 'error', 'problem', 'result', 'number', 'method', 'parameter', 'pattern', 'neuron', 'vector', 'point', 'layer']

Dimension: 1
['company', 'power', 'energy', 'california', 'electricity', 'market', 'billion', 'davis', 'business', 'plan', 'companies', 'firm', 'stock', 'prices', 'price', 'plant', 'customer', 'cost', 'states', 'bill', 'financial', 'utility', 'month', 'investor', 'group']

Dimension: 2
['network', 'unit', 'input', 'neural', 'output', 'weight', 'layer', 'hidden', 'net', 'training', 'company', 'pattern', 'connection', 'neuron', 'recurrent', 'architecture', 'activation', 'trained', 'firm', 'propagation', 'chip', 'delay', 'analog', 'fund', 'threshold']

Dimension: 3
['model', 'neuron', 'cell', 'input', 'visual', 'system', 'response', 'signal', 'object', 'field', 'motion', 'activity', 'firing', 'direction', 'synaptic', 'stimulu