# Document Clustering

---

### Libraries

In [12]:
import nltk
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import Birch

import pandas as pd
import numpy as np

import file_manager as fm

### Load data from local

In [13]:
all_links = fm.get("_all_links")
tags = list(all_links.keys())
data_block = fm.get("_data_block")

### Helper Functions

In [14]:
def tokenize(raw_text: str) -> list:
	"""
	Takes raw strings, sterilazes text, uses lemmatization, tokenizes words, and returns tokens in a sorted list.
	"""
	no_space_text = " ".join(raw_text.split())  # Remove extra spaces
	no_punctuation_text = re.sub("[^0-9A-Za-z ]", "", no_space_text)  # Remove punctuation
	pure_text = "".join([i.lower() for i in no_punctuation_text])  #   To lower

	# Tokenize and remove stop words
	tokens = nltk.tokenize.word_tokenize(pure_text)
	stop_words = nltk.corpus.stopwords.words("english")
	tokens = [token for token in tokens if token not in stop_words]

	# Lemmatize and sort
	lemm = nltk.stem.WordNetLemmatizer()
	tokens = [lemm.lemmatize(token) for token in tokens if len(token) > 2]
	tokens.sort()
	return tokens

## Preprocessing

### Clean Up Raws

In [15]:
tokens = []
for key in data_block.keys():
	data_block[key]["tokens"] = tokenize(data_block[key]["raw"])
	tokens.append(" ".join(data_block[key]["tokens"]))

### Create Vocab

In [16]:
tfidf = TfidfVectorizer(max_features = 150)
dt_matrix = tfidf.fit_transform(tokens)
tfidf.get_feature_names_out()

array(['access', 'add', 'algorithm', 'also', 'another', 'api', 'app',
       'application', 'array', 'article', 'basic', 'better', 'blog',
       'build', 'call', 'called', 'case', 'change', 'check', 'class',
       'code', 'coding', 'come', 'command', 'component', 'concept',
       'could', 'create', 'created', 'data', 'database', 'day',
       'developer', 'development', 'different', 'django', 'dont', 'easy',
       'element', 'end', 'environment', 'error', 'even', 'event', 'every',
       'example', 'feature', 'file', 'find', 'first', 'following',
       'framework', 'function', 'get', 'git', 'github', 'give', 'going',
       'good', 'help', 'hope', 'image', 'important', 'input',
       'javascript', 'key', 'know', 'language', 'learn', 'learning',
       'let', 'library', 'like', 'line', 'list', 'look', 'loop', 'lot',
       'make', 'many', 'may', 'mean', 'memory', 'method', 'model', 'much',
       'name', 'need', 'new', 'next', 'number', 'object', 'one', 'open',
       'operation',

#### Save Vocab

In [17]:
# df = pd.DataFrame(dt_matrix.toarray(), columns = tfidf.get_feature_names_out())
# df.to_csv("./data/df.csv")

## Models

In [18]:
kmeans = [
	KMeans(n_clusters = 3),
	KMeans(n_clusters = 4),
	KMeans(n_clusters = 5)
]

birch = [
	Birch(n_clusters = 4, threshold = 0.3),
	Birch(n_clusters = 4, threshold = 0.5),
	Birch(n_clusters = 4, threshold = 0.7)
]

In [19]:
for model in kmeans:
	model.fit(dt_matrix)

for model in birch:
	model.fit(dt_matrix)

### Feature Distribution

In [20]:
def distribution(preds):
	clusters = list(range(-1, max(preds)))
	for pred in preds:
		clusters[pred] = clusters[pred] + 1
	return sorted(clusters,  reverse = True)

def dist_k(kmeans):
	priorities = [score.index(min(score)) for score in kmeans.cluster_centers_.transpose().tolist()]
	return distribution(priorities)

def dist_b(birch):
	priorities = [score.index(max(score)) for score in birch.root_.centroids_.transpose().tolist()]
	return distribution(priorities)

In [21]:
for model in kmeans:
	print(dist_k(model))

print("--------")

for model in birch:
	print(dist_b(model))

[73, 40, 37]
[55, 42, 30, 25]
[54, 37, 32, 21, 11]
--------
[91, 58]
[47, 42, 42, 41, 40, 39, 38, 38, 37, 35, 35, 33, 30, 29, 28, 28, 27, 26, 25, 25, 24, 24, 23, 23, 21, 21, 18, 18, 16, 15, 12, 11, 10, 10, 9, 7, 7, 6, 5, 3, 1, 0]
[39, 34, 32, 28, 26, 24, 22, 21, 21, 20, 20, 19, 18, 17, 15, 13, 12, 12, 11, 10, 9, 7, 7, 5, 5, 2]


### Data Distribution

In [22]:
kmeans_preds = [model.predict(dt_matrix) for model in kmeans]
birch_preds = [model.predict(dt_matrix) for model in birch]

for pred in kmeans_preds:
	print(distribution(pred))

print("--------")

for pred in birch_preds:
	print(distribution(pred))

[1511, 725, 411]
[1044, 827, 418, 360]
[1003, 738, 402, 385, 124]
--------
[1259, 687, 521, 182]
[1301, 550, 539, 259]
[917, 625, 604, 503]


In [None]:
def stats()-> None:
	distribution = sorted([len(all_links[tag]) for tag in all_links.keys()], reverse = True)
	print(distribution)
	print(sum(distribution))
stats()

In [None]:
def x(tokens):
	num = 0
	for token in tokens:
		if token.count("web") and token.count("app"):
			num = num + 1
	print(num)
	# Or
	# print(len([0 for token in tokens if token.count("web") and token.count("app")]))
x(tokens)