# Document Clustering

---

### Libraries

In [1]:
import nltk
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn.cluster import KMeans
from sklearn.cluster import Birch

import pandas as pd
import numpy as np

import file_manager as fm

### Load data from local

In [2]:
all_links = fm.get("_all_links")
tags = list(all_links.keys())
data_block = fm.get("_data_block")

### Helper Functions

In [3]:
def tokenize(raw_text: str) -> list:
	"""
	Takes raw strings, sterilazes text, uses lemmatization, tokenizes words, and returns tokens in a sorted list.
	"""
	no_space_text = " ".join(raw_text.split())  # Remove extra spaces
	no_punctuation_text = re.sub("[^0-9A-Za-z ]", "", no_space_text)  # Remove punctuation
	pure_text = "".join([i.lower() for i in no_punctuation_text])  #   To lower

	# Tokenize and remove stop words
	tokens = nltk.tokenize.word_tokenize(pure_text)
	stop_words = nltk.corpus.stopwords.words("english")
	tokens = [token for token in tokens if token not in stop_words]

	# Lemmatize and sort
	lemm = nltk.stem.WordNetLemmatizer()
	tokens = [lemm.lemmatize(token) for token in tokens if len(token) > 2]
	tokens.sort()
	return tokens

## Preprocessing

### Clean Up Raws

In [4]:
tokens = []
for key in data_block.keys():
	data_block[key]["tokens"] = tokenize(data_block[key]["raw"])
	tokens.append(" ".join(data_block[key]["tokens"]))

### Create Vocab

In [5]:
tfidf = TfidfVectorizer(max_features = 300)
dt_matrix = tfidf.fit_transform(tokens)
tfidf.get_feature_names_out()

array(['able', 'access', 'action', 'add', 'address', 'algorithm',
       'allows', 'already', 'also', 'always', 'another', 'api', 'app',
       'application', 'approach', 'argument', 'around', 'array',
       'article', 'available', 'back', 'based', 'basic', 'best', 'better',
       'block', 'blog', 'branch', 'browser', 'build', 'building', 'call',
       'called', 'case', 'change', 'check', 'class', 'click', 'code',
       'coding', 'come', 'command', 'comment', 'component', 'computer',
       'concept', 'condition', 'consider', 'content', 'control', 'copy',
       'could', 'course', 'create', 'created', 'creating', 'cs', 'data',
       'database', 'day', 'detail', 'developer', 'development',
       'dictionary', 'different', 'django', 'done', 'dont', 'easier',
       'easy', 'element', 'end', 'environment', 'error', 'etc', 'even',
       'event', 'every', 'everything', 'example', 'execution',
       'experience', 'feature', 'feel', 'file', 'find', 'first', 'folder',
       'following

#### Save Vocab

In [6]:
# df = pd.DataFrame(dt_matrix.toarray(), columns = tfidf.get_feature_names_out())
# df.to_csv("./data/df.csv")

## Models

In [7]:
kmeans = [
	KMeans(n_clusters = 3),
	KMeans(n_clusters = 4),
	KMeans(n_clusters = 5)
]

birch = [
	Birch(n_clusters = 4, threshold = 0.3),
	Birch(n_clusters = 4, threshold = 0.5),
	Birch(n_clusters = 4, threshold = 0.7)
]

### Train

In [8]:
for model in kmeans:
	model.fit(dt_matrix)

for model in birch:
	model.fit(dt_matrix)

In [27]:
def distribution(preds):
	clusters = list(range(-1, max(preds)))
	for pred in preds:
		clusters[pred] = clusters[pred] + 1
	return sorted(clusters, reverse = True)

def dist_k(kmeans):
	priorities = [score.index(min(score)) for score in kmeans.cluster_centers_.transpose().tolist()]
	return distribution(priorities)

def dist_b(birch):
	priorities = [score.index(min(score)) for score in birch.subcluster_centers_.tolist()]
	return distribution(priorities)

In [28]:
for model in kmeans:
	print(dist_k(model))

print("--------")

for model in birch:
	print(dist_b(model))

[144, 96, 60]
[143, 64, 48, 47]
[152, 50, 37, 36, 30]
--------
[1711, 180, 40, 10, 5, 3]
[1296, 162, 41, 13, 9, 4]
[825, 115, 63, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 33, 32, 31, 30, 29, 28, 27, 27, 25, 24, 23, 22, 21, 20, 20, 18, 17, 16, 15, 15, 14, 14, 12, 11, 10, 10, 9, 8, 7, 6, 5, 4]


### Test

In [11]:
kmeans_preds = [model.predict(dt_matrix) for model in kmeans]
birch_preds = [model.predict(dt_matrix) for model in birch]

for pred in kmeans_preds:
	print(distribution(pred))

print("--------")

for pred in birch_preds:
	print(distribution(pred))

[1306, 813, 528]
[935, 873, 440, 401]
[1084, 610, 471, 341, 146]
--------
[1427, 744, 417, 61]
[1196, 642, 414, 397]
[1087, 872, 358, 332]


In [12]:
def stats()-> None:
	distribution = sorted([len(all_links[tag]) for tag in all_links.keys()], reverse = True)
	print(distribution)
	print(sum(distribution))
stats()

[1182, 792, 554, 147]
2675


In [42]:
def x(tokens):
	num = 0
	for token in tokens:
		if token.count("framework") and token.count("python"):
			num = num + 1
	print(num)
x(tokens)

121
