# Document Clustering

---

### Libraries

In [25]:
import nltk
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import KMeans
from sklearn.cluster import Birch

import pandas as pd
import numpy as np

import file_manager as fm

### Load data from local

In [4]:
all_links = fm.get("_all_links")
tags = list(all_links.keys())
data_block = fm.get("_data_block")

### Helper Functions

In [7]:
def tokenize(raw_text: str) -> list:
	"""
	Takes raw strings, sterilazes text, uses lemmatization, tokenizes words, and returns tokens in a sorted list.
	"""
	no_space_text = " ".join(raw_text.split())  # Remove extra spaces
	no_punctuation_text = re.sub("[^0-9A-Za-z ]", "", no_space_text)  # Remove punctuation
	pure_text = "".join([i.lower() for i in no_punctuation_text])  #   To lower

	# Tokenize and remove stop words
	tokens = nltk.tokenize.word_tokenize(pure_text)
	stop_words = nltk.corpus.stopwords.words("english")
	tokens = [token for token in tokens if token not in stop_words]

	# Lemmatize and sort
	lemm = nltk.stem.WordNetLemmatizer()
	tokens = [lemm.lemmatize(token) for token in tokens if len(token) > 2]
	tokens.sort()
	return tokens

## Preprocessing

### Clean Up Raws

In [9]:
tokens = []
for key in data_block.keys():
	data_block[key]["tokens"] = tokenize(data_block[key]["raw"])
	tokens.append(" ".join(data_block[key]["tokens"]))

### Create Vocab

In [31]:
tfidf = TfidfVectorizer(max_features=200)
dt_matrix = tfidf.fit_transform(tokens)
tfidf.get_feature_names_out()

array(['able', 'access', 'add', 'algorithm', 'also', 'always', 'another',
       'api', 'app', 'application', 'array', 'article', 'based', 'basic',
       'best', 'better', 'block', 'blog', 'browser', 'build', 'call',
       'called', 'case', 'change', 'check', 'class', 'code', 'coding',
       'come', 'command', 'comment', 'component', 'computer', 'concept',
       'copy', 'could', 'create', 'created', 'creating', 'data',
       'database', 'day', 'developer', 'development', 'dictionary',
       'different', 'django', 'done', 'dont', 'easy', 'element', 'end',
       'environment', 'error', 'etc', 'even', 'event', 'every', 'example',
       'feature', 'file', 'find', 'first', 'following', 'framework',
       'function', 'game', 'get', 'git', 'github', 'give', 'given',
       'going', 'good', 'great', 'help', 'hope', 'however', 'html',
       'image', 'important', 'information', 'input', 'inside', 'install',
       'item', 'javascript', 'keep', 'key', 'know', 'language', 'learn',
      

#### Save Vocab

In [33]:
df = pd.DataFrame(dt_matrix.toarray(), columns = tfidf.get_feature_names_out())
df.to_csv("./data/df.csv")

### Models

In [None]:
kmeans = KMeans()
birch = Birch()

### Define Analysis

### Train & Test