In [None]:
# Dependencies.
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

In [None]:
# Read the clustering input table written by parsing.ipynb; the first CSV column is used as the index.
clusteringDf = pd.read_csv(
    "clusteringRaw.csv",
    index_col=0,
    encoding="utf8",
)

In [None]:
# Load the spaCy pipeline "en_core_web_lg"; `nlp` is called on each document in the preprocessing cell.
# The model must be installed first: python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

In [None]:
# Prepare text for tf-idf: run every document through spaCy, discard stop words and
# tokens tagged with one of the excluded part-of-speech labels, and keep the lemma of
# each surviving token. The lemmas are re-joined with single spaces.
excludedPos = ["PUNCT", "PART", "CONJ", "CCONJ", "SPACE"]
texts = [
    " ".join(
        tok.lemma_
        for tok in nlp(rawDocument)
        if not tok.is_stop and tok.pos_ not in excludedPos
    )
    for rawDocument in clusteringDf['document']
]
clusteringDf['document2'] = texts

In [None]:
# Tf-idf features over single words only; ngram_range=(1, 1) is the library default,
# spelled out here so the "unigrams" choice is visible in the code.
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
preprocessedDocs = clusteringDf['document2']
X = vectorizer.fit_transform(preprocessedDocs)

In [None]:
# 101 clusters was determined by elbow method (inertia vs clusters, find k with greatest negative second derivative) and manually checking the resulting clusters.
# random_state is fixed so that cluster indices are identical on every run: the manually
# labelled clustersNamed.csv / clustersCut.csv refer to clusters by index, so an unseeded
# re-fit would silently attach the hand-written labels to different clusters.
# NOTE(review): label files produced from an earlier unseeded run will not match this
# seeded fit and need to be regenerated once.
# n_init=10 is pinned explicitly (it was the library default for a long time) so that the
# number of restarts does not change with the installed scikit-learn version.
km = KMeans(n_clusters=101, n_init=10, random_state=0)
km.fit(X)

In [None]:
# Transform the input space to get cluster-distance, used to determine match metric.
# Later cells index the result as XTrans[document, cluster]: rows follow the order of
# clusteringDf and columns follow the cluster indices of `km`.
XTrans = km.transform(X)

In [None]:
# Build a dataframe out of clustering results to manually investigate what each cluster represents.
# Layout of clusters.csv (one column per cluster, named "0", "1", ...):
#   rows  0- 9: category of the 10 documents closest to the cluster center
#   rows 10-19: categoryID of those same documents
#   rows 20-29: the 10 highest-weighted terms of the cluster center
#   rows 30-39: the weights of those terms (higher values = more specialized cluster)
topN = 10
nClusters = km.cluster_centers_.shape[0]  # taken from the fitted model instead of repeating 101
cols = [str(i) for i in range(nClusters)]

# Sort once instead of once per output row.
# closestDocs[r, c] is the row position in clusteringDf of the r-th closest document to cluster c.
closestDocs = np.argsort(XTrans, axis=0)[:topN, :]
# topTerms[c, r] is the vocabulary position of the r-th heaviest term of cluster c.
topTerms = np.argsort(km.cluster_centers_, axis=1)[:, ::-1][:, :topN]
topWeights = np.take_along_axis(km.cluster_centers_, topTerms, axis=1)

# argsort yields row *positions*, so index plain arrays rather than the Series
# (Series[...] with integers is label-based and breaks if the index is not 0..n-1).
categories = clusteringDf['category'].to_numpy()
categoryIds = clusteringDf['categoryID'].to_numpy()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when available.
getFeatureNames = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
featureNames = np.asarray(getFeatureNames())

# Collect all rows first and build the frame once (DataFrame.append was removed in pandas 2.0
# and appending row by row is quadratic).
rows = []
rows.extend(categories[closestDocs].tolist())
rows.extend(categoryIds[closestDocs].tolist())
rows.extend(featureNames[topTerms].T.tolist())
rows.extend(topWeights.T.tolist())
clustersDf = pd.DataFrame(rows, columns=cols)
clustersDf.to_csv("clusters.csv")

In [None]:
# Manual step between the previous cell and this one:
#   1. Open clusters.csv and add two rows, CATEGORY and SUBCATEGORY, naming every cluster.
#      Clusters that are hyperspecific (ex: centered around people named George) or seemingly
#      random get "REMOVE" in both rows. Save the result as clustersNamed.csv.
#   2. Delete the "REMOVE" columns from that file and save the result as clustersCut.csv.
# Both hand-edited files are then loaded here.
clustersNamedDf = pd.read_csv(
    "clustersNamed.csv",
    index_col=0,
    encoding="utf8",
)
clustersCutDf = pd.read_csv(
    "clustersCut.csv",
    index_col=0,
    encoding="utf8",
)

In [None]:
# Drop the distance columns of every cluster labelled "REMOVE", so that the nearest
# remaining cluster can be found for documents that were assigned to a removed one.
removedClusterIds = [
    int(clusterName)
    for clusterName in clustersNamedDf.columns
    if clustersNamedDf[clusterName]["CATEGORY"] == "REMOVE"
]
XTransCut = np.delete(XTrans, removedClusterIds, 1)

In [None]:
# Give each document/category its clustering determined category, subcategory, and match metric.
# The assigned cluster comes from km.labels_; if that cluster was labelled "REMOVE", the document
# is reassigned to the closest surviving cluster using XTransCut.
# match = (max - d) / (max - min), where d is the document's distance to its cluster center and
# max/min are taken over all documents' distances to that same center (1 = closest, 0 = farthest).
# I believe the XTransCut and clustersCutDf process can be replaced by using np.argsort and taking the second argument if the first results in "REMOVE".

# Per-cluster distance ranges are loop-invariant, so compute them once.
fullMax, fullMin = XTrans.max(axis=0), XTrans.min(axis=0)
cutMax, cutMin = XTransCut.max(axis=0), XTransCut.min(axis=0)

categoryList = []
subcategoryList = []
matchList = []
for i in range(len(clusteringDf)):
    label = km.labels_[i]
    namedCluster = clustersNamedDf[str(label)]
    if namedCluster['CATEGORY'] == "REMOVE":
        # argmin is a column *position* inside XTransCut (surviving clusters only), so the
        # matching clustersCutDf column is looked up by position, not by name.
        # Assumes clustersCut.csv keeps the surviving columns in their original order — confirm.
        pos = int(np.argmin(XTransCut[i, :]))
        cluster = clustersCutDf.iloc[:, pos]
        dist, hi, lo = XTransCut[i, pos], cutMax[pos], cutMin[pos]
    else:
        cluster = namedCluster
        dist, hi, lo = XTrans[i, label], fullMax[label], fullMin[label]
    categoryList.append(cluster["CATEGORY"])
    subcategoryList.append(cluster["SUBCATEGORY"])
    matchList.append((hi - dist) / (hi - lo))
clusteringDf['categoryAssigned'] = categoryList
clusteringDf['subcategoryAssigned'] = subcategoryList
clusteringDf['match'] = matchList
clusteringDf.to_csv("clusteringResults.csv")

In [None]:
# Read the clue table that the next cell annotates; the first CSV column is used as the index.
clueDf = pd.read_csv(
    "clueRaw.csv",
    index_col=0,
    encoding="utf8",
)

In [None]:
# Add the clustering determined category, subcategory, and match to the clue data file for now.
# This is done for convenience for now, but is removed in normalization.ipynb.

# One lookup row per categoryID (first occurrence wins, as before) instead of three
# full-frame scans for every clue.
clusterLookup = (
    clusteringDf
    .drop_duplicates(subset="categoryID", keep="first")
    .set_index("categoryID")[["categoryAssigned", "subcategoryAssigned", "match"]]
)

# Fail with a clear message if a clue points at a category that was never clustered.
unknownIds = clueDf.loc[~clueDf['categoryID'].isin(clusterLookup.index), 'categoryID']
if not unknownIds.empty:
    raise KeyError(f"categoryID values missing from clusteringDf: {unknownIds.unique().tolist()[:10]}")

clueCategoryList = clueDf['categoryID'].map(clusterLookup["categoryAssigned"]).tolist()
clueSubcategoryList = clueDf['categoryID'].map(clusterLookup["subcategoryAssigned"]).tolist()
clueMatchList = clueDf['categoryID'].map(clusterLookup["match"]).tolist()

# Attach the looked-up values; previously the lists were built but never stored, so
# clueClustered.csv was written without them.
# NOTE(review): column names mirror clusteringDf — confirm they match what normalization.ipynb expects.
clueDf['categoryAssigned'] = clueCategoryList
clueDf['subcategoryAssigned'] = clueSubcategoryList
clueDf['match'] = clueMatchList
clueDf.to_csv("clueClustered.csv")