In [1]:
import sys
import pathlib
import pickle
from sklearn.cluster import MiniBatchKMeans, KMeans

In [2]:
# Put the relative "src" directory first on the module search path so that the
# project modules imported below resolve (presumably they live in ./src —
# confirm against the repository layout).
sys.path.insert(0, "src")
import data_functions
import embedding_functions
import cluster_functions
import graph_functions
import util_functions

  from tqdm.autonotebook import tqdm





In [3]:
def path_name(x):
    """Return the pickle path of the cached ``grascco_lokal`` artifact ``x``.

    Args:
        x: Artifact label inserted into the file name (e.g. ``"data"``,
            ``"embedding"``, ``"clustering"``).

    Returns:
        The relative path ``./tmp/grascco_lokal/grascco_lokal_<x>.pickle`` as a
        string.
    """
    return f"./tmp/grascco_lokal/grascco_lokal_{x}.pickle"

In [None]:
# Load the processed data object from the pickle file addressed by
# path_name("data").
grascco_data = data_functions.DataProcessingFactory.load(
    pathlib.Path(path_name("data"))
)

In [5]:
# Load the sentence-embedding object; the loader is given both the "data" and
# the "embedding" pickle paths.
grascco_embedding = embedding_functions.SentenceEmbeddingsFactory.load(
    pathlib.Path(path_name("data")),
    pathlib.Path(path_name("embedding"))
)

In [4]:
# Load a previously stored phrase clustering from the "clustering" pickle.
# NOTE(review): `grascco_cluster` is also (re)assigned by the
# PhraseClusterFactory.create(...) cell further down; whichever cell runs last
# determines the object used for graph building.
grascco_cluster = cluster_functions.PhraseClusterFactory.load(
    pathlib.Path(path_name("clustering")),
)

In [None]:
# Disabled alternative: create the clustering with k-elbow parameters instead
# of a fixed cluster count. Every line of the call has to stay commented out;
# a single live, indented argument makes the whole cell fail with an
# IndentationError.
#_clustering_cache = pathlib.Path("./tmp/test_grascco_cluster_cache.pickle")
# grascco_cluster = cluster_functions.PhraseClusterFactory.create(
#     sentence_embeddings=grascco_embedding,
#     cache_path=_clustering_cache,
#     cache_name="grascco",
#     cluster_algorithm="kmeans",
#     # kelbow_metric="silhouette",
#     kelbow_k=(10,50),
#     kelbow_estimator="kmeans-mb",
# )

---

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.utils.random import sample_without_replacement
from sklearn.linear_model import LinearRegression
from umap import UMAP
from yellowbrick.cluster import kelbow_visualizer

In [None]:
def fit_regression(x_reg, y_reg, degree=3, k_min=2):
    """Fit a polynomial regression to ``(x_reg, y_reg)`` and locate its maximum.

    A polynomial of the requested ``degree`` is least-squares fitted to the
    points and evaluated on 100 evenly spaced positions between the smallest
    and the largest value of ``x_reg``.

    Args:
        x_reg: Sequence (list or array) of x values, e.g. the evaluated numbers
            of clusters.
        y_reg: Scores, one per entry of ``x_reg``.
        degree: Degree of the fitted polynomial.
        k_min: Unused. Kept in the signature so existing calls keep working;
            earlier versions used it to pad ``x_reg`` before an index lookup.

    Returns:
        A tuple ``(x_lin, y_out, max_reg)`` where ``x_lin`` holds the 100
        evaluation positions, ``y_out`` the fitted values at those positions
        and ``max_reg`` the position of the largest fitted value, rounded to
        the nearest integer.
    """
    x_arr = np.asarray(x_reg)
    y_arr = np.asarray(y_reg)

    poly = PolynomialFeatures(degree=degree)
    model = LinearRegression()
    model.fit(poly.fit_transform(x_arr.reshape(-1, 1)), y_arr)

    x_lin = np.linspace(x_arr.min(), x_arr.max(), 100)
    y_out = model.predict(poly.transform(x_lin.reshape(-1, 1)))

    # np.argmax(y_out) is an index into the 100-point evaluation grid, so the
    # matching x position has to be read from x_lin. Indexing the original k
    # values with it is only in range when len(x_reg) + k_min happens to be
    # at least 100, and even then yields the grid index instead of the x value.
    max_reg = int(round(float(x_lin[np.argmax(y_out)])))

    return x_lin, y_out, max_reg

In [None]:
# UMAP settings shared by both projections of the sentence embeddings.
_umap_params = dict(n_neighbors=50, metric='cosine', min_dist=0.0)

# Two components: this projection is only used for the scatter plots.
umap = UMAP(n_components=2, **_umap_params)
projection_umap_vis = umap.fit_transform(
    grascco_embedding.sentence_embeddings
)

# One hundred components: this projection feeds the k-elbow clustering runs.
umap = UMAP(n_components=100, **_umap_params)
projection_umap_calc = umap.fit_transform(
    grascco_embedding.sentence_embeddings
)

In [None]:
# Display the 100-component UMAP projection computed in the previous cell.
projection_umap_calc

In [None]:
# Number of random subsets to draw, and the divisor that sets their size:
# every subset holds 1/sample_fraction of all projected points.
n_samples = 15
sample_fraction = 25

# Each entry is an array of row indices drawn without replacement.
samples = [
    sample_without_replacement(
        projection_umap_vis.shape[0],
        int(projection_umap_vis.shape[0] / sample_fraction),
    )
    for _draw in range(n_samples)
]

# Scaler fitted on the complete 2-component projection so that all subset
# plots share the same value range.
mms = MinMaxScaler().fit(projection_umap_vis)

In [None]:
# Scatter every sample into its own panel of a 3 x 5 grid, each in a random
# RGB colour.
_, axs = plt.subplots(3, 5, sharex=True, sharey=True)
for i, _c in enumerate(np.random.rand(n_samples, 3)):
    # (i % 3, i % 5) is a distinct (row, column) pair for every i in 0..14
    # because 3 and 5 are coprime, so each of the 15 panels is used once.
    _points = mms.transform(projection_umap_vis[samples[i]])
    axs[i % 3, i % 5].scatter(_points[:, 0], _points[:, 1], s=5, color=_c)
plt.show()

In [None]:
# Smallest number of clusters evaluated by the k-elbow search.
k_min = 2

# One silhouette-scored k-elbow run (MiniBatchKMeans) per random sample of the
# 100-component projection; the visualizers are collected for later analysis.
kelbow = [
    kelbow_visualizer(
        model=MiniBatchKMeans(n_init='auto'),
        X=projection_umap_calc[_sample],
        show=False,
        k=(k_min, 100),
        metric='silhouette',
    )
    for _sample in samples
]


In [None]:
# Inspect one k-elbow run (index 2): raw scores in blue, degree-5 regression
# curve in red, position of the regression maximum in the title.
kelbow_val = kelbow[2]
_k_values = np.asarray(kelbow_val.k_values_)
_k_scores = np.asarray(kelbow_val.k_scores_)

x_vals, y_regression, max_regression = fit_regression(
    kelbow_val.k_values_, kelbow_val.k_scores_, 5
)

plt.scatter(_k_values.reshape(-1, 1), _k_scores, color='blue')
plt.plot(x_vals, y_regression, color='red')
plt.title(f"Maximum: {max_regression}")
plt.show()

In [None]:
# Regression maximum (third return value of fit_regression) of every k-elbow
# run, using a degree-5 polynomial.
_elbow_max = [
    fit_regression(_kelbow.k_values_, _kelbow.k_scores_, 5, k_min)[2]
    for _kelbow in kelbow
]

In [None]:
# Median of the per-sample regression maxima.
np.median(_elbow_max)

In [None]:
# Unweighted average of the per-sample regression maxima; this is the value
# later truncated to an int for the number of clusters.
np.average(_elbow_max)

In [None]:
# Arithmetic mean of the per-sample regression maxima (equal to the
# unweighted np.average of the same values).
np.mean(_elbow_max)

In [None]:
# Create the phrase clustering with k-means, using the truncated average of
# the per-sample regression maxima as the number of clusters.
# The scaling_* values repeat the UMAP settings used for the 100-component
# projection above (50 neighbours, cosine metric, min_dist 0.0).
# NOTE(review): `cluster_by_down_scale=False` is passed together with
# scaling_* arguments — confirm whether the scaling settings take effect.
_clustering_cache = pathlib.Path("./tmp/grascco_lokal")
grascco_cluster = cluster_functions.PhraseClusterFactory.create(
    sentence_embeddings=grascco_embedding,
    cache_path=_clustering_cache,
    cache_name="grascco_lokal",
    cluster_by_down_scale=False,
    cluster_algorithm="kmeans",
    cluster_n_clusters=int(np.average(_elbow_max)),
    scaling_n_neighbors=50,
    scaling_metric='cosine',
    scaling_n_components=100,
    scaling_min_dist=0.0,
    kelbow_estimator=None
)

In [None]:
# Show the shape of the sentence-embedding array.
grascco_embedding.sentence_embeddings.shape

In [None]:
# Materialise and display the result of show_top_k_for_concepts for the
# concept clusters (presumably the top phrases per concept — confirm in
# embedding_functions).
list(embedding_functions.show_top_k_for_concepts(
    grascco_cluster.concept_cluster,
    grascco_embedding
))

In [6]:
# Combine the sentence embeddings and the phrase clustering in one
# WordEmbeddingClustering object, the starting point for graph building.
grascco_graph = cluster_functions.WordEmbeddingClustering(
    sentence_embedding_obj=grascco_embedding,
    cluster_obj=grascco_cluster
)

In [7]:
# Obtain the object whose build_concept_graphs() method is called next.
graph_build_obj = grascco_graph.create_concept_graph_clustering()

In [12]:
# Build the concept graphs with the "significance" simplification algorithm,
# without unrolling and restricted to the clusters.
# NOTE(review): the recorded log output of this cell lists
# 'graph_merge_threshold': None although .9 is passed here — either the output
# is stale or the argument is not forwarded; confirm.
graphs = graph_build_obj.build_concept_graphs(
    graph_simplify_alg="significance",
    graph_unroll=False,
    graph_merge_threshold=.9,
    graph_weight_cut_off=.6,
    restrict_to_cluster=True,
)

INFO:root:Building Document Concept Matrix with following arguments:
{'cluster_distance': 0.6, 'cluster_min_size': 1, 'cluster_exclusion_ids': None, 'graph_simplify': 0.5, 'graph_simplify_alg': 'significance', 'graph_unroll': False, 'graph_sub_clustering': False, 'connection_distance': 2, 'restrict_to_cluster': True, 'filter_min_df': 1, 'filter_max_df': 1.0, 'filter_stop': [], 'graph_cosine_weight': 0.5, 'graph_merge_threshold': None, 'graph_weight_cut_off': 0.6, 'self': <cluster_functions.WordEmbeddingClustering._ConceptGraphClustering object at 0x000001E27F10C100>}
INFO:root:Building Concept Graphs... (exclusion_ids: [])



  0%|          | 0/50 [00:00<?, ?it/s][A[A[A


  8%|▊         | 4/50 [00:00<00:02, 20.39it/s][A[A[A


 14%|█▍        | 7/50 [00:00<00:02, 19.00it/s][A[A[A


 22%|██▏       | 11/50 [00:00<00:01, 25.23it/s][A[A[A


 28%|██▊       | 14/50 [00:00<00:01, 23.07it/s][A[A[A


 36%|███▌      | 18/50 [00:00<00:01, 27.63it/s][A[A[A


 44%|████▍     | 22/50 

In [None]:
# Persist the concept graphs next to the other cached artifacts.
# The target is derived with path_name(), like the load cells at the top, so
# this cell no longer needs `_clustering_cache`, which only exists when the
# optional re-clustering cell was executed. The resulting file is the same:
# ./tmp/grascco_lokal/grascco_lokal_graph.pickle
_graph_pickle = pathlib.Path(path_name("graph"))
# Make sure the cache directory exists before opening the file for writing.
_graph_pickle.parent.mkdir(parents=True, exist_ok=True)
with _graph_pickle.open("wb") as graphs_out:
    pickle.dump(graphs, graphs_out)

In [None]:
# Display the built concept graphs.
graphs