In [1]:
# Development helper (disabled): pick up edits to `cluster_util` without
# restarting the kernel. Uncomment all three lines to use it; the module has to
# be imported before it can be reloaded.
# import importlib
# import cluster_util
# importlib.reload(cluster_util)

# Notebook error analysis 

## Refining error types - Vectorization and clustering method selection


Use the combined, de-duplicated GitHub + Kaggle error dataset.

### Clustering error values

Load the tokenized error dataset (the cells below read the `evalue_processed` column, not `evalue_tokenized`).

In [2]:
import pandas as pd

import cluster_util
import config

# Error dataset that the rest of the notebook vectorizes and clusters.
dataset_path = config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup.xlsx")
df_mlerr_mlbugs_unique = pd.read_excel(dataset_path)

In [5]:
# Preprocessed error messages as a unicode array, one entry per dataset row.
evalue_column = df_mlerr_mlbugs_unique['evalue_processed']
err_descs = evalue_column.values.astype('U')
len(err_descs)

3311

### 1. Vectorization

##### 2. sentence transformers

https://github.com/UKPLab/sentence-transformers

https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2

Sentence to vector; embedding dimension: 384.

different processing

In [6]:
import numpy as np

# df_mlerr_mlbugs_unique['evalue_tokenized_transformer'] = df_mlerr_mlbugs_unique['evalue'].apply(cluster_util.preprocess_text_transformer)

# Embed every preprocessed error message with the sentence-transformer helper.
err_descs1 = df_mlerr_mlbugs_unique['evalue_processed'].values.astype('U')
X_transformers = cluster_util.vectorizer_sentence2vec(err_descs1)

# Persist the embeddings in NumPy's binary format. Saving through an open file
# handle keeps the exact file name; np.save would append ".npy" to a bare path.
transformers_path = config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_embeddings_transformers.txt")
with open(transformers_path, 'wb') as out_file:
    np.save(out_file, X_transformers)

##### 3. fine-tuned subword embeddings

Turn sentences into vectors via word embeddings by taking the mean/sum of the embeddings of all words in the sentence.

The subword embeddings are fine-tuned with `gensim.models.fasttext`, starting from the pretrained "wiki.en.bin" model; embedding dimension: 300.

In [7]:
import numpy as np

import retrain_word2vec
import config

# Load the fine-tuned subword model saved under the configured models folder.
w2v_model = retrain_word2vec.load_word2vec(config.path_w2v_models, "nberr_subword2vec_finetune.model")

# One vector per error message, built by the word2vec vectorizer helper.
word_vectors = w2v_model.wv
embedding_dim = w2v_model.vector_size
X_wordemb = np.array([
    cluster_util.vectorizer_word2vec(desc, word_vectors, embedding_dim)
    for desc in err_descs
])

# Saving through an open file handle keeps the exact file name
# (np.save would append ".npy" to a bare path).
subword_path = config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_subwordembeddings.txt")
with open(subword_path, 'wb') as out_file:
    np.save(out_file, X_wordemb)

X_wordemb.shape  # (.., 300)

(3311, 300)

In [None]:
# Disabled alternative: build the sentence vectors from pretrained GloVe vectors
# (200 dimensions) instead of the fine-tuned subword model.
# NOTE(review): the GloVe path below is an absolute, machine-specific location;
# make it configurable before re-enabling this cell.
# import numpy as np

# glove_vectors = cluster_util.load_glove("C:/Users/yirwa29/Downloads/Dataset-Nb/glove.6B/glove.6B.200d.txt")
# X_wordemb = np.array([cluster_util.vectorizer_word2vec(xi, glove_vectors, 200) for xi in err_descs])

# with open(config.path_default.joinpath("df_mlerr_mlbugs_filtered_dedup_embeddings_glove.txt"), 'wb') as f:
#     np.save(f, X_wordemb)
    
# NOTE(review): the 14518-row shape below presumably comes from a different
# (larger) dataset than the one loaded in this notebook - verify before relying on it.
# X_wordemb.shape # (14518, 200)

### 2. Clustering with vectorized error values


##### 2. sentence transformers

In [8]:
# Disabled one-off step: pick the number of PCA components (the helper name and
# the recorded output suggest this is based on explained variance), project the
# sentence-transformer embeddings, and save the projection to disk.
# The following cell reads the file written here; re-run this step whenever the
# embeddings change, otherwise the saved projection goes stale.
# #PCA

# import numpy as np

# # with open(config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_embeddings_transformers.txt"), 'rb') as f:
# #     X_transformers = np.load(f)
    
# n_components = cluster_util.select_pca_n_basedon_variance(X_transformers)
# X_transformers_pca = cluster_util.pca(X_transformers, n_components=n_components)

# with open(config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_embeddings_transformers_pca.txt"), 'wb') as f:
#     np.save(f, X_transformers_pca)

pca: 115 components can explain 80.15% variance of the data


In [2]:
import numpy as np

# Load the PCA-reduced sentence-transformer embeddings from the cached file.
# If the cache is missing (e.g. a fresh checkout), rebuild the projection from
# the saved raw embeddings and cache it, so a clean "Restart & Run All" works
# instead of stopping with FileNotFoundError.
transformers_pca_path = config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_embeddings_transformers_pca.txt")
try:
    with open(transformers_pca_path, 'rb') as f:
        X_transformers_pca = np.load(f)
except FileNotFoundError:
    # Read the raw embeddings under a separate name so an in-memory
    # `X_transformers` is never overwritten by this fallback.
    with open(config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_embeddings_transformers.txt"), 'rb') as f:
        raw_transformer_embeddings = np.load(f)
    n_components = cluster_util.select_pca_n_basedon_variance(raw_transformer_embeddings)
    X_transformers_pca = cluster_util.pca(raw_transformer_embeddings, n_components=n_components)
    # Saving through an open file handle keeps the exact file name
    # (np.save would append ".npy" to a bare path).
    with open(transformers_pca_path, 'wb') as f:
        np.save(f, X_transformers_pca)

In [10]:
## dbscan
# Search for eps on the PCA-reduced embeddings, then cluster with that value.
eps_transformers = cluster_util.epsilon_search_dbscan(X_transformers_pca)
print(eps_transformers)

dbscan_labels_transformers = cluster_util.cluster_dbscan(
    X_transformers_pca,
    eps=eps_transformers,
    min_samples=2,
)
# Keep the labels next to the error messages they belong to.
df_mlerr_mlbugs_unique.loc[:, "cluster_dbscan_transformers"] = dbscan_labels_transformers

0.8118873240003256
Estimated no. of clusters: 51
Estimated no. of noise points: 389


In [11]:
# Evaluate only rows whose label is not -1 (presumably the noise label - verify
# against cluster_util); the same mask selects both the vectors and the labels.
is_clustered = df_mlerr_mlbugs_unique['cluster_dbscan_transformers'] != -1
cluster_util.eval_cluster_silhouette(
    X_transformers_pca[is_clustered],
    df_mlerr_mlbugs_unique.loc[is_clustered, 'cluster_dbscan_transformers'],
)

-0.122656964

In [12]:
## OPTICS
res = cluster_util.cluster_optics(X_transformers_pca, min_samples = 2)
df_mlerr_mlbugs_unique.loc[:,"cluster_optics_transformers"] = res

Estimated no. of clusters: 504
Estimated no. of noise points: 2007


In [13]:
# Evaluate only rows whose label is not -1 (presumably the noise label - verify
# against cluster_util); the same mask selects both the vectors and the labels.
is_clustered = df_mlerr_mlbugs_unique['cluster_optics_transformers'] != -1
cluster_util.eval_cluster_silhouette(
    X_transformers_pca[is_clustered],
    df_mlerr_mlbugs_unique.loc[is_clustered, 'cluster_optics_transformers'],
)

0.27063355

##### 3. word embeddings

In [20]:
# Disabled one-off step: pick the number of PCA components (the helper name and
# the recorded output suggest this is based on explained variance), project the
# subword embeddings, and save the projection to disk.
# The following cell reads the file written here; re-run this step whenever the
# embeddings change, otherwise the saved projection goes stale.
# #PCA

# import numpy as np

# # with open(config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_subwordembeddings.txt"), 'rb') as f:
# #     X_wordemb = np.load(f)
    
# n_components = cluster_util.select_pca_n_basedon_variance(X_wordemb)
# X_wordemb_pca = cluster_util.pca(X_wordemb, n_components=n_components)

# with open(config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_subwordembeddings_pca.txt"), 'wb') as f:
#     np.save(f, X_wordemb_pca)

pca: 115 components can explain 80.17% variance of the data


In [8]:
import numpy as np

# Load the PCA-reduced subword embeddings from the cached file.
# If the cache is missing (e.g. a fresh checkout), rebuild the projection from
# the saved raw embeddings and cache it, so a clean "Restart & Run All" works
# instead of stopping with FileNotFoundError.
wordemb_pca_path = config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_subwordembeddings_pca.txt")
try:
    with open(wordemb_pca_path, 'rb') as f:
        X_wordemb_pca = np.load(f)
except FileNotFoundError:
    # Read the raw embeddings under a separate name so an in-memory
    # `X_wordemb` is never overwritten by this fallback.
    with open(config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_subwordembeddings.txt"), 'rb') as f:
        raw_subword_embeddings = np.load(f)
    n_components = cluster_util.select_pca_n_basedon_variance(raw_subword_embeddings)
    X_wordemb_pca = cluster_util.pca(raw_subword_embeddings, n_components=n_components)
    # Saving through an open file handle keeps the exact file name
    # (np.save would append ".npy" to a bare path).
    with open(wordemb_pca_path, 'wb') as f:
        np.save(f, X_wordemb_pca)

In [25]:
## dbscan
# Search for eps on the PCA-reduced embeddings, then cluster with that value.
eps_wordemb = cluster_util.epsilon_search_dbscan(X_wordemb_pca)
print(eps_wordemb)

dbscan_labels_wordemb = cluster_util.cluster_dbscan(
    X_wordemb_pca,
    eps=eps_wordemb,
    min_samples=2,
)
# Keep the labels next to the error messages they belong to.
df_mlerr_mlbugs_unique.loc[:, "cluster_dbscan_wordemb"] = dbscan_labels_wordemb

1.4703763990042338
Estimated no. of clusters: 44
Estimated no. of noise points: 420


In [26]:
# Evaluate only rows whose label is not -1 (presumably the noise label - verify
# against cluster_util); the same mask selects both the vectors and the labels.
is_clustered = df_mlerr_mlbugs_unique['cluster_dbscan_wordemb'] != -1
cluster_util.eval_cluster_silhouette(
    X_wordemb_pca[is_clustered],
    df_mlerr_mlbugs_unique.loc[is_clustered, 'cluster_dbscan_wordemb'],
)

0.04268817179429804

In [27]:
## OPTICS
res = cluster_util.cluster_optics(X_wordemb_pca, min_samples = 2)
df_mlerr_mlbugs_unique.loc[:,"cluster_optics_wordemb"] = res

Estimated no. of clusters: 388
Estimated no. of noise points: 2353


In [28]:
# Evaluate only rows whose label is not -1 (presumably the noise label - verify
# against cluster_util); the same mask selects both the vectors and the labels.
is_clustered = df_mlerr_mlbugs_unique['cluster_optics_wordemb'] != -1
cluster_util.eval_cluster_silhouette(
    X_wordemb_pca[is_clustered],
    df_mlerr_mlbugs_unique.loc[is_clustered, 'cluster_optics_wordemb'],
)

0.303413752701044

In [32]:
# Write the frame, including the cluster-label columns added above, without the index.
clustered_path = config.path_default.joinpath("df_mlerr_mlbugs_pregroup_dedup_clustered.xlsx")
df_mlerr_mlbugs_unique.to_excel(clustered_path, index=False, engine='xlsxwriter')