In [124]:
import pandas as pd
import regex as re
from bs4 import BeautifulSoup
from bertopic.representation import KeyBERTInspired, PartOfSpeech, MaximalMarginalRelevance
from bertopic import BERTopic
from hdbscan import HDBSCAN
from umap import UMAP


import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer

# Fetch the NLTK resources used by this notebook; the log below shows each
# call reports "already up-to-date" once the package is cached locally.
nltk.download('wordnet')  # corpus behind WordNetLemmatizer
nltk.download("stopwords")  # English stop-word list read via stopwords.words('english')
nltk.download('punkt_tab')  # presumably the tokenizer data needed by word_tokenize
# NOTE(review): the three resources below look like they back pos_tag / ne_chunk,
# which are imported above but not called in the visible cells — confirm they are still needed.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/cs/grad/opumni/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cs/grad/opumni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/cs/grad/opumni/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/cs/grad/opumni/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /home/cs/grad/opumni/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /home/cs/grad/opumni/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    Args:
        phrase: Text to process. Anything that is not a ``str`` (for example
            ``None`` or a float NaN coming from a pandas column) is treated
            as missing text.

    Returns:
        The text with contractions expanded, or ``""`` for non-string input.

    Note:
        Matching is case-sensitive and purely textual, so possessives such as
        ``Mike's`` are also rewritten to ``Mike is``.
    """
    # Guard on the parameter itself so non-string input yields an empty string.
    if not isinstance(phrase, str):
        return ""

    # Order matters: the two irregular forms must be handled before the
    # generic "n't" rule, and "n't" before the bare "'t" rule.
    expansions = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for contraction, expansion in expansions:
        phrase = re.sub(contraction, expansion, phrase)
    return phrase

def replace_file_paths(text):
    """Mask URLs and file-system paths in ``text`` with placeholder tokens.

    URLs (``http://`` / ``https://``) are rewritten to ``URL`` first, so that
    their slashes are not picked up by the path pattern afterwards. Windows
    paths of the form ``C:\\dir\\file.ext`` and any whitespace-delimited run
    starting with ``/`` are then rewritten to ``PATH``.

    Returns ``""`` when ``text`` is not a string.
    """
    if isinstance(text, str):
        # First alternative: drive letter, backslash-separated folders, file
        # name with extension. Second alternative: "/" followed by non-space.
        path_regex = r'([A-Za-z]:\\(?:[\w\-\s]+\\)*[\w\-\s]+\.[a-zA-Z0-9]+)|(/\S+)'

        without_urls = re.sub(r'https?://[\n\S]+\b', 'URL', text)
        return re.sub(path_regex, 'PATH', without_urls)
    return ""

def replace_code_identifiers(text):
    """Replace code-like identifiers in ``text`` with the token ``CODE``.

    Two passes are applied:

    1. snake_case style tokens (word characters joined by an underscore);
    2. empty-argument calls such as ``name()``, ``UPPER:value`` compounds,
       and words containing an uppercase letter after their first character
       (camelCase / PascalCase with inner capitals).

    Args:
        text: Text to process; non-string input is treated as missing.

    Returns:
        The masked text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Function calls, compound identifiers, and camelCase names.
    pattern = r'\b[A-Za-z_][A-Za-z0-9_.\-]*\(\)|\b[A-Z_]+:[A-Za-z0-9_]+|\b[a-zA-Z_][a-zA-Z0-9]*[A-Z][a-zA-Z0-9]*'

    # Raw string: "\w" in a normal literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    cleaned_text = re.sub(r"\w+_\w+", 'CODE', text)

    # Replace the remaining identifier shapes with 'CODE'.
    cleaned_text = re.sub(pattern, 'CODE', cleaned_text)

    return cleaned_text

def replace_numbers(text):
    """Swap every run of digits in ``text`` for the token `` NUMBER ``.

    The token is padded with a space on each side so it stays separated from
    adjacent characters. Non-string input produces ``""``.
    """
    return re.sub(r'\d+', ' NUMBER ', text) if isinstance(text, str) else ""

def replace_quoted_text(text):
    """Replace quoted spans in ``text`` with the token `` QUOTE ``.

    Spans delimited by double quotes, backticks, or single quotes are
    replaced. A single quote only counts as a delimiter when it is not
    attached to a word character on its outer side, so apostrophes inside
    words (``it's``, ``don't``) are left alone.

    Args:
        text: Text to process; non-string input is treated as missing.

    Returns:
        The masked text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Each delimiter is matched against itself. Collapsing all quote
    # characters into "'" first would make every in-word apostrophe look like
    # an opening quote (e.g. "it's a 'test'" -> "it QUOTE test'").
    pattern = r'"[^"]*"|`[^`]*`|(?<!\w)\'[^\']*\'(?!\w)'

    # Replace quoted text with ' QUOTE '.
    cleaned_text = re.sub(pattern, ' QUOTE ', text)

    return cleaned_text

def remove_commands(text):
    """Mask common shell command names in ``text``.

    Every whole-word occurrence of one of the listed Linux or Windows command
    names is replaced by the token `` COMMAND `` (padded with spaces).
    Matching is case-sensitive. Non-string input produces ``""``.
    """
    if not isinstance(text, str):
        return ""

    # Command vocabularies, Linux first and then Windows.
    known_commands = (
        'ls', 'cat', 'cd', 'grep', 'mkdir', 'rm', 'touch', 'pwd', 'chmod', 'cp', 'mv',
        'dir', 'cls', 'copy', 'del', 'echo', 'mkdir', 'rmdir', 'type', 'move', 'ren',
    )

    # Escape each name, then wrap the alternation in word boundaries so that
    # only standalone words match (e.g. "cats" is left untouched).
    alternation = '|'.join(map(re.escape, known_commands))
    command_regex = r'\b(?:' + alternation + r')\b'

    return re.sub(command_regex, ' COMMAND ', text)

In [None]:
# Load the issue dump and build one cleaned text field per issue.
ISSUE_DATA_PATH = "./Data/AllIssues.csv"
df = pd.read_csv(ISSUE_DATA_PATH, low_memory=False)
#df = df.dropna(axis=0)

# Fill missing values BEFORE casting to str: astype(str) turns NaN into the
# literal string 'nan', after which fillna('') has nothing left to fill.
df['Body'] = df['Body'].fillna('').astype(str)
# A missing Title would otherwise make the whole concatenation NaN and the
# issue body would be dropped by the cleaning helpers below.
df['combinedText'] = df['Title'].fillna('').astype(str) + ' ' + df['Body']

# Optional masking steps; only URL/path masking is currently enabled.
#df["combinedText"] = df["combinedText"].apply(lambda x: decontracted(x))
#df["combinedText"] = df["combinedText"].apply(lambda x: remove_commands(x))
df["combinedText"] = df["combinedText"].apply(replace_file_paths)
#df["combinedText"] = df["combinedText"].apply(lambda x: replace_code_identifiers(x))
#df["combinedText"] = df["combinedText"].apply(lambda x: replace_quoted_text(x))
#df["combinedText"] = df["combinedText"].apply(lambda x: replace_numbers(x))

# Strip punctuation by keeping only runs of word characters.
punc_tokenizer = RegexpTokenizer(r'\w+')
df["combinedText"] = df["combinedText"].apply(lambda x: " ".join(punc_tokenizer.tokenize(x)))

# Drop English stop words (compared case-insensitively).
stop_words = set(stopwords.words('english'))
df["combinedText"] = df["combinedText"].apply(
    lambda x: " ".join(w for w in word_tokenize(x) if w.lower() not in stop_words)
)

# Lemmatize the remaining tokens.
lemmatizer = WordNetLemmatizer()
df["combinedText"] = df["combinedText"].apply(
    lambda x: " ".join(lemmatizer.lemmatize(word) for word in word_tokenize(x))
)

# Documents handed to the topic model.
data = df['combinedText'].tolist()

In [128]:
# Display the frame, including the derived `combinedText` column, to inspect the preprocessing result.
df

Unnamed: 0,Repository,IssueId,Title,Body,State,Label,CreatedAt,ClosedAt,combinedText
18,askmike/gekko,2872,Candle Tick drifts if API fails to return data...,**I'm submitting a ...**\r\n[x] bug report\r\n...,closed,wontfix,2019-12-08 20:13:08+00:00,2020-02-13 21:02:53+00:00,Candle Tick drift API fails return data X minu...
19,askmike/gekko,2871,Reconciliation of overlapping datasets,Separate import jobs could retrieve data for o...,closed,wontfix,2019-11-27 12:22:17+00:00,2020-02-02 13:38:27+00:00,Reconciliation overlapping datasets Separate i...
20,askmike/gekko,2870,Binance: can't have BTC as currency and USDT a...,"**Note: this is the technical bug tracker, ple...",closed,wontfix,2019-11-18 13:54:22+00:00,2020-01-24 15:56:51+00:00,Binance BTC currency USDT asset accumulate USD...
21,askmike/gekko,2869,How to exit from a strategy ?,"Hello People, i've tried so far on my own, but...",closed,wontfix,2019-11-13 22:31:20+00:00,2020-01-20 01:12:55+00:00,exit strategy Hello People tried far need help...
24,askmike/gekko,2866,Too accurate,"Hi Mike,\r\n\r\nI am getting an error on certa...",closed,wontfix,2019-11-06 08:13:16+00:00,2020-01-12 09:51:23+00:00,accurate Hi Mike getting error certain pair si...
...,...,...,...,...,...,...,...,...,...
332225,simplestaking/tezos-wallet,4,Bump jquery from 3.4.1 to 3.5.1,Bumps [jquery](https://github.com/jquery/jquer...,closed,dependencies,2020-05-06 20:21:39+00:00,2022-05-16 16:45:00+00:00,Bump jquery 3 4 1 3 5 1 Bumps jquery URL 3 4 1...
332226,simplestaking/tezos-wallet,3,Bump jquery from 3.4.1 to 3.5.0,Bumps [jquery](https://github.com/jquery/jquer...,closed,dependencies,2020-04-30 12:26:05+00:00,2020-05-06 20:21:41+00:00,Bump jquery 3 4 1 3 5 0 Bumps jquery URL 3 4 1...
332227,simplestaking/tezos-wallet,2,Bump acorn from 5.7.3 to 5.7.4,Bumps [acorn](https://github.com/acornjs/acorn...,closed,dependencies,2020-03-15 15:34:39+00:00,2020-03-21 20:30:59+00:00,Bump acorn 5 7 3 5 7 4 Bumps acorn URL 5 7 3 5...
332232,ampleforth/market-oracle,73,Bump elliptic from 6.4.1 to 6.5.3,Bumps [elliptic](https://github.com/indutny/el...,closed,dependencies,2020-12-23 17:13:42+00:00,2021-01-22 17:28:38+00:00,Bump elliptic 6 4 1 6 5 3 Bumps elliptic URL 6...


In [127]:
# List of representation models (KeyBERT-inspired, then MMR with diversity 0.3).
# NOTE(review): this list is never passed to BERTopic below — confirm whether
# `representation_model=representation_model` was meant to be supplied.
representation_model = [
    KeyBERTInspired(),
    MaximalMarginalRelevance(diversity=0.3),
]

# Custom dimensionality-reduction and clustering models.
# NOTE(review): both are constructed but currently NOT handed to BERTopic
# (the `umap_model=umap_model, hdbscan_model=hdbscan_model` arguments are
# disabled), so the fit below does not use them.
umap_model = UMAP(
    n_neighbors=15,
    n_components=100,
    metric='cosine',
    low_memory=False,
)
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    prediction_data=True,
)
#umap_model=umap_model,hdbscan_model=hdbscan_model, 
topic_model = BERTopic(
    language="english",
    n_gram_range=(1, 3),
    nr_topics='auto',
    calculate_probabilities=True,
)

# Fit on the preprocessed documents; returns a topic assignment per document
# plus the topic probabilities requested via calculate_probabilities.
topics, probs = topic_model.fit_transform(data)

KeyboardInterrupt: 

In [None]:
# Build the topic hierarchy from the fitted model, using the same documents it was fitted on.
hierarchical_topics = topic_model.hierarchical_topics(data)

100%|██████████| 6/6 [00:00<00:00, 13.95it/s]


In [None]:
# Show the per-topic summary table (topic id, count, name, representation, representative docs).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5059,-1_path_url_error_wallet,"[path, url, error, wallet, 12, transaction, ve...",[HttpRequestException Bad Gateway General Desc...
1,0,4834,0_url_path_10_error,"[url, path, 10, error, info, 03, 05, wallet, 1...",[Wallet Frozen coin relevant currently UTXOs C...
2,1,270,1_scroll_screen_page_url,"[scroll, screen, page, url, mobile, scrolling,...",[page fully scrollable landscape mode iOS Andr...
3,2,118,2_comment_post_reply_url,"[comment, post, reply, url, changes, fix, clic...",[Comment form cleared upvote Thanks opening is...
4,3,99,3_language_english_translation_url,"[language, english, translation, url, locale, ...",[Localization break due PATH getting cached wr...
...,...,...,...,...,...
74,73,12,73_picker_caret_singleuserpicker_disabled,"[picker, caret, singleuserpicker, disabled, wi...",[Fix typo issue across app Description PR fix ...
75,74,11,74_height_block_block height_01 13 inf,"[height, block, block height, 01 13 inf, 13 in...",[Latest block list broken new block arrives up...
76,75,11,75_information crash crashlytics_lot informati...,"[information crash crashlytics, lot informatio...",[ViewBlockExtraDetailsActivity java line 37 co...
77,76,11,76_java_processor_apache kafka_org,"[java, processor, apache kafka, org, apache, o...",[Reconnectivity issue fetching complete Block ...


In [None]:
# Plot the fitted topics with BERTopic's built-in topic visualisation.
topic_model.visualize_topics()

In [None]:
# Plot the topic hierarchy, reusing the `hierarchical_topics` result computed in an earlier cell.
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
# Plot BERTopic's heatmap visualisation for the fitted topics.
topic_model.visualize_heatmap()