In [155]:
import pandas as pd
import numpy as np

import json
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [156]:
with open("issues_final.json", 'r') as clean_json_file:
    data = json.load(clean_json_file)
    issues_df=pd.DataFrame(data, columns=['id', 'title', 'body', 'comments', 'originalId', 'originalTitle', 'originalBody'])
    stop_words.add("flutter")

    stop_words_l=stop_words

    issues_df['title_cleaned']=issues_df.title.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z0-9]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )
    issues_df['body_cleaned']=issues_df.body.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z0-9]',' ',w).lower() for w in str(x).split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )
    issues_df['original_title_cleaned']=issues_df.originalTitle.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z0-9]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )
    issues_df['original_body_cleaned']=issues_df.originalBody.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z0-9]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_l) )

    print(issues_df.head(1))

       id                        title  \
0  102139  [A11y] Scaffold bottomSheet   

                                                body  \
0  ## Steps to Reproduce\r\n1. Add the following ...   

                                            comments  originalId  \
0  [Hi @838, Thanks for filing the issue.  Looks ...       74246   

                                     originalTitle  \
0  Proposal to add iOS VoiceOver Two-finger scrub    

                                        originalBody  \
0  \r\nThere is a VoiceOver gesture to navigate t...   

                 title_cleaned  \
0   a11y  scaffold bottomsheet   

                                        body_cleaned  \
0     steps reproduce 1  add following code count...   

                        original_title_cleaned  \
0  proposal add ios voiceover two finger scrub   

                               original_body_cleaned  
0  voiceover gesture navigate previous screen  tw...  


In [157]:
tfidfvectoriser=TfidfVectorizer(max_features=64)
#tfidfvectoriser.fit(issues_df.title_cleaned)
tfidf_vectors=tfidfvectoriser.fit_transform(issues_df.title_cleaned)


In [158]:
# tokenize and pad every document to make them of the same size
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


tokenizer=Tokenizer()
tokenizer.fit_on_texts(issues_df.title_cleaned)

tokenized_issues_title=tokenizer.texts_to_sequences(issues_df.title_cleaned)
tokenized_paded_issues_title=pad_sequences(tokenized_issues_title,maxlen=64,padding='post')

vocab_size=len(tokenizer.word_index)+1

print (tokenized_paded_issues_title[0])

[273 274 449   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


In [63]:
# loading pre-trained embeddings, each word is represented as a 300 dimensional vector
import gensim
W2V_PATH="/content/drive/MyDrive/Google Word2vec/GoogleNews-vectors-negative300.bin"
model_w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)

KeyboardInterrupt: ignored

In [159]:
# creating embedding matrix, every row is a vector representation from the vocabulary indexed by the tokenizer index. 
embedding_matrix=np.zeros((vocab_size,300))
for word,i in tokenizer.word_index.items():
    if word in model_w2v:
        embedding_matrix[i]=model_w2v[word]

# creating issue-title-word embeddings
issues_title_word_embeddings=np.zeros((len(tokenized_paded_issues_title),64,300))
for i in range(len(tokenized_paded_issues_title)):
    for j in range(len(tokenized_paded_issues_title[0])):
        issues_title_word_embeddings[i][j]=embedding_matrix[tokenized_paded_issues_title[i][j]]
issues_title_word_embeddings.shape

(387, 64, 300)

In [160]:
def most_similar(issue_id,similarity_matrix,matrix):
    
    similarCounter = 0
    if matrix=='Cosine Similarity':
        similar_ix=np.argsort(similarity_matrix[issue_id])
    elif matrix=='Euclidean Distance':
        similar_ix=np.argsort(similarity_matrix[issue_id])


    for ix in similar_ix:
        if ix >= issue_id:
            continue
        if (matrix == 'Euclidean Distance' and similarity_matrix[issue_id][ix] != 0 and similarity_matrix[issue_id][ix] < 1.1):
            similarCounter+=1

        if (matrix == 'Cosine Similarity' and similarity_matrix[issue_id][ix] != 0 and similarity_matrix[issue_id][ix] > 0.8):
            similarCounter+=1
    
    if similarCounter > 0:
        print (f'Issue: {issues_df.iloc[issue_id]["title_cleaned"]}')
        print('\n')
        print('Similar Issues:')
        
    for ix in similar_ix:
        if ix >= issue_id:
            continue
        if (matrix == 'Euclidean Distance' and similarity_matrix[issue_id][ix] != 0 and similarity_matrix[issue_id][ix] < 1.1):
          print (f'Issue: {issues_df.iloc[ix]["title_cleaned"]}')
          print (f'{matrix} : {similarity_matrix[issue_id][ix]}')
          print('\n')

        if (matrix == 'Cosine Similarity' and similarity_matrix[issue_id][ix] != 0 and similarity_matrix[issue_id][ix] > 0.8):
          print (f'Issue: {issues_df.iloc[ix]["title_cleaned"]}')
          print (f'{matrix} : {similarity_matrix[issue_id][ix]}')
          print('\n')


In [161]:
# calculating average of word vectors of a issue weighted by tf-idf
issues_title_embeddings=np.zeros((len(tokenized_paded_issues_title),300))
words=tfidfvectoriser.get_feature_names()
x = 0
for i in range(len(issues_title_word_embeddings)):
    for j in range(len(words)):
        issues_title_embeddings[i]+=embedding_matrix[tokenizer.word_index[words[j]]]*tfidf_vectors.toarray()[i][j]

pairwise_similarities=cosine_similarity(issues_title_embeddings)
pairwise_differences=euclidean_distances(issues_title_embeddings)
#most_similar(0,pairwise_similarities,'Cosine Similarity')
#most_similar(0,pairwise_differences,'Euclidean Distance')






In [162]:
for i in range(10, 300):
  most_similar(i,pairwise_differences,'Euclidean Distance')  
  most_similar(i,pairwise_similarities,'Cosine Similarity')

Issue: implementation found method getapplicationdocumentsdirectory channel plugins flutter io path provider


Similar Issues:
Issue:  webview flutter  navigation back surfaceandroidview bit jank
Cosine Similarity : 0.9999999999999998


Issue: imagefilter dilate imagefilter erode supported web 


Similar Issues:
Issue:  web  hot reload restart javascript files
Cosine Similarity : 0.9999999999999998


Issue:  web  debugger always stops web entrypoint  webonlyinitializeplatform 


Similar Issues:
Issue:  web  hot reload restart javascript files
Cosine Similarity : 0.9999999999999998


Issue: imagefilter dilate imagefilter erode supported web 
Cosine Similarity : 0.9999999999999998


Issue:  tool crash  stateerror  bad state  context running dart application 


Similar Issues:
Issue:  tool crash  stateerror  bad state  future already completed
Cosine Similarity : 0.8064340512576635


Issue:  tool crash  filesystemexception  failed decode data using encoding  utf 8   null


Similar Issues: