In [1]:
import pandas as pd
import numpy as np

import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/clk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the raw issue records. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
# Only the read happens inside the `with`, so the file is closed before any
# processing starts.
with open("issues_final.json", 'r', encoding='utf-8') as clean_json_file:
    data = json.load(clean_json_file)

issues_df = pd.DataFrame(
    data,
    columns=['id', 'title', 'body', 'comments', 'originalId', 'originalTitle', 'originalBody'],
)

stop_words_l = stopwords.words('english')
# Set membership is O(1); the list form was scanned once per token.
_stop_word_set = frozenset(stop_words_l)

_NON_ALNUM = re.compile(r'[^a-zA-Z0-9]')
_NON_ALPHA = re.compile(r'[^a-zA-Z]')


def clean_text(text):
    """Return a lower-cased, stop-word-filtered version of ``text``.

    The value is coerced with ``str()`` first (as the original ``body``
    cleaning did), so a missing value is cleaned as its string form instead
    of raising ``AttributeError``. The text is split on whitespace; a token is
    dropped when its letters-only, lower-cased form is an English stop word,
    and every kept token has each non-alphanumeric character replaced by a
    space before being lower-cased.

    NOTE(review): the stop-word check runs on the token *after* punctuation
    has been replaced by spaces, so a stop word with attached punctuation
    (e.g. ``"the,"`` -> ``"the "``) is not removed. Left as-is so the cleaned
    text is unchanged; confirm whether that is intended.
    """
    return " ".join(
        _NON_ALNUM.sub(' ', word).lower()
        for word in str(text).split()
        if _NON_ALPHA.sub(' ', word).lower() not in _stop_word_set
    )


# One cleaning rule applied to every text column (source column -> cleaned column).
for source_col, cleaned_col in (
    ('title', 'title_cleaned'),
    ('body', 'body_cleaned'),
    ('originalTitle', 'original_title_cleaned'),
    ('originalBody', 'original_body_cleaned'),
):
    issues_df[cleaned_col] = issues_df[source_col].apply(clean_text)

print(issues_df.head())

       id                                              title  \
0  102139                        [A11y] Scaffold bottomSheet   
1  102138  Platform.executable return null  in Flutter De...   
2  102135  When using the physical keyboard on the ipad, ...   
3  102076  [cross_file] file.name always returns empty st...   
4  102068  [tool_crash] NoSuchMethodError: NoSuchMethodEr...   

                                                body  \
0  ## Steps to Reproduce\r\n1. Add the following ...   
1  Platform.executable  always return  null  in F...   
2  ## Steps to Reproduce\r\n\r\n1. Execute `flutt...   
3  `XFile.fromData` implementation of `io` platfo...   
4  ## Command\r\n```\r\nflutter run --flavor ship...   

                                            comments  originalId  \
0  [Hi @838, Thanks for filing the issue.  Looks ...       74246   
1  [Hi @LaiZhou, Thanks for filing the issue. I c...      102138   
2  [Hi @hatano0x06, Thanks for filing the issue. ...       99652   
3  [Du

In [7]:
# Peek at the first five raw titles, before any cleaning.
issues_df.title.head(5)

0                          [A11y] Scaffold bottomSheet
1    Platform.executable return null  in Flutter De...
2    When using the physical keyboard on the ipad, ...
3    [cross_file] file.name always returns empty st...
4    [tool_crash] NoSuchMethodError: NoSuchMethodEr...
Name: title, dtype: object

In [4]:
# Peek at the same five titles after stop-word removal and normalisation.
issues_df.title_cleaned.head(5)

0                           a11y  scaffold bottomsheet
1    platform executable return null flutter deskto...
2    using physical keyboard ipad  erased character...
3     cross file  file name always returns empty st...
4     tool crash  nosuchmethoderror  nosuchmethoder...
Name: title_cleaned, dtype: object

In [3]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Sentence-embedding model used to vectorise the cleaned issue titles.
sbert_model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')

In [6]:
# Embed every cleaned title. encode() is documented for a string or a list of
# strings, so hand it a plain list: a pandas Series is looked up by index
# label, which only lines up with row position while the index is the default
# 0..n-1 range (it would not after filtering or re-indexing issues_df).
document_embeddings = sbert_model.encode(issues_df['title_cleaned'].tolist())

In [7]:
# Display the embedding array returned by sbert_model.encode for the cleaned titles.
document_embeddings

array([[-0.02009699,  0.50457424,  0.54086137, ...,  0.29038247,
         0.20009921, -0.36221623],
       [-0.34912118,  0.6034052 ,  0.34870645, ..., -0.45596185,
         0.44035032, -0.02409698],
       [-0.38362905,  0.96553   ,  0.6277289 , ..., -0.7671566 ,
        -0.6677468 ,  0.30477855],
       ...,
       [ 0.44378027,  1.1727988 ,  1.2052165 , ..., -0.18071601,
        -0.5482578 ,  0.1302988 ],
       [-0.33703682,  0.31050774,  2.145817  , ..., -1.1442635 ,
        -0.62470275, -0.12597424],
       [ 0.05698044, -0.60508037,  2.0144083 , ..., -0.74232686,
        -0.6255462 ,  0.2711879 ]], dtype=float32)

In [None]:
# All-pairs comparison of the title embeddings under two metrics:
# Euclidean distance (smaller = closer) and cosine similarity (larger = closer).
pairwise_differences = euclidean_distances(document_embeddings)
pairwise_similarities = cosine_similarity(document_embeddings)


In [None]:
def most_similar(issue_id, similarity_matrix, matrix, top_n=None, issues=None):
    """Print the issues closest to ``issue_id``, best match first.

    Parameters
    ----------
    issue_id : int
        Positional row of the query issue, used both to index
        ``similarity_matrix`` and to look up the title with ``.iloc``.
    similarity_matrix : 2-D array-like
        Pairwise scores; row ``issue_id`` holds the scores of every issue
        against the query.
    matrix : str
        ``'Cosine Similarity'`` (larger is closer, ranked descending) or
        ``'Euclidean Distance'`` (smaller is closer, ranked ascending). The
        string is also used as the label in the printed output.
    top_n : int or None, optional
        Print only the ``top_n`` best matches. ``None`` (default) prints every
        other issue, which is the original behaviour.
    issues : pandas.DataFrame or None, optional
        Frame with a ``'title_cleaned'`` column. ``None`` (default) falls back
        to the notebook-level ``issues_df``.

    Raises
    ------
    ValueError
        If ``matrix`` is not one of the two supported names, or ``top_n`` is
        negative. Previously an unknown ``matrix`` printed a partial header
        and then failed with ``UnboundLocalError``.

    Notes
    -----
    Earlier experiments, formerly kept as commented-out code, printed only
    matches with cosine similarity > 0.35 or Euclidean distance < 1.1 (and a
    non-zero score). No such threshold is applied here.
    """
    if matrix == 'Cosine Similarity':
        best_first_is_descending = True
    elif matrix == 'Euclidean Distance':
        best_first_is_descending = False
    else:
        raise ValueError(
            f"Unknown matrix {matrix!r}; expected 'Cosine Similarity' or 'Euclidean Distance'"
        )
    if top_n is not None and top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    if issues is None:
        # Backward-compatible fallback to the notebook-level frame.
        issues = issues_df
    titles = issues['title_cleaned']

    scores = similarity_matrix[issue_id]
    ranking = np.argsort(scores)
    if best_first_is_descending:
        ranking = ranking[::-1]
    # The query issue always matches itself perfectly; leave it out.
    neighbours = [ix for ix in ranking if ix != issue_id]
    if top_n is not None:
        neighbours = neighbours[:top_n]

    print("Issue: ", issue_id, "*" * 20)
    print(f'Issue title: {titles.iloc[issue_id]}')
    print('\n')
    print('Similar Issues:')
    for ix in neighbours:
        print('\n')
        print(f'Document: {titles.iloc[ix]}')
        print(f'{matrix} : {scores[ix]}')


In [None]:
# Report the ranked neighbours of every issue under both metrics.
for issue_ix in range(len(issues_df)):
    most_similar(issue_ix, pairwise_similarities, 'Cosine Similarity')
    most_similar(issue_ix, pairwise_differences, 'Euclidean Distance')

NameError: name 'issues_df' is not defined