# Load Modules

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import gensim
from gensim.models import Word2Vec
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import matplotlib.pyplot as plt

#Link type map
from tld.linktypes import fine_linktype_map

## Repositories and Loading Functions

In [3]:
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 
           'JiraEcosystem', 'MariaDB', 'Mojang', 'MongoDB', 
           'Qt', 'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

# 'Mindville' too small

link_text_dict = {}
countvect_dict = {}
tfidfvect_dict = {}

In [56]:
def load_data(source):
    #Loading Issues
    filename = 'data/processed/issues_'+source.lower()+'.csv'
    issue_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';')

    #Loading Links
    filename = 'data/processed/links_plus_'+source.lower()+'.csv'
    link_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';').drop_duplicates()
    
    link_df['mappedtype'] = link_df['linktype'].map(fine_linktype_map)
    
    return issue_df, link_df

In [57]:
def get_tf_idf_sim(x, y):
    x_tfidf = tfidf_vectorizer.transform(x)
    y_tfidf = tfidf_vectorizer.transform(y)
    cosim = cosine_similarity(x_tfidf, y_tfidf).flatten()
    return float(cosim)

## Load Data and Calculate Cosine Similarities

In [None]:
for s in SOURCES:
    print(s.upper())
    issue_df, link_df = load_data(s)
    
    issue_df['title'].fillna(' ', inplace=True)
    issue_df['description'].fillna(' ', inplace=True)
    
    linked_issues = set(link_df['issue_id_1']).union(set(link_df['issue_id_2']))
    linked_issue_df = issue_df[issue_df['issue_id'].isin(linked_issues)]
    
    linked_issue_df['text'] = linked_issue_df['title']+ " " + linked_issue_df['description']
    
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=0.05, ngram_range=(1,2))
    tfidf_wm = tfidf_vectorizer.fit_transform(list(linked_issue_df['text'].values))
    tfidf_tokens = tfidf_vectorizer.get_feature_names()
    df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),index = [linked_issue_df['issue_id']],columns = tfidf_tokens)
    tfidfvect_dict[s] = df_tfidfvect
    
    countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english', min_df=0.05)
    count_wm = countvectorizer.fit_transform(list(linked_issue_df['text'].values))
    count_tokens = countvectorizer.get_feature_names()
    df_countvect = pd.DataFrame(data = count_wm.toarray(),index = [linked_issue_df['issue_id']],columns = count_tokens)
    countvect_dict[s] = df_countvect
    
    issue_text_data = linked_issue_df[['text', 'issue_id']]
    
    link_text_df = link_df.merge(issue_text_data, left_on='issue_id_1', right_on='issue_id')
    link_text_df = link_text_df.merge(issue_text_data, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
    
    cosims = []

    for i in tqdm(link_text_df.index):
        cosims.append(get_tf_idf_sim([link_text_df['text_1'].iloc[i]], [link_text_df['text_2'].iloc[i]]))

    link_text_df['cosim'] = cosims
                  
    link_text_dict[s] = link_text_df 

APACHE


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linked_issue_df['text'] = linked_issue_df['title']+ " " + linked_issue_df['description']
  link_text_df = link_text_df.merge(issue_text_data, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
 86%|████████▋ | 231290/267400 [1:28:01<13:47, 43.62it/s]  

In [None]:
relevant_types = ['Relate', 'Duplicate', 'Subtask', 'Depend', 'Clone', 'Incorporate', 'Epic', 'Block', 'Cause', 'Non-Link']

median = []
mean = []
std = []

for s in SOURCES:
    temp_df = link_text_dict[s]
    
    temp_median = [s]
    temp_mean = [s]
    temp_std = [s]
    
    for i in relevant_types:
        if i in set(temp_df['mappedtype'].unique()):
            temp_link_df = temp_df[temp_df['mappedtype']==i]
            temp_median.append(temp_link_df['cosim'].median())
            temp_mean.append(temp_link_df['cosim'].mean())
            temp_std.append(temp_link_df['cosim'].std())
        else:
            temp_median.append("not in repo")
            temp_mean.append("not in repo")
            temp_std.append("not in repo")
            
    median.append(temp_median)
    mean.append(temp_mean)
    std.append(temp_std)


In [23]:
cols = ['Repo', 'Relate', 'Duplicate', 'Subtask', 'Depend', 'Clone', 'Incorporate', 'Epic', 'Block', 'Cause', 'Non-Link']
median_df = pd.DataFrame(median, columns = cols)
mean_df = pd.DataFrame(mean, columns = cols)
std_df = pd.DataFrame(std, columns = cols)

In [24]:
def special_round(x, i):
    try:
        if i == 0:
            y = int(round(x, 0))
        else:
            y = round(x, i)
    except: 
        y = None
    return y

## Table 5 for textual cosine similarity per link type

In [25]:
cossim_df = median_df.set_index('Repo')
cossim_df.round(decimals = 2)

for i in cossim_df.columns:
    cossim_df[i] = cossim_df[i].apply(lambda x: special_round(x,2))
    
print(cossim_df.to_latex())

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  Relate &  Duplicate &  Subtask &  Depend &  Clone &  Incorporate &  Epic &  Block &  Cause &  Non-Link \\
Repo          &         &            &          &         &        &              &       &        &        &           \\
\midrule
Apache        &    0.14 &       0.29 &     0.00 &    0.08 &   0.90 &         0.07 &  0.00 &   0.03 &   0.09 &      0.00 \\
Hyperledger   &    0.33 &       0.44 &     0.21 &     NaN &   0.95 &          NaN &  0.06 &   0.31 &    NaN &      0.00 \\
IntelDAOS     &    0.18 &       0.29 &     0.07 &     NaN &   0.99 &          NaN &   NaN &   0.11 &    NaN &      0.00 \\
JFrog         &    0.37 &       0.38 &     0.00 &    0.28 &   1.00 &         0.27 &   NaN &    NaN &    NaN &      0.02 \\
Jira          &    0.89 &       0.44 &     0.06 &    0.49 &   0.92 &         0.40 &   NaN &   0.29 &   0.34 &      0.03 \\
JiraEcosystem &    0.25 &       0.48 &     0.00 &    0.15 &   0.94 &         0.13 &  0.00 &   0.13 &   

  print(cossim_df.to_latex())


In [32]:
print('MEAN.')
print(round(cossim_df.mean(axis = 0, skipna = True),2))
print('STANDARD DEV.')
print(round(cossim_df.std(axis = 0, skipna = True),2))

MEAN.
Relate         0.28
Duplicate      0.35
Subtask        0.09
Depend         0.20
Clone          0.83
Incorporate    0.15
Epic           0.05
Block          0.17
Cause          0.19
Non-Link       0.01
dtype: float64
STANDARD DEV.
Relate         0.18
Duplicate      0.08
Subtask        0.09
Depend         0.12
Clone          0.24
Incorporate    0.14
Epic           0.12
Block          0.09
Cause          0.09
Non-Link       0.01
dtype: float64


## Preparations for text length and differences in length tables

In [29]:
common_link_text_dict = {}
for s in SOURCES:
    linktypes = (link_text_dict[s].mappedtype.value_counts()>=len(link_text_dict[s])*0.01).rename_axis('mappedtype').reset_index(name='valid')
    valid_types = set(linktypes[linktypes['valid']==True]['mappedtype'])

    all_data = link_text_dict[s][(link_text_dict[s]["mappedtype"].isin(valid_types))]
    
    common_link_text_dict[s] = all_data

In [31]:
link_issue_df_dict = {}

for s in SOURCES:
    issue_df, link_df = load_data(s)
    
    issue_df['title'].fillna(' ', inplace=True)
    issue_df['description'].fillna(' ', inplace=True)
    
    linked_issues = set(link_df['issue_id_1']).union(set(link_df['issue_id_2']))
    linked_issue_df = issue_df[issue_df['issue_id'].isin(linked_issues)]
    
    titles = linked_issue_df['title']
    title_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
    titles = linked_issue_df['description']
    desc_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
    
    linked_issue_df['title_len'] = title_lengths
    linked_issue_df['desc_len'] = desc_lengths
    
    issue_text_data = linked_issue_df[['title_len', 'desc_len', 'issue_id']]
    
    link_issue_df_dict[s] = linked_issue_df
    
    link_text_df = common_link_text_dict[s]
    
    link_text_df = link_df.merge(issue_text_data, left_on='issue_id_1', right_on='issue_id')
    link_text_df = link_text_df.merge(issue_text_data, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
    
    link_text_df['diff_title'] =  link_text_df['title_len_1'] - link_text_df['title_len_2']
    link_text_df['diff_desc'] =  link_text_df['desc_len_1'] - link_text_df['desc_len_2']
    
    common_link_text_dict[s] = link_text_df

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  title_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  desc_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linked_issue_df['title_len'] = title_lengths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linked_issue_df['desc_len'] = desc_lengths
 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  title_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  desc_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linked_issue_df['title_len'] = title_lengths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linked_issue_df['desc_len'] = desc_lengths
 

  link_text_df = link_text_df.merge(issue_text_data, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  title_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  desc_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linked_issue_df['title_len'] = title_lengths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pa

  link_text_df = link_text_df.merge(issue_text_data, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  title_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  desc_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  linked_issue_df['title_len'] = title_lengths
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pa

In [41]:
for s in SOURCES:
    common_link_text_dict[s]['mappedtype'] = common_link_text_dict[s]['linktype'].map(fine_linktype_map)

In [42]:
for s in SOURCES:
    temp_df = common_link_text_dict[s]
    temp_df['diff_text'] = temp_df['diff_title'] + temp_df['diff_desc']
    temp_df['diff_text'] = temp_df['diff_text'].apply(lambda x: abs(x))
    temp_df['diff_title'] = temp_df['diff_title'].apply(lambda x: abs(x))
    temp_df['diff_desc'] = temp_df['diff_desc'].apply(lambda x: abs(x))
    
    common_link_text_dict[s] = temp_df

In [43]:
for s in SOURCES:
    
    temp_df = common_link_text_dict[s]
    
    temp_df['title_len'] = temp_df['title_len_1']+temp_df['title_len_2']
    temp_df['desc_len'] = temp_df['desc_len_1']+temp_df['desc_len_2']
    
    temp_df['text_len'] = temp_df['title_len']+temp_df['desc_len']
    
    
    
    common_link_text_dict[s] = temp_df

## Table 6 Text lengths per link type

In [44]:
relevant_types = ['Relate', 'Duplicate', 'Subtask', 'Depend', 'Clone', 'Incorporate', 'Epic', 'Block', 'Cause', 'Non-Link']

median = []
mean = []
std = []

for s in SOURCES:
    temp_df = common_link_text_dict[s]
    
    temp_median = [s]
    temp_mean = [s]
    temp_std = [s]
    
    for i in relevant_types:
        if i in set(temp_df['mappedtype'].unique()):
            temp_link_df = temp_df[temp_df['mappedtype']==i]
            temp_median.append(temp_link_df['text_len'].median())
            temp_mean.append(temp_link_df['text_len'].mean())
            temp_std.append(temp_link_df['text_len'].std())
        else:
            temp_median.append("not in repo")
            temp_mean.append("not in repo")
            temp_std.append("not in repo")
            
    median.append(temp_median)
    mean.append(temp_mean)
    std.append(temp_std)


In [45]:
cols = ['Repo', 'Relate', 'Duplicate', 'Subtask', 'Depend', 'Clone', 'Incorporate', 'Epic', 'Block', 'Cause', 'Non-Link']

median_df = pd.DataFrame(median, columns = cols)
mean_df = pd.DataFrame(mean, columns = cols)
std_df = pd.DataFrame(std, columns = cols)

In [46]:
textlen_df = median_df.set_index('Repo')

for i in textlen_df.columns:
    textlen_df[i] = textlen_df[i].apply(lambda x: special_round(x, 0))
    
print(textlen_df.to_latex())

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  Relate &  Duplicate &  Subtask &  Depend &  Clone &  Incorporate &   Epic &  Block &  Cause &  Non-Link \\
Repo          &         &            &          &         &        &              &        &        &        &           \\
\midrule
Apache        &     157 &      159.0 &     87.0 &   114.0 &   98.0 &        120.0 &   93.0 &  109.0 &  172.0 &       133 \\
Hyperledger   &     176 &      159.0 &     79.0 &     NaN &   92.0 &          NaN &   96.0 &  131.0 &    NaN &       105 \\
IntelDAOS     &     268 &      398.0 &     90.0 &     NaN &  123.0 &          NaN &    NaN &  297.0 &    NaN &       142 \\
JFrog         &     152 &      172.0 &      9.0 &    70.0 &  260.0 &        195.0 &    NaN &    NaN &    NaN &       131 \\
Jira          &     192 &      182.0 &     76.0 &   158.0 &  180.0 &        183.0 &    NaN &  161.0 &  244.0 &       168 \\
JiraEcosystem &     119 &      104.0 &     36.0 &   105.0 &  108.0 &         93.0 &   51.0 &   8

  print(textlen_df.to_latex())


In [54]:
print('MEAN.')
print(round(textlen_df.mean(axis = 0, skipna = True),0))
print('STANDARD DEV.')
print(round(textlen_df.std(axis = 0, skipna = True),0))

MEAN.
Relate         187.0
Duplicate      196.0
Subtask         69.0
Depend         113.0
Clone          132.0
Incorporate    138.0
Epic            80.0
Block          176.0
Cause          215.0
Non-Link       139.0
dtype: float64
STANDARD DEV.
Relate          75.0
Duplicate       96.0
Subtask         25.0
Depend          25.0
Clone           44.0
Incorporate     44.0
Epic            23.0
Block           86.0
Cause          104.0
Non-Link        65.0
dtype: float64


## Table 7 Text length differences per link type

In [49]:
relevant_types = ['Relate', 'Duplicate', 'Subtask', 'Depend', 'Clone', 'Incorporate', 'Epic', 'Block', 'Cause', 'Non-Link']

median = []
mean = []
std = []

for s in SOURCES:
    temp_df = common_link_text_dict[s]
    
    temp_median = [s]
    temp_mean = [s]
    temp_std = [s]
    
    for i in relevant_types:
        if i in set(temp_df['mappedtype'].unique()):
            temp_link_df = temp_df[temp_df['mappedtype']==i]
            temp_median.append(temp_link_df['diff_text'].median())
            temp_mean.append(temp_link_df['diff_text'].mean())
            temp_std.append(temp_link_df['diff_text'].std())
        else:
            temp_median.append("not in repo")
            temp_mean.append("not in repo")
            temp_std.append("not in repo")
            
    median.append(temp_median)
    mean.append(temp_mean)
    std.append(temp_std)


In [50]:
cols = ['Repo', 'Relate', 'Duplicate', 'Subtask', 'Depend', 'Clone', 'Incorporate', 'Epic', 'Block', 'Cause', 'Non-Link']

median_df = pd.DataFrame(median, columns = cols)
mean_df = pd.DataFrame(mean, columns = cols)
std_df = pd.DataFrame(std, columns = cols)

In [51]:
textdiff_df = median_df.set_index('Repo')

for i in textdiff_df.columns:
    textdiff_df[i] = textdiff_df[i].apply(lambda x: special_round(x, 0))
    
print(textdiff_df.to_latex())

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &  Relate &  Duplicate &  Subtask &  Depend &  Clone &  Incorporate &  Epic &  Block &  Cause &  Non-Link \\
Repo          &         &            &          &         &        &              &       &        &        &           \\
\midrule
Apache        &      50 &       43.0 &     34.0 &    37.0 &    3.0 &         41.0 &  44.0 &   36.0 &   62.0 &        52 \\
Hyperledger   &      58 &       50.0 &     35.0 &     NaN &    2.0 &          NaN &  48.0 &   46.0 &    NaN &        46 \\
IntelDAOS     &      91 &      146.0 &     30.0 &     NaN &    2.0 &          NaN &   NaN &  147.0 &    NaN &        59 \\
JFrog         &      44 &       49.0 &      5.0 &    18.0 &    2.0 &         61.0 &   NaN &    NaN &    NaN &        60 \\
Jira          &       0 &       43.0 &     33.0 &    39.0 &    8.0 &         49.0 &   NaN &   43.0 &   68.0 &        54 \\
JiraEcosystem &      32 &       24.0 &     12.0 &    36.0 &    4.0 &         31.0 &  28.0 &   24.0 &   

  print(textdiff_df.to_latex())


In [55]:
print('MEAN.')
print(round(textdiff_df.mean(axis = 0, skipna = True),0))
print('STANDARD DEV.')
print(round(textdiff_df.std(axis = 0, skipna = True),0))

MEAN.
Relate         54.0
Duplicate      57.0
Subtask        26.0
Depend         37.0
Clone           9.0
Incorporate    54.0
Epic           38.0
Block          66.0
Cause          75.0
Non-Link       55.0
dtype: float64
STANDARD DEV.
Relate         31.0
Duplicate      34.0
Subtask        10.0
Depend          9.0
Clone          13.0
Incorporate    21.0
Epic           10.0
Block          51.0
Cause          41.0
Non-Link       31.0
dtype: float64
