# Load Modules

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import gensim
from gensim.models import Word2Vec
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import seaborn as sns
import matplotlib.pyplot as plt

#Link type map
from tld.linktypes import fine_linktype_map

## Repositories and Loading Functions

In [4]:
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 
           'JiraEcosystem', 'MariaDB', 'Mindville', 'Mojang', 'MongoDB', 
           'Qt', 'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

In [3]:
def load_data(source):
    #Loading Issues
    filename = 'data/processed/issues_'+source.lower()+'.csv'
    issue_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';')

    #Loading Links
    filename = 'data/processed/links_'+source.lower()+'.csv'
    link_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';')
    
    return issue_df, link_df

In [5]:
# Load Data
link_dict = {}
issue_dict = {}

for s in SOURCES:
    print(s.upper())
    issues, links = load_data(s)
    link_dict[s] = links
    issue_dict[s] = issues


APACHE
HYPERLEDGER
INTELDAOS
JFROG
JIRA
JIRAECOSYSTEM
MARIADB
MINDVILLE
MOJANG
MONGODB
QT
REDHAT
SAKAI
SECONDLIFE
SONATYPE
SPRING


## Tables 1: Overview of Data

In [66]:
def print_overview(SOURCE):
    
    issues = issue_dict[SOURCE]
    links = link_dict[SOURCE]
    
    links['mappedtype'] = links['linktype'].map(fine_linktype_map)
        
    num_links = len(links)
    num_issues = len(issues)
    
    issue_set = set(issues.index.values)
    link_set = set(links['issue_id_1']).union(set(links['issue_id_2']))

    projects = issues[['projectid', 'issue_id']]
    
    temp = links.merge(projects, left_on='issue_id_1', right_on='issue_id')
    temp = temp.merge(projects, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
    
    temp['sameproject'] = (temp["projectid_1"]==temp["projectid_2"])

    cp_df = temp.mappedtype.value_counts().rename_axis('mappedtype').reset_index(name='counts')

    try:
        cross_ratio = round(temp['sameproject'].value_counts().loc[False]/len(temp)*100, 2)
    except:
        cross_ratio = 0

    cp_df['source'] = SOURCE
    cp_df['percentages'] = cp_df['counts'].apply(lambda x: round(x/len(links),3)*100)

    cp_df['crossproject'] = 0
    for i in temp.mappedtype.value_counts().index:
        link_temp = temp[temp['mappedtype'] == i]
        try:
            cross_ratio_temp = round(link_temp['sameproject'].value_counts().loc[False]/len(link_temp)*100, 2)
        except:
            cross_ratio_temp = 0
        link_index = cp_df.index[cp_df['mappedtype']==i]
        link_index = list(link_index)
        cp_df['crossproject'].iloc[link_index[0]] = cross_ratio_temp
        
    return num_issues, num_links, len(links.mappedtype.unique()), len(issues.projectid.unique()), round(len(link_set)/len(issue_set), 3), cross_ratio, cp_df


In [67]:
overview = pd.DataFrame(columns = ['Source', '#Issues', '#Links', '#Linktypes', '#Projects', '%IssuesWithLinks', "%CrossProject"])
link_dist = pd.DataFrame(columns=['source', 'linktype', 'counts'])
j = 0
for s in SOURCES:
    i, l, ltu, p, pi, cr, cp_df = print_overview(s)
    overview.loc[j] = [s, i, l, ltu, p, pi, cr]
    
    j+=1
    
    frames = [link_dist, cp_df]
    #concatenate dataframes
    link_dist = pd.concat(frames, sort=False)
    
overview.set_index('Source').to_csv('data/repo_overview.csv', encoding="UTF-8", sep=",")

  temp = temp.merge(projects, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[link_index[0]] = cross_ratio_temp
  temp = temp.merge(projects, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[link_index[0]] = cross_ratio_temp
  temp = temp.merge(projects, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

  temp = temp.merge(projects, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[link_index[0]] = cross_ratio_temp
  temp = temp.merge(projects, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[link_index[0]] = cross_ratio_temp
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[link_index[0]] = cross_ratio_temp
 

In [72]:
overview

Unnamed: 0,Source,#Issues,#Links,#Linktypes,#Projects,%IssuesWithLinks,%CrossProject
0,Apache,1014926,255767,16,646,0.285,5.23
1,Hyperledger,28146,16304,8,32,0.549,4.62
2,IntelDAOS,9474,2599,11,2,0.308,3.27
3,JFrog,15535,3229,10,10,0.286,8.24
4,Jira,274545,99819,16,30,0.467,43.42
5,JiraEcosystem,41866,11398,14,101,0.33,6.77
6,MariaDB,31229,14618,8,11,0.445,2.54
7,Mindville,2134,44,4,7,0.04,4.55
8,Mojang,420819,215527,5,8,0.537,5.43
9,MongoDB,137172,63821,14,27,0.452,19.09


## Preperations for Table 2: Frequency of Link Types

In [55]:
def print_overview_linktypes(SOURCE):  
    
    issues = issue_dict[SOURCE]
    links = link_dict[SOURCE]
    
    issue_set = set(issues.index.values)
    link_set = set(links['issue_id_1']).union(set(links['issue_id_2']))
    
    links['mappedtype'] = links['linktype'].map(fine_linktype_map)
            
    projects = issues[['projectid', 'issue_id']]

    temp = links.merge(projects, left_on = 'issue_id_1', right_on='issue_id')
    temp = temp.merge(projects, left_on = 'issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))

    temp['sameproject'] = (temp["projectid_1"]==temp["projectid_2"])

    cp_df = links.mappedtype.value_counts().rename_axis('mappedtype').reset_index(name='counts')
    cp_df['source'] = SOURCE
    cp_df['percentageOfLinks'] = cp_df['counts'].apply(lambda x: round(x/len(links),10)*100)
    cp_df['crossproject'] = 0
    for i in range(len(cp_df)):
        mappedType = cp_df['mappedtype'].iloc[i]
        link_temp = temp[temp['mappedtype'] == mappedType]
        
        try:
            cross_ratio_temp = round(link_temp['sameproject'].value_counts().loc[False]/len(link_temp)*100, 2)
        except:
            cross_ratio_temp = 0
        
        cp_df['crossproject'].iloc[i] = cross_ratio_temp
        
    return len(links.mappedtype.unique()), cp_df

In [56]:
overview_cl = pd.DataFrame(columns=['Source', '#Linktypes'])
link_dist_cl = pd.DataFrame(columns=['source', 'mappedtype', 'counts'])
j = 0
for s in SOURCES:
    ltu, cp_df = print_overview_linktypes(s)
    overview_cl.loc[j] = [s, ltu]
    
    j+=1
    
    frames = [link_dist_cl, cp_df]
    #concatenate dataframes
    link_dist_cl = pd.concat(frames, sort=False)

  temp = temp.merge(projects, left_on = 'issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[i] = cross_ratio_temp
  temp = temp.merge(projects, left_on = 'issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[i] = cross_ratio_temp
  temp = temp.merge(projects, left_on = 'issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

  temp = temp.merge(projects, left_on = 'issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[i] = cross_ratio_temp
  temp = temp.merge(projects, left_on = 'issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[i] = cross_ratio_temp
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cp_df['crossproject'].iloc[i] = cross_ratio_temp
  temp = temp.merge(projects, lef

In [57]:
link_dist_cl

Unnamed: 0,source,mappedtype,counts,percentageOfLinks,crossproject
0,Apache,Subtask,83783,32.757549,0.04
1,Apache,Relate,72342,28.284337,9.46
2,Apache,Duplicate,25925,10.136179,2.53
3,Apache,Block,15549,6.079361,12.35
4,Apache,Depend,13009,5.086270,14.51
...,...,...,...,...,...
2,Spring,Depend,1747,12.079934,28.91
3,Spring,Duplicate,1745,12.066104,3.67
4,Spring,Epic,1635,11.305490,0.00
5,Spring,Supercede,478,3.305214,3.77


### Get common link types and extract link type frequences per repository

In [58]:
common_lt = (link_dist_cl.mappedtype.value_counts()>=7).rename_axis('mappedtype').reset_index(name='valid')
common_lt_set = common_lt[common_lt['valid']==True]['mappedtype'].values
common_lt_set

array(['Relate', 'Duplicate', 'Subtask', 'Clone', 'Block', 'Depend',
       'Split', 'Epic', 'Incorporate', 'Cause', 'Bonfire Testing'],
      dtype=object)

In [60]:
common_lt_set = ['Relate', 'Duplicate', 'Subtask', 'Clone', 'Block', 'Depend', 'Epic', 'Split', 'Incorporate', 'Bonfire Testing', 'Cause']
commontypes = link_dist_cl[link_dist_cl['mappedtype'].isin(common_lt_set)]
commontypes.groupby(['mappedtype']).agg({'counts': ['sum'], 'percentageOfLinks': ['mean'], 'crossproject': ['mean']})

Unnamed: 0_level_0,counts,percentageOfLinks,crossproject
Unnamed: 0_level_1,sum,mean,mean
mappedtype,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Block,39567,7.040437,10.54
Bonfire Testing,1049,1.344641,3.87
Cause,10687,3.204718,14.441429
Clone,28307,4.006623,14.245714
Depend,38922,8.592757,16.754
Duplicate,268945,18.449904,3.829333
Epic,39892,14.517931,1.765
Incorporate,26291,4.430296,6.075
Relate,255226,34.81491,11.31625
Split,4061,1.132109,12.653333


## Table 2: Frequency of Link Types per Repository

In [61]:
#easier to copy for Latex
pivot = pd.pivot_table(commontypes, values='percentageOfLinks', index=['source'],
                    columns=['mappedtype'], aggfunc=np.sum)
pivot_new = pd.DataFrame()
pivot_new = pivot[common_lt_set]

pivot_freq = pivot_new

pivot_new['Coverage'] = pivot_new.sum(axis=1)

m = pivot_new.mean().values
print(m)
s = pivot_new.std().values
print(s)

# pivot_new.fillna(0, inplace = True)



pivot_new.loc['Mean'] = m
pivot_new.loc['Standard Deviation'] = s

pivot_new

[34.81491028 18.44990405 20.89187953  4.00662292  7.04043665  8.59275746
 14.51793117  1.13210925  4.43029637  1.3446408   3.20471755 96.20992807]
[14.30033725 21.54601733 13.76280025  5.35862391  8.08474342  7.19685009
 12.53953079  2.1463647   2.99236989  2.98841747  1.86400358  3.7068529 ]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pivot_new['Coverage'] = pivot_new.sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pivot_new.loc['Mean'] = m
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pivot_new.loc['Standard Deviation'] = s


mappedtype,Relate,Duplicate,Subtask,Clone,Block,Depend,Epic,Split,Incorporate,Bonfire Testing,Cause,Coverage
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Apache,28.284337,10.136179,32.757549,1.689428,6.079361,5.08627,4.889607,0.010166,4.078321,0.030887,1.214387,94.256491
Hyperledger,17.216634,3.91315,27.563788,2.925662,8.243376,,39.616045,0.47841,,0.042934,,100.0
IntelDAOS,39.322816,9.696037,10.542516,1.462101,25.548288,,,,,,,86.571758
JFrog,27.376897,19.913286,36.048312,0.836172,,7.928151,,,1.362651,,,93.465469
Jira,63.76241,21.724321,2.477484,2.871197,0.98879,0.165299,,0.181328,2.453441,0.227412,1.787235,96.638916
JiraEcosystem,22.925075,15.27461,20.038603,1.763467,5.948412,1.114231,24.249868,1.193192,1.789788,0.894894,3.860326,99.052465
MariaDB,51.060337,9.399371,6.095225,,12.990833,,6.44411,0.150499,7.894377,,5.965248,100.0
Mindville,43.181818,38.636364,,15.909091,2.272727,,,,,,,100.0
Mojang,9.4545,90.00682,,0.301586,0.110891,,,,,0.126202,,100.0
MongoDB,39.910061,13.454819,1.391392,0.31651,,22.860814,15.902289,1.178296,,,1.695367,96.709547


In [62]:
def round_nan(x, n):
    try:
        return str(round(x,1))
    except:
        return " "

In [63]:
pivot_new.round(1)


pivot_new['Relate'] = pivot_new['Relate'].apply(lambda x: round_nan(x,2))
pivot_new['Subtask'] = pivot_new['Subtask'].apply(lambda x: round_nan(x,2))
pivot_new['Duplicate'] = pivot_new['Duplicate'].apply(lambda x: round_nan(x,2))
pivot_new['Clone'] = pivot_new['Clone'].apply(lambda x: round_nan(x,2))
pivot_new['Depend'] = pivot_new['Depend'].apply(lambda x: round_nan(x,2))
pivot_new['Epic'] = pivot_new['Epic'].apply(lambda x: round_nan(x,2))
pivot_new['Split'] = pivot_new['Split'].apply(lambda x: round_nan(x,2))
pivot_new['Bonfire Testing'] = pivot_new['Bonfire Testing'].apply(lambda x: round_nan(x,2))
pivot_new['Cause'] = pivot_new['Cause'].apply(lambda x: round_nan(x,2))
pivot_new['Block'] = pivot_new['Block'].apply(lambda x: round_nan(x,2))
pivot_new['Incorporate'] = pivot_new['Incorporate'].apply(lambda x: round_nan(x,2))
pivot_new['Coverage'] = pivot_new['Coverage'].apply(lambda x: round_nan(x,2))

print(pivot_new.round(2).to_latex())

\begin{tabular}{lllllllllllll}
\toprule
mappedtype & Relate & Duplicate & Subtask & Clone & Block & Depend &  Epic & Split & Incorporate & Bonfire Testing & Cause & Coverage \\
source             &        &           &         &       &       &        &       &       &             &                 &       &          \\
\midrule
Apache             &   28.3 &      10.1 &    32.8 &   1.7 &   6.1 &    5.1 &   4.9 &   0.0 &         4.1 &             0.0 &   1.2 &     94.3 \\
Hyperledger        &   17.2 &       3.9 &    27.6 &   2.9 &   8.2 &    nan &  39.6 &   0.5 &         nan &             0.0 &   nan &    100.0 \\
IntelDAOS          &   39.3 &       9.7 &    10.5 &   1.5 &  25.5 &    nan &   nan &   nan &         nan &             nan &   nan &     86.6 \\
JFrog              &   27.4 &      19.9 &    36.0 &   0.8 &   nan &    7.9 &   nan &   nan &         1.4 &             nan &   nan &     93.5 \\
Jira               &   63.8 &      21.7 &     2.5 &   2.9 &   1.0 &    0.2 &   nan &   0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pivot_new['Relate'] = pivot_new['Relate'].apply(lambda x: round_nan(x,2))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pivot_new['Subtask'] = pivot_new['Subtask'].apply(lambda x: round_nan(x,2))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pivot_new['Duplicate'] = pivot_new['Duplicate'].apply(l

In [64]:
pivot_new

mappedtype,Relate,Duplicate,Subtask,Clone,Block,Depend,Epic,Split,Incorporate,Bonfire Testing,Cause,Coverage
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Apache,28.3,10.1,32.8,1.7,6.1,5.1,4.9,0.0,4.1,0.0,1.2,94.3
Hyperledger,17.2,3.9,27.6,2.9,8.2,,39.6,0.5,,0.0,,100.0
IntelDAOS,39.3,9.7,10.5,1.5,25.5,,,,,,,86.6
JFrog,27.4,19.9,36.0,0.8,,7.9,,,1.4,,,93.5
Jira,63.8,21.7,2.5,2.9,1.0,0.2,,0.2,2.5,0.2,1.8,96.6
JiraEcosystem,22.9,15.3,20.0,1.8,5.9,1.1,24.2,1.2,1.8,0.9,3.9,99.1
MariaDB,51.1,9.4,6.1,,13.0,,6.4,0.2,7.9,,6.0,100.0
Mindville,43.2,38.6,,15.9,2.3,,,,,,,100.0
Mojang,9.5,90.0,,0.3,0.1,,,,,0.1,,100.0
MongoDB,39.9,13.5,1.4,0.3,,22.9,15.9,1.2,,,1.7,96.7
