In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
tqdm.pandas()

In [3]:
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 'JiraEcosystem', 'MariaDB', 'Mindville', 'MongoDB', 'Qt', 'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

In [4]:
def add_linked_issues_to_df(df):
    df['issues']=''
    for i in tqdm(range(len(df))):
        df["issues"].iloc[i] = str(sorted(set([df.iloc[i]['issue_id_1'], df.iloc[i]['issue_id_2']])))

In [5]:
def load_data(source):
    #Loading Issues
    filename = '../data/crawl/issues_'+source.lower()+'.csv'
    issue_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';', index_col=['issue_id'])

    #Loading Links
    filename = '../data/crawl/links_'+source.lower()+'.csv'
    link_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';').drop_duplicates()
    
    return issue_df, link_df

In [6]:
def clean_issues(issue_df):
    #Remove issues with empty titles
    issue_df = issue_df[~issue_df['title'].fillna(' ').str.isspace()]
    print(f'After filtering out issues with empty titles, {len(issue_df)} issues remain')
    
    return issue_df

In [109]:
def clean_links(link_df):
    add_linked_issues_to_df(link_df)
    
    # remove links with uncrawled and filtered issues
    link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
    print(f'Left with {len(link_df)} links after removing half-private links')
    x = len(link_df)
    
    # cleanup links
    # only allow one linktype per issue-pair
    link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
#     print(f'Left with {len(link_df)} links after removing issue-pairs with multiple links between them')

    # in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
    doublelinks = (link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
    valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])

    for i in tqdm(valid_doubles):
        if len(set(link_df[link_df['issues']==i]['linktype']))>1:
            link_df = link_df[link_df.issues != i]
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple link types between them')
    
    print(round((x-len(link_df))/x)*100,2)

    del_mult_lt = (x-len(link_df))/x
    
    #Multiple links complete remove
    link_df.drop_duplicates(subset=['issues'], inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple entries')

    link_df.reset_index(inplace=True, drop=True)
    
    return link_df, del_mult_lt

In [110]:
sum_dml = 0
for s in SOURCES:
    print(s.upper())
    issue_df, link_df = load_data(s)
    print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')
    
    issue_df = clean_issues(issue_df)
    
    link_df, del_mult_lt = clean_links(link_df)
    
    sum_dml+=del_mult_lt
    
    print(f'Cleaned {len(issue_df)} issues and {len(link_df)} links')
    
#     link_df.to_cs v('../data/crawl/clean_links_'+s.lower()+'.csv', encoding='utf-8', index=True, sep=';')
    print("----------------------------")
print(sum_dml/15)

APACHE
Loaded 970929 issues and 250890 links
After filtering out issues with empty titles, 970928 issues remain


100%|██████████| 250890/250890 [10:50<00:00, 385.58it/s]


Left with 250437 links after removing half-private links


100%|██████████| 2145/2145 [01:56<00:00, 18.39it/s]


Left with 243295 links after removing issue-pairs with multiple link types between them
0 2
Left with 242823 links after removing issue-pairs with multiple entries
Cleaned 970928 issues and 242823 links
----------------------------
HYPERLEDGER
Loaded 27914 issues and 16765 links
After filtering out issues with empty titles, 27914 issues remain


100%|██████████| 16765/16765 [00:09<00:00, 1725.98it/s]


Left with 16652 links after removing half-private links


100%|██████████| 140/140 [00:00<00:00, 213.13it/s]


Left with 16246 links after removing issue-pairs with multiple link types between them
0 2
Left with 16225 links after removing issue-pairs with multiple entries
Cleaned 27914 issues and 16225 links
----------------------------
INTELDAOS
Loaded 5557 issues and 3518 links
After filtering out issues with empty titles, 5557 issues remain


100%|██████████| 3518/3518 [00:01<00:00, 2042.29it/s]


Left with 3271 links after removing half-private links


100%|██████████| 13/13 [00:00<00:00, 486.73it/s]


Left with 3223 links after removing issue-pairs with multiple link types between them
0 2
Left with 3222 links after removing issue-pairs with multiple entries
Cleaned 5557 issues and 3222 links
----------------------------
JFROG
Loaded 14769 issues and 3289 links
After filtering out issues with empty titles, 14769 issues remain


100%|██████████| 3289/3289 [00:01<00:00, 2013.69it/s]


Left with 3280 links after removing half-private links


100%|██████████| 24/24 [00:00<00:00, 605.11it/s]


Left with 3210 links after removing issue-pairs with multiple link types between them
0 2
Left with 3206 links after removing issue-pairs with multiple entries
Cleaned 14769 issues and 3206 links
----------------------------
JIRA
Loaded 265343 issues and 108507 links
After filtering out issues with empty titles, 265341 issues remain


100%|██████████| 108507/108507 [02:21<00:00, 766.72it/s]


Left with 100795 links after removing half-private links


100%|██████████| 793/793 [00:16<00:00, 48.84it/s]


Left with 98390 links after removing issue-pairs with multiple link types between them
0 2
Left with 98122 links after removing issue-pairs with multiple entries
Cleaned 265341 issues and 98122 links
----------------------------
JIRAECOSYSTEM
Loaded 40602 issues and 11872 links
After filtering out issues with empty titles, 40601 issues remain


100%|██████████| 11872/11872 [00:06<00:00, 1858.53it/s]


Left with 11104 links after removing half-private links


100%|██████████| 63/63 [00:00<00:00, 326.50it/s]


Left with 10926 links after removing issue-pairs with multiple link types between them
0 2
Left with 10911 links after removing issue-pairs with multiple entries
Cleaned 40601 issues and 10911 links
----------------------------
MARIADB
Loaded 31229 issues and 14950 links
After filtering out issues with empty titles, 31229 issues remain


100%|██████████| 14950/14950 [00:08<00:00, 1834.52it/s]


Left with 14929 links after removing half-private links


100%|██████████| 98/98 [00:00<00:00, 284.36it/s]


Left with 14659 links after removing issue-pairs with multiple link types between them
0 2
Left with 14618 links after removing issue-pairs with multiple entries
Cleaned 31229 issues and 14618 links
----------------------------
MINDVILLE
Loaded 2134 issues and 46 links
After filtering out issues with empty titles, 2134 issues remain


100%|██████████| 46/46 [00:00<00:00, 1893.72it/s]


Left with 46 links after removing half-private links


0it [00:00, ?it/s]


Left with 44 links after removing issue-pairs with multiple link types between them
0 2
Left with 44 links after removing issue-pairs with multiple entries
Cleaned 2134 issues and 44 links
----------------------------
MONGODB
Loaded 90629 issues and 61877 links
After filtering out issues with empty titles, 90628 issues remain


100%|██████████| 61877/61877 [00:57<00:00, 1071.02it/s]


Left with 38482 links after removing half-private links


100%|██████████| 254/254 [00:02<00:00, 92.39it/s]


Left with 37581 links after removing issue-pairs with multiple link types between them
0 2
Left with 37545 links after removing issue-pairs with multiple entries
Cleaned 90628 issues and 37545 links
----------------------------
QT
Loaded 140237 issues and 37033 links
After filtering out issues with empty titles, 140237 issues remain


100%|██████████| 37033/37033 [00:31<00:00, 1181.22it/s]


Left with 36331 links after removing half-private links


100%|██████████| 150/150 [00:01<00:00, 99.55it/s] 


Left with 35877 links after removing issue-pairs with multiple link types between them
0 2
Left with 35855 links after removing issue-pairs with multiple entries
Cleaned 140237 issues and 35855 links
----------------------------
REDHAT
Loaded 315797 issues and 113853 links
After filtering out issues with empty titles, 315796 issues remain


100%|██████████| 113853/113853 [02:48<00:00, 674.63it/s]


Left with 109199 links after removing half-private links


100%|██████████| 1129/1129 [00:24<00:00, 46.65it/s]


Left with 106640 links after removing issue-pairs with multiple link types between them
0 2
Left with 106200 links after removing issue-pairs with multiple entries
Cleaned 315796 issues and 106200 links
----------------------------
SAKAI
Loaded 49204 issues and 19515 links
After filtering out issues with empty titles, 49204 issues remain


100%|██████████| 19515/19515 [00:11<00:00, 1724.92it/s]


Left with 19515 links after removing half-private links


100%|██████████| 135/135 [00:00<00:00, 220.30it/s]


Left with 19105 links after removing issue-pairs with multiple link types between them
0 2
Left with 19057 links after removing issue-pairs with multiple entries
Cleaned 49204 issues and 19057 links
----------------------------
SECONDLIFE
Loaded 1865 issues and 673 links
After filtering out issues with empty titles, 1865 issues remain


100%|██████████| 673/673 [00:00<00:00, 2156.30it/s]


Left with 673 links after removing half-private links


100%|██████████| 17/17 [00:00<00:00, 209.67it/s]


Left with 633 links after removing issue-pairs with multiple link types between them
0 2
Left with 630 links after removing issue-pairs with multiple entries
Cleaned 1865 issues and 630 links
----------------------------
SONATYPE
Loaded 77837 issues and 4719 links
After filtering out issues with empty titles, 77835 issues remain


100%|██████████| 4719/4719 [00:02<00:00, 1882.33it/s]


Left with 4356 links after removing half-private links


100%|██████████| 17/17 [00:00<00:00, 481.33it/s]


Left with 4290 links after removing issue-pairs with multiple link types between them
0 2
Left with 4289 links after removing issue-pairs with multiple entries
Cleaned 77835 issues and 4289 links
----------------------------
SPRING
Loaded 69100 issues and 14715 links
After filtering out issues with empty titles, 69100 issues remain


100%|██████████| 14715/14715 [00:08<00:00, 1807.96it/s]


Left with 14615 links after removing half-private links


100%|██████████| 52/52 [00:00<00:00, 271.31it/s]

Left with 14477 links after removing issue-pairs with multiple link types between them
0 2
Left with 14461 links after removing issue-pairs with multiple entries
Cleaned 69100 issues and 14461 links
----------------------------
0.023650177969120022





## Example of what exactly is cleaned

In [91]:
source = 'sakai'
issue_df, link_df = load_data(source)

print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')
add_linked_issues_to_df(link_df)

# remove links with uncrawled and filtered issues
cl_link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
print(f'Left with {len(cl_link_df)} links after removing half-private links')
x = len(cl_link_df)

Loaded 49204 issues and 19515 links


100%|██████████| 19515/19515 [00:10<00:00, 1885.79it/s]

Left with 19515 links after removing half-private links





In [92]:
cl_link_df.name.value_counts()

SAK-34955_SAK-35012        2
GRBK-965_GRBK-960          2
GRBK-967_GRBK-960          2
SAK-25889_SAK-25890        2
SAK-9933_SAK-23877         2
                          ..
SAK-33519_SAK-26675        1
SAK-33987_SAK-33521        1
SAK-33522_SAK-32426        1
SAK-33962_SAK-33524        1
EVALSYS-128_EVALSYS-116    1
Name: name, Length: 19397, dtype: int64

In [93]:
cl_link_df[cl_link_df['name']=='SAK-34955_SAK-35012']

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
9890,SAK-34955_SAK-35012,Relate,SAK-34955,SAK-35012,"['SAK-34955', 'SAK-35012']"
10243,SAK-34955_SAK-35012,Subtask,SAK-34955,SAK-35012,"['SAK-34955', 'SAK-35012']"


In [99]:
cl_link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple links between them if they were in the correct order')

Left with 19102 links after removing issue-pairs with multiple links between them if they were in the correct order


In [94]:
# in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
doublelinks = (cl_link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])
print(len(valid_doubles))

253


In [95]:
valid_doubles

{"['DASH-380', 'DASH-391']",
 "['EVALSYS-1233', 'EVALSYS-1242']",
 "['EVALSYS-1472', 'EVALSYS-1532']",
 "['EVALSYS-741', 'EVALSYS-769']",
 "['EVALSYS-764', 'EVALSYS-766']",
 "['EVALSYS-895', 'EVALSYS-918']",
 "['GM-152', 'GM-153']",
 "['GRBK-11', 'GRBK-225']",
 "['GRBK-1116', 'GRBK-1117']",
 "['GRBK-1191', 'GRBK-1210']",
 "['GRBK-1192', 'GRBK-701']",
 "['GRBK-1236', 'GRBK-859']",
 "['GRBK-1276', 'GRBK-1277']",
 "['GRBK-414', 'GRBK-834']",
 "['GRBK-594', 'GRBK-668']",
 "['GRBK-654', 'GRBK-713']",
 "['GRBK-752', 'GRBK-804']",
 "['GRBK-800', 'GRBK-803']",
 "['GRBK-868', 'GRBK-874']",
 "['GRBK-960', 'GRBK-965']",
 "['GRBK-960', 'GRBK-967']",
 "['PROD-232', 'PROD-271']",
 "['QNA-67', 'QNA-90']",
 "['QUALTRICS-40', 'QUALTRICS-91']",
 "['SAK-10305', 'SAK-11795']",
 "['SAK-10427', 'SAK-12485']",
 "['SAK-10427', 'SAK-12503']",
 "['SAK-10568', 'SAK-10579']",
 "['SAK-10826', 'SAK-8706']",
 "['SAK-10962', 'SAK-1357']",
 "['SAK-11008', 'SAK-9924']",
 "['SAK-1121', 'SAK-1159']",
 "['SAK-11413', 'SAK

In [96]:
cl_link_df[(cl_link_df['issue_id_1'] == 'EVALSYS-741') | (cl_link_df['issue_id_2'] == 'EVALSYS-741')]

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
34295,EVALSYS-741_EVALSYS-769,Duplicate,EVALSYS-741,EVALSYS-769,"['EVALSYS-741', 'EVALSYS-769']"
34296,EVALSYS-769_EVALSYS-741,Incorporate,EVALSYS-769,EVALSYS-741,"['EVALSYS-741', 'EVALSYS-769']"


In [97]:
for i in valid_doubles:
    if len(set(cl_link_df[cl_link_df['issues']==i]['linktype']))>1:
        cl_link_df = cl_link_df[cl_link_df.issues != i]
    else:
        print(i)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple link types between them')

['SAK-24490', 'SAK-24508']
['SAK-16279', 'SAK-17059']
['SAK-16421', 'SAK-16907']
['SAK-11008', 'SAK-9924']
['EVALSYS-1233', 'EVALSYS-1242']
['SAK-5992', 'SAK-5993']
['GRBK-1276', 'GRBK-1277']
['GRBK-868', 'GRBK-874']
['GRBK-11', 'GRBK-225']
['SAK-28119', 'SAK-29913']
['SAK-1894', 'SAK-4528']
['SAK-19340', 'SAK-19465']
['SAK-26306', 'SAK-26352']
['SAK-44081', 'SAK-44082']
['SAK-28659', 'SAK-39744']
['SAK-40979', 'SAK-43362']
['SAK-1426', 'SAK-807']
['SAK-33431', 'SAK-33554']
['SAK-1357', 'SAK-2015']
['SAK-23437', 'SAK-23529']
['SAK-15813', 'SAK-16433']
['SAK-31123', 'SAK-33995']
['GRBK-1236', 'GRBK-859']
['SAK-5225', 'SAK-5226']
['SAK-29400', 'SAK-34002']
['SAK-24865', 'SAK-25250']
['SAK-7271', 'SAK-7616']
['SAK-27942', 'SAK-29007']
['SAK-35953', 'SAK-36308']
['GRBK-752', 'GRBK-804']
['SAK-1426', 'SAK-754']
['SAK-33995', 'SAK-42356']
['SAK-8034', 'SAK-8162']
['SAK-36638', 'SAK-36844']
['SAK-16091', 'SAK-22700']
['SAK-5296', 'SAK-7311']
['SAK-36885', 'SAK-37167']
['SAK-27774', 'SAK-28526

In [98]:
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple link types between them')

Left with 19102 links after removing issue-pairs with multiple link types between them


In [100]:
x-len(cl_link_df)

413

In [101]:
(x-len(cl_link_df))/x

0.02116320778888035

In [102]:
cl_link_df.issues.value_counts()

['SAK-8034', 'SAK-8162']          2
['SAK-34169', 'SAK-36547']        2
['SAK-16091', 'SAK-22700']        2
['SAK-27774', 'SAK-28526']        2
['SAK-24490', 'SAK-24508']        2
                                 ..
['SAK-32867', 'SAK-33537']        1
['SAK-30153', 'SAK-33537']        1
['SAK-27900', 'SAK-33541']        1
['SAK-34380', 'SAK-34967']        1
['EVALSYS-116', 'EVALSYS-128']    1
Name: issues, Length: 19054, dtype: int64

In [103]:
cl_link_df[(cl_link_df['issue_id_1'] == 'SAK-35060') | (cl_link_df['issue_id_2'] == 'SAK-35060')]

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
7863,SAK-35060_SAK-37046,Relate,SAK-35060,SAK-37046,"['SAK-35060', 'SAK-37046']"
8417,SAK-35665_SAK-35060,Relate,SAK-35665,SAK-35060,"['SAK-35060', 'SAK-35665']"
9148,SAK-35205_SAK-35060,Relate,SAK-35205,SAK-35060,"['SAK-35060', 'SAK-35205']"
9149,SAK-35060_SAK-35205,Relate,SAK-35060,SAK-35205,"['SAK-35060', 'SAK-35205']"
9341,SAK-35136_SAK-35060,Relate,SAK-35136,SAK-35060,"['SAK-35060', 'SAK-35136']"
9665,SAK-35052_SAK-35060,Cloners,SAK-35052,SAK-35060,"['SAK-35052', 'SAK-35060']"
9666,SAK-35060_SAK-34968,Depend,SAK-35060,SAK-34968,"['SAK-34968', 'SAK-35060']"
9669,SAK-35060_SAK-34877,Relate,SAK-35060,SAK-34877,"['SAK-34877', 'SAK-35060']"


In [104]:
#Multiple links complete remove
cl_link_df.drop_duplicates(subset=['issues'], inplace=True)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with the same link type with multiple entries')

cl_link_df.reset_index(inplace=True, drop=True)

Left with 19054 links after removing issue-pairs with the same link type with multiple entries
