In [1]:
import pandas as pd
from tqdm import tqdm
import os
if not os.path.exists('data/processed/'):
    os.makedirs('data/processed/')

In [2]:
tqdm.pandas()

In [3]:
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 
           'JiraEcosystem', 'MariaDB', 'Mojang', 'MongoDB', 
           'Qt', 'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

In [5]:
def load_data(source):
    #Loading Issues
    filename = '../data/raw/issues_'+source.lower()+'.csv'
    issue_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';', index_col=['issue_id'])

    #Loading Links
    filename = '../data/raw/links_'+source.lower()+'.csv'
    link_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';').drop_duplicates()
    
    return issue_df, link_df

In [4]:
# this function is to identify doubled issue pairs
def add_linked_issues_to_df(df):
    df['issues']=''
    for i in tqdm(range(len(df))):
        df["issues"].iloc[i] = str(sorted(set([df.iloc[i]['issue_id_1'], df.iloc[i]['issue_id_2']])))

In [6]:
def clean_issues(issue_df):
    #Remove issues with empty titles
    issue_df = issue_df[~issue_df['title'].fillna(' ').str.isspace()]
    print(f'After filtering out issues with empty titles, {len(issue_df)} issues remain')
    
    return issue_df

In [7]:
def clean_links(link_df):    
    # remove links with uncrawled and filtered issues
    link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
    print(f'Left with {len(link_df)} links after removing half-private links')
    
    # cleanup links
    # only allow one linktype per issue-pair
    link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple links between them')

    # in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
    doublelinks = (link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
    valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])

    for i in tqdm(valid_doubles):
        if len(set(link_df[link_df['issues']==i]['linktype']))>1:
            link_df = link_df[link_df.issues != i]
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple link types between them')

    #Multiple links complete remove
    link_df.drop_duplicates(subset=['issues'], inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple entries')

    link_df.reset_index(inplace=True, drop=True)
    
    return link_df

In [8]:
# Creatres non-links from randomly selected issues.
import random
def create_non_links(issue_df, link_df, linked_pairs):
   
    non_link_issues = set(issue_df[~issue_df['resolution'].isin(['Duplicate'])].index)
    no_issues = len(non_link_issues)
    
    cols = ['name', 'linktype', 'issue_id_1', 'issue_id_2', 'issues']
    non_links_df = pd.DataFrame(columns = cols)

    for i in tqdm(range(int(link_df.linktype.value_counts().mean()))):
        sample = random.sample(non_link_issues, 2)
        if not set([sample[0], sample[1]]) in (linked_pairs):
            name = str(sample[0]) + "_" + str(sample[1])
            non_links_df = non_links_df.append({
                "name": name,
                "linktype": "Non-Link",
                "issue_id_1":  sample[0],
                "issue_id_2":  sample[1],
                "issues": str(sorted(set([sample[0], sample[1]])))
            }, ignore_index=True)
        else:
            print('hi')
        
    link_plus_df = link_df.append(non_links_df, ignore_index=True)
    
    return link_plus_df

In [18]:
for s in SOURCES:
    print(s.upper())
    issue_df, link_df = load_data(s)
    issue_df.to_csv('data/processed/issues_'+s.lower()+'.csv', encoding='utf-8', index=True, sep=';')
    
    print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')
    
    issue_df = clean_issues(issue_df)
    
    add_linked_issues_to_df(link_df)
    
    linked_pairs = set(link_df['issues'])
    
    link_df = clean_links(link_df)
    link_plus_df = create_non_links(issue_df, link_df, linked_pairs)
    
    print(f'Cleaned {len(issue_df)} issues and {len(link_df)} links')
    
    link_df.to_csv('data/processed/links_'+s.lower()+'.csv', encoding='utf-8', index=True, sep=';')
    
    print(link_df.linktype.value_counts())
    
    print(f'Created link and non_links of size {len(link_plus_df)}')
    
    link_plus_df.to_csv('data/processed/links_plus_'+s.lower()+'.csv', encoding='utf-8', index=True, sep=';')
    print("----------------------------")
    print()

APACHE
Loaded 1014926 issues and 264107 links


  0%|          | 172/264107 [00:00<05:10, 850.84it/s]

After filtering out issues with empty titles, 1014925 issues remain


100%|██████████| 264107/264107 [06:09<00:00, 715.57it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
  0%|          | 0/2218 [00:00<?, ?it/s]

Left with 263647 links after removing half-private links
Left with 259717 links after removing issue-pairs with multiple links between them


100%|██████████| 2218/2218 [01:21<00:00, 27.17it/s]


Left with 256253 links after removing issue-pairs with multiple link types between them
Left with 255767 links after removing issue-pairs with multiple entries


100%|██████████| 11625/11625 [09:15<00:00, 20.92it/s]


Cleaned 1014925 issues and 255767 links
Subtask             83783
Reference           68973
Duplicate           25925
Blocker             14377
Epic-Relation       12506
dependent           12498
Incorporates         6923
Regression           4350
Cloners              4321
Required             3620
Container            3508
Related              3369
Supercedes           3248
Problem/Incident     3106
Child-Issue          2508
Blocked              1172
Completes             914
Dependent             399
Dependency            112
Testing                79
Parent Feature         50
Issue split            26
Name: linktype, dtype: int64
Created link and non_links of size 267392
----------------------------

HYPERLEDGER


  2%|▏         | 412/16846 [00:00<00:07, 2058.50it/s]

Loaded 28146 issues and 16846 links
After filtering out issues with empty titles, 28146 issues remain


100%|██████████| 16846/16846 [00:08<00:00, 1985.63it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
 21%|██        | 29/141 [00:00<00:00, 283.81it/s]

Left with 16733 links after removing half-private links
Left with 16565 links after removing issue-pairs with multiple links between them


100%|██████████| 141/141 [00:00<00:00, 352.72it/s]
  5%|▍         | 92/2038 [00:00<00:04, 453.52it/s]

Left with 16325 links after removing issue-pairs with multiple link types between them
Left with 16304 links after removing issue-pairs with multiple entries


100%|██████████| 2038/2038 [00:04<00:00, 439.83it/s]


Cleaned 28146 issues and 16304 links
Epic-Relation      6459
Subtask            4494
Relates            2807
Blocks             1344
Duplicate           638
Cloners             477
Issue split          78
Git Code Review       7
Name: linktype, dtype: int64
Created link and non_links of size 18342
----------------------------

INTELDAOS


 14%|█▍        | 368/2667 [00:00<00:01, 1889.18it/s]

Loaded 9474 issues and 2667 links
After filtering out issues with empty titles, 9474 issues remain


100%|██████████| 2667/2667 [00:01<00:00, 2175.34it/s]
100%|██████████| 24/24 [00:00<00:00, 1147.92it/s]
 56%|█████▋    | 112/199 [00:00<00:00, 554.69it/s]

Left with 2667 links after removing half-private links
Left with 2641 links after removing issue-pairs with multiple links between them
Left with 2605 links after removing issue-pairs with multiple link types between them
Left with 2599 links after removing issue-pairs with multiple entries


100%|██████████| 199/199 [00:00<00:00, 549.29it/s]


Cleaned 9474 issues and 2599 links
Related                 1016
Blocker                  663
Subtask                  274
Duplicate                252
Cloners (migrated)       176
Gantt End to Start       100
Verify                    39
Cloners                   38
Implement                 17
Gantt End to End          16
Relates                    6
Blocks                     1
Gantt Start to Start       1
Name: linktype, dtype: int64
Created link and non_links of size 2798
----------------------------

JFROG


 10%|█         | 336/3303 [00:00<00:01, 1700.58it/s]

Loaded 15535 issues and 3303 links
After filtering out issues with empty titles, 15535 issues remain


100%|██████████| 3303/3303 [00:01<00:00, 2152.88it/s]
100%|██████████| 24/24 [00:00<00:00, 1067.92it/s]
 37%|███▋      | 108/293 [00:00<00:00, 423.20it/s]

Left with 3303 links after removing half-private links
Left with 3273 links after removing issue-pairs with multiple links between them
Left with 3233 links after removing issue-pairs with multiple link types between them
Left with 3229 links after removing issue-pairs with multiple entries


100%|██████████| 293/293 [00:00<00:00, 488.28it/s]


Cleaned 15535 issues and 3229 links
Subtask                             1164
Relationship                         884
Duplicate                            643
Dependency                           256
Trigger                              202
Contains(WBSGantt)                    44
Cloners                               27
Gantt End to End                       4
Gantt End to Start                     3
Gantt Start to Start                   1
Finish-to-Finish link (WBSGantt)       1
Name: linktype, dtype: int64
Created link and non_links of size 3522
----------------------------

JIRA


  0%|          | 110/110507 [00:00<01:40, 1095.02it/s]

Loaded 274545 issues and 110507 links
After filtering out issues with empty titles, 274543 issues remain


100%|██████████| 110507/110507 [01:31<00:00, 1211.64it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
  1%|          | 7/819 [00:00<00:12, 64.28it/s]

Left with 102573 links after removing half-private links
Left with 101180 links after removing issue-pairs with multiple links between them


100%|██████████| 819/819 [00:09<00:00, 82.02it/s]
  0%|          | 0/5253 [00:00<?, ?it/s]

Left with 100096 links after removing issue-pairs with multiple link types between them
Left with 99819 links after removing issue-pairs with multiple entries


100%|██████████| 5253/5253 [00:48<00:00, 108.61it/s]


Cleaned 274543 issues and 99819 links
Reference          63347
Duplicate          21685
Cloners             2866
Subtask             2473
Part                2449
Detail              1870
Cause               1784
Blocker              987
Derived              518
Supersession         476
Regression           378
Relate               262
Bonfire Testing      227
Split                171
Depends              165
Resolve               64
Follows               49
Related               38
Issue split           10
Name: linktype, dtype: int64
Created link and non_links of size 105072
----------------------------

JIRAECOSYSTEM


  3%|▎         | 432/12439 [00:00<00:05, 2158.38it/s]

Loaded 41866 issues and 12439 links
After filtering out issues with empty titles, 41865 issues remain


100%|██████████| 12439/12439 [00:06<00:00, 2058.97it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
100%|██████████| 66/66 [00:00<00:00, 563.22it/s]
  8%|▊         | 44/569 [00:00<00:01, 431.90it/s]

Left with 11598 links after removing half-private links
Left with 11514 links after removing issue-pairs with multiple links between them
Left with 11414 links after removing issue-pairs with multiple link types between them
Left with 11398 links after removing issue-pairs with multiple entries


100%|██████████| 569/569 [00:01<00:00, 405.23it/s]


Cleaned 41865 issues and 11398 links
Epic-Relation                   2743
Relate                          2468
Subtask                         2284
Duplicate                       1741
Blocker                          676
Cause                            440
Part                             204
Cloners                          201
Reference                        145
Depends                          127
Split                            120
Follows                           99
Bonfire testing                   56
Bonfire Testing                   46
Epic                              21
Issue split                       16
Preceded By                        7
Blocks                             2
Polaris datapoint issue link       1
Polaris issue link                 1
Name: linktype, dtype: int64
Created link and non_links of size 11967
----------------------------

MARIADB


  3%|▎         | 410/14950 [00:00<00:07, 2053.66it/s]

Loaded 31229 issues and 14950 links
After filtering out issues with empty titles, 31229 issues remain


100%|██████████| 14950/14950 [00:07<00:00, 1981.24it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
100%|██████████| 98/98 [00:00<00:00, 528.33it/s]

Left with 14929 links after removing half-private links
Left with 14773 links after removing issue-pairs with multiple links between them
Left with 14659 links after removing issue-pairs with multiple link types between them



  5%|▌         | 94/1827 [00:00<00:03, 459.85it/s]

Left with 14618 links after removing issue-pairs with multiple entries


100%|██████████| 1827/1827 [00:04<00:00, 443.97it/s]


Cleaned 31229 issues and 14618 links
Relates             7464
Blocks              1899
Duplicate           1374
PartOf              1154
Epic-Relation        942
Subtask              891
Problem/Incident     872
Issue split           22
Name: linktype, dtype: int64
Created link and non_links of size 16445
----------------------------

MINDVILLE


100%|██████████| 46/46 [00:00<00:00, 1960.51it/s]
0it [00:00, ?it/s]
100%|██████████| 11/11 [00:00<00:00, 596.14it/s]

Loaded 2134 issues and 46 links
After filtering out issues with empty titles, 2134 issues remain
Left with 46 links after removing half-private links
Left with 44 links after removing issue-pairs with multiple links between them
Left with 44 links after removing issue-pairs with multiple link types between them
Left with 44 links after removing issue-pairs with multiple entries
Cleaned 2134 issues and 44 links
Relates      19
Duplicate    17
Cloners       7
Blocks        1
Name: linktype, dtype: int64
Created link and non_links of size 55





----------------------------

MOJANG
Loaded 420819 issues and 215821 links


  0%|          | 190/215821 [00:00<03:47, 946.04it/s]

After filtering out issues with empty titles, 420806 issues remain


100%|██████████| 215821/215821 [04:11<00:00, 857.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
  0%|          | 0/73 [00:00<?, ?it/s]

Left with 215802 links after removing half-private links
Left with 215658 links after removing issue-pairs with multiple links between them


100%|██████████| 73/73 [00:02<00:00, 34.37it/s]


Left with 215542 links after removing issue-pairs with multiple link types between them
Left with 215527 links after removing issue-pairs with multiple entries


100%|██████████| 43105/43105 [07:52<00:00, 91.32it/s] 


Cleaned 420806 issues and 215527 links
Duplicate          193989
Relates             20377
Cloners               650
Bonfire Testing       272
Blocks                239
Name: linktype, dtype: int64
Created link and non_links of size 258632
----------------------------

MONGODB


  0%|          | 233/92362 [00:00<01:17, 1191.27it/s]

Loaded 137172 issues and 92362 links
After filtering out issues with empty titles, 137171 issues remain


100%|██████████| 92362/92362 [01:12<00:00, 1276.86it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
  3%|▎         | 11/389 [00:00<00:03, 105.47it/s]

Left with 65240 links after removing half-private links
Left with 64537 links after removing issue-pairs with multiple links between them


100%|██████████| 389/389 [00:03<00:00, 107.69it/s]
  1%|          | 53/4254 [00:00<00:16, 260.38it/s]

Left with 63883 links after removing issue-pairs with multiple link types between them
Left with 63821 links after removing issue-pairs with multiple entries


100%|██████████| 4254/4254 [00:17<00:00, 243.71it/s]


Cleaned 137171 issues and 63821 links
Related                 25471
Depends                 13933
Epic-Relation           10149
Duplicate                8587
Documented               1825
Problem/Incident         1082
Subtask                   888
Issue split               752
Gantt Dependency          657
Cloners                   202
Backports                 170
Tested                     62
Gantt End to End           41
Gantt Start to Start        1
Initiative                  1
Name: linktype, dtype: int64
Created link and non_links of size 68075
----------------------------

QT


  1%|          | 292/41426 [00:00<00:27, 1485.98it/s]

Loaded 148579 issues and 41426 links
After filtering out issues with empty titles, 148579 issues remain


100%|██████████| 41426/41426 [00:24<00:00, 1669.79it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
 11%|█         | 19/171 [00:00<00:00, 180.49it/s]

Left with 40646 links after removing half-private links
Left with 40424 links after removing issue-pairs with multiple links between them


100%|██████████| 171/171 [00:00<00:00, 181.61it/s]
  1%|▏         | 47/3342 [00:00<00:14, 232.24it/s]

Left with 40128 links after removing issue-pairs with multiple link types between them
Left with 40105 links after removing issue-pairs with multiple entries


100%|██████████| 3342/3342 [00:15<00:00, 218.77it/s]


Cleaned 148579 issues and 40105 links
Subtask           9804
Relates           8990
Dependency        6260
Epic-Relation     5428
Duplicate         4243
Work Breakdown    2667
Replacement       2582
Test                50
Cloners             34
Issue split         31
Blocks              14
Covered              2
Name: linktype, dtype: int64
Created link and non_links of size 43447
----------------------------

REDHAT
Loaded 353000 issues and 127369 links
After filtering out issues with empty titles, 352999 issues remain


100%|██████████| 127369/127369 [01:53<00:00, 1125.98it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
  0%|          | 0/1205 [00:00<?, ?it/s]

Left with 123000 links after removing half-private links
Left with 121612 links after removing issue-pairs with multiple links between them


100%|██████████| 1205/1205 [00:18<00:00, 63.57it/s]


Left with 120136 links after removing issue-pairs with multiple link types between them
Left with 119669 links after removing issue-pairs with multiple entries


100%|██████████| 5698/5698 [01:42<00:00, 55.75it/s]


Cleaned 352999 issues and 119669 links
Related                          31006
Subtask                          24928
Blocks                           18186
Cloners                          16969
Superset                         10661
Duplicate                         5913
Sequence                          5129
Causality                         3168
Cloners (old)                     1504
Documentation                      670
Parent-Relation                    621
multi-level hierarchy [GANTT]      251
finish-start [GANTT]               184
Account                            166
Issue split                        162
Gantt: finish-start                 46
finish-finish [GANTT]               40
Gantt: start-finish                 40
Gantt: finish-finish                20
start-finish [GANTT]                 4
Gantt: start-start                   1
Name: linktype, dtype: int64
Created link and non_links of size 125367
----------------------------

SAKAI


  2%|▏         | 392/20292 [00:00<00:10, 1951.32it/s]

Loaded 50550 issues and 20292 links
After filtering out issues with empty titles, 50550 issues remain


100%|██████████| 20292/20292 [00:10<00:00, 1856.60it/s]
 27%|██▋       | 38/143 [00:00<00:00, 376.59it/s]

Left with 20292 links after removing half-private links
Left with 20040 links after removing issue-pairs with multiple links between them


100%|██████████| 143/143 [00:00<00:00, 390.85it/s]
  2%|▏         | 41/2475 [00:00<00:05, 406.77it/s]

Left with 19852 links after removing issue-pairs with multiple link types between them
Left with 19803 links after removing issue-pairs with multiple entries


100%|██████████| 2475/2475 [00:06<00:00, 382.39it/s]


Cleaned 50550 issues and 19803 links
1 - Relate             9711
Subtask                3373
5 - Depend             2578
3 - Duplicate          1851
4 - Incorporate        1334
2 - Cloned              949
6 - Blocks                6
7 - Git Code Review       1
Name: linktype, dtype: int64
Created link and non_links of size 22278
----------------------------

SECONDLIFE


 66%|██████▌   | 446/674 [00:00<00:00, 2228.52it/s]

Loaded 1867 issues and 674 links
After filtering out issues with empty titles, 1867 issues remain


100%|██████████| 674/674 [00:00<00:00, 2216.76it/s]
100%|██████████| 17/17 [00:00<00:00, 1566.41it/s]
100%|██████████| 105/105 [00:00<00:00, 594.63it/s]

Left with 674 links after removing half-private links
Left with 662 links after removing issue-pairs with multiple links between them
Left with 634 links after removing issue-pairs with multiple link types between them
Left with 631 links after removing issue-pairs with multiple entries





Cleaned 1867 issues and 631 links
Subtask         314
Relates         186
Cloners          48
Parent/Child     41
Depends          28
Collection       14
Name: linktype, dtype: int64
Created link and non_links of size 736
----------------------------

SONATYPE


  9%|▉         | 442/4975 [00:00<00:02, 2197.08it/s]

Loaded 87284 issues and 4975 links
After filtering out issues with empty titles, 87282 issues remain


100%|██████████| 4975/4975 [00:02<00:00, 2156.53it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
100%|██████████| 17/17 [00:00<00:00, 823.20it/s]
 15%|█▌        | 61/405 [00:00<00:01, 302.75it/s]

Left with 4534 links after removing half-private links
Left with 4498 links after removing issue-pairs with multiple links between them
Left with 4466 links after removing issue-pairs with multiple link types between them
Left with 4465 links after removing issue-pairs with multiple entries


100%|██████████| 405/405 [00:01<00:00, 296.83it/s]


Cleaned 87282 issues and 4465 links
Relates            1785
Subtask            1343
Bonfire Testing     361
Duplicate           342
Caused              235
dependent           162
Supercedes          108
Fixes               103
Implements           11
Epic-Relation         9
Issue split           6
Name: linktype, dtype: int64
Created link and non_links of size 4870
----------------------------

SPRING


  2%|▏         | 302/14716 [00:00<00:09, 1500.21it/s]

Loaded 69156 issues and 14716 links
After filtering out issues with empty titles, 69156 issues remain


100%|██████████| 14716/14716 [00:07<00:00, 1930.40it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
100%|██████████| 52/52 [00:00<00:00, 484.64it/s]
  2%|▏         | 38/1606 [00:00<00:04, 372.53it/s]

Left with 14616 links after removing half-private links
Left with 14550 links after removing issue-pairs with multiple links between them
Left with 14478 links after removing issue-pairs with multiple link types between them
Left with 14462 links after removing issue-pairs with multiple entries


100%|██████████| 1606/1606 [00:04<00:00, 354.80it/s]


Cleaned 69156 issues and 14462 links
Relate           5909
Subtask          1941
Duplicate        1745
Epic-Relation    1635
Depend           1259
Related           993
Depends           488
Supersede         478
Cloners            14
Name: linktype, dtype: int64
Created link and non_links of size 16068
----------------------------

