In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
tqdm.pandas()

In [3]:
SOURCES = ['Apache', 'Hyperledger', 'IntelDAOS', 'JFrog', 'Jira', 'JiraEcosystem', 'MariaDB', 'Mindville', 'MongoDB', 'Qt', 'RedHat', 'Sakai', 'SecondLife', 'Sonatype', 'Spring']

In [4]:
def add_linked_issues_to_df(df):
    df['issues']=''
    for i in tqdm(range(len(df))):
        df["issues"].iloc[i] = str(sorted(set([df.iloc[i]['issue_id_1'], df.iloc[i]['issue_id_2']])))

In [6]:
def load_data(source):
    #Loading Issues
    filename = '../data/crawl/issues_'+source.lower()+'.csv'
    issue_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';', index_col=['issue_id'])

    #Loading Links
    filename = '../data/crawl/links_'+source.lower()+'.csv'
    link_df = pd.read_csv(filename, encoding="UTF-8", low_memory=False, sep=';').drop_duplicates()
    
    return issue_df, link_df

In [8]:
def clean_issues(issue_df):
    #Remove issues with empty titles
    issue_df = issue_df[~issue_df['title'].fillna(' ').str.isspace()]
    print(f'After filtering out issues with empty titles, {len(issue_df)} issues remain')
    
    return issue_df

In [9]:
def clean_links(link_df):
    add_linked_issues_to_df(link_df)
    
    # remove links with uncrawled and filtered issues
    link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
    print(f'Left with {len(link_df)} links after removing half-private links')
    
    # cleanup links
    # only allow one linktype per issue-pair
    link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple links between them')

    # in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
    doublelinks = (link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
    valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])

    for i in tqdm(valid_doubles):
        if len(set(link_df[link_df['issues']==i]['linktype']))>1:
            link_df = link_df[link_df.issues != i]
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple link types between them')

    #Multiple links complete remove
    link_df.drop_duplicates(subset=['issues'], inplace=True)
    print(f'Left with {len(link_df)} links after removing issue-pairs with multiple entries')

    link_df.reset_index(inplace=True, drop=True)
    
    return link_df

In [10]:
for s in SOURCES:
    print(s.upper())
    issue_df, link_df = load_data(s)
    print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')
    
    issue_df = clean_issues(issue_df)
    
    link_df = clean_links(link_df)
    
    print(f'Cleaned {len(issue_df)} issues and {len(link_df)} links')
    
    link_df.to_csv('../data/crawl/clean_links_'+s.lower()+'.csv', encoding='utf-8', index=True, sep=';')
    print("----------------------------")

APACHE
Loaded 970929 issues and 250890 links
After filtering out issues with empty titles, 970928 issues remain


100%|██████████| 250890/250890 [10:28<00:00, 398.96it/s]


Left with 250437 links after removing half-private links
Left with 246641 links after removing issue-pairs with multiple links between them


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
100%|██████████| 2145/2145 [01:42<00:00, 20.88it/s]


Left with 243295 links after removing issue-pairs with multiple link types between them
Left with 242823 links after removing issue-pairs with multiple entries
Cleaned 970928 issues and 242823 links
----------------------------
HYPERLEDGER
Loaded 27914 issues and 16765 links
After filtering out issues with empty titles, 27914 issues remain


100%|██████████| 16765/16765 [00:08<00:00, 1954.83it/s]


Left with 16652 links after removing half-private links
Left with 16484 links after removing issue-pairs with multiple links between them


100%|██████████| 140/140 [00:00<00:00, 278.53it/s]


Left with 16246 links after removing issue-pairs with multiple link types between them
Left with 16225 links after removing issue-pairs with multiple entries
Cleaned 27914 issues and 16225 links
----------------------------
INTELDAOS
Loaded 5557 issues and 3518 links
After filtering out issues with empty titles, 5557 issues remain


100%|██████████| 3518/3518 [00:01<00:00, 2290.54it/s]


Left with 3271 links after removing half-private links
Left with 3247 links after removing issue-pairs with multiple links between them


100%|██████████| 13/13 [00:00<00:00, 503.02it/s]


Left with 3223 links after removing issue-pairs with multiple link types between them
Left with 3222 links after removing issue-pairs with multiple entries
Cleaned 5557 issues and 3222 links
----------------------------
JFROG
Loaded 14769 issues and 3289 links
After filtering out issues with empty titles, 14769 issues remain


100%|██████████| 3289/3289 [00:01<00:00, 2350.71it/s]


Left with 3280 links after removing half-private links
Left with 3250 links after removing issue-pairs with multiple links between them


100%|██████████| 24/24 [00:00<00:00, 812.05it/s]


Left with 3210 links after removing issue-pairs with multiple link types between them
Left with 3206 links after removing issue-pairs with multiple entries
Cleaned 14769 issues and 3206 links
----------------------------
JIRA
Loaded 265343 issues and 108507 links
After filtering out issues with empty titles, 265341 issues remain


100%|██████████| 108507/108507 [01:48<00:00, 1002.01it/s]


Left with 100795 links after removing half-private links
Left with 99440 links after removing issue-pairs with multiple links between them


100%|██████████| 793/793 [00:14<00:00, 54.75it/s]


Left with 98390 links after removing issue-pairs with multiple link types between them
Left with 98122 links after removing issue-pairs with multiple entries
Cleaned 265341 issues and 98122 links
----------------------------
JIRAECOSYSTEM
Loaded 40602 issues and 11872 links
After filtering out issues with empty titles, 40601 issues remain


100%|██████████| 11872/11872 [00:05<00:00, 2241.04it/s]


Left with 11104 links after removing half-private links
Left with 11022 links after removing issue-pairs with multiple links between them


100%|██████████| 63/63 [00:00<00:00, 413.00it/s]


Left with 10926 links after removing issue-pairs with multiple link types between them
Left with 10911 links after removing issue-pairs with multiple entries
Cleaned 40601 issues and 10911 links
----------------------------
MARIADB
Loaded 31229 issues and 14950 links
After filtering out issues with empty titles, 31229 issues remain


100%|██████████| 14950/14950 [00:06<00:00, 2157.27it/s]


Left with 14929 links after removing half-private links
Left with 14773 links after removing issue-pairs with multiple links between them


100%|██████████| 98/98 [00:00<00:00, 376.97it/s]


Left with 14659 links after removing issue-pairs with multiple link types between them
Left with 14618 links after removing issue-pairs with multiple entries
Cleaned 31229 issues and 14618 links
----------------------------
MINDVILLE
Loaded 2134 issues and 46 links
After filtering out issues with empty titles, 2134 issues remain


100%|██████████| 46/46 [00:00<00:00, 2264.82it/s]


Left with 46 links after removing half-private links
Left with 44 links after removing issue-pairs with multiple links between them


0it [00:00, ?it/s]


Left with 44 links after removing issue-pairs with multiple link types between them
Left with 44 links after removing issue-pairs with multiple entries
Cleaned 2134 issues and 44 links
----------------------------
MONGODB
Loaded 90629 issues and 61877 links
After filtering out issues with empty titles, 90628 issues remain


100%|██████████| 61877/61877 [00:42<00:00, 1442.50it/s]


Left with 38482 links after removing half-private links
Left with 38017 links after removing issue-pairs with multiple links between them


100%|██████████| 254/254 [00:02<00:00, 111.09it/s]


Left with 37581 links after removing issue-pairs with multiple link types between them
Left with 37545 links after removing issue-pairs with multiple entries
Cleaned 90628 issues and 37545 links
----------------------------
QT
Loaded 140237 issues and 37033 links
After filtering out issues with empty titles, 140237 issues remain


100%|██████████| 37033/37033 [00:20<00:00, 1797.30it/s]


Left with 36331 links after removing half-private links
Left with 36133 links after removing issue-pairs with multiple links between them


100%|██████████| 150/150 [00:01<00:00, 119.74it/s]


Left with 35877 links after removing issue-pairs with multiple link types between them
Left with 35855 links after removing issue-pairs with multiple entries
Cleaned 140237 issues and 35855 links
----------------------------
REDHAT
Loaded 315797 issues and 113853 links
After filtering out issues with empty titles, 315796 issues remain


100%|██████████| 113853/113853 [02:02<00:00, 926.99it/s]


Left with 109199 links after removing half-private links
Left with 108018 links after removing issue-pairs with multiple links between them


100%|██████████| 1129/1129 [00:22<00:00, 49.28it/s]


Left with 106640 links after removing issue-pairs with multiple link types between them
Left with 106200 links after removing issue-pairs with multiple entries
Cleaned 315796 issues and 106200 links
----------------------------
SAKAI
Loaded 49204 issues and 19515 links
After filtering out issues with empty titles, 49204 issues remain


100%|██████████| 19515/19515 [00:16<00:00, 1169.11it/s]


Left with 19515 links after removing half-private links
Left with 19279 links after removing issue-pairs with multiple links between them


100%|██████████| 135/135 [00:00<00:00, 207.99it/s]


Left with 19105 links after removing issue-pairs with multiple link types between them
Left with 19057 links after removing issue-pairs with multiple entries
Cleaned 49204 issues and 19057 links
----------------------------
SECONDLIFE
Loaded 1865 issues and 673 links
After filtering out issues with empty titles, 1865 issues remain


100%|██████████| 673/673 [00:00<00:00, 1716.45it/s]


Left with 673 links after removing half-private links
Left with 661 links after removing issue-pairs with multiple links between them


100%|██████████| 17/17 [00:00<00:00, 837.93it/s]


Left with 633 links after removing issue-pairs with multiple link types between them
Left with 630 links after removing issue-pairs with multiple entries
Cleaned 1865 issues and 630 links
----------------------------
SONATYPE
Loaded 77837 issues and 4719 links
After filtering out issues with empty titles, 77835 issues remain


100%|██████████| 4719/4719 [00:02<00:00, 1835.19it/s]


Left with 4356 links after removing half-private links
Left with 4322 links after removing issue-pairs with multiple links between them


100%|██████████| 17/17 [00:00<00:00, 528.95it/s]


Left with 4290 links after removing issue-pairs with multiple link types between them
Left with 4289 links after removing issue-pairs with multiple entries
Cleaned 77835 issues and 4289 links
----------------------------
SPRING
Loaded 69100 issues and 14715 links
After filtering out issues with empty titles, 69100 issues remain


100%|██████████| 14715/14715 [00:07<00:00, 1859.37it/s]


Left with 14615 links after removing half-private links
Left with 14549 links after removing issue-pairs with multiple links between them


100%|██████████| 52/52 [00:00<00:00, 337.81it/s]


Left with 14477 links after removing issue-pairs with multiple link types between them
Left with 14461 links after removing issue-pairs with multiple entries
Cleaned 69100 issues and 14461 links
----------------------------


## Example of what exactly is cleaned

In [20]:
source = 'Apache'
issue_df, link_df = load_data(source)

print(f'Loaded {len(issue_df)} issues and {len(link_df)} links')

Loaded 970929 issues and 250890 links


In [21]:
add_linked_issues_to_df(link_df)

100%|██████████| 250890/250890 [05:49<00:00, 718.45it/s]


In [22]:
# remove links with uncrawled and filtered issues
cl_link_df = link_df[link_df[['issue_id_1', 'issue_id_2']].isin(issue_df.index.values).all(axis=1)]
print(f'Left with {len(cl_link_df)} links after removing half-private links')

Left with 250437 links after removing half-private links


In [23]:
cl_link_df.name.value_counts()

IMPALA-4835_IMPALA-6594      3
BEAM-1251_BEAM-4511          3
IMPALA-4835_IMPALA-6592      3
HAWQ-832_HAWQ-808            3
IMPALA-4835_IMPALA-6587      3
                            ..
KUDU-430_KUDU-1127           1
MESOS-3647_MESOS-3665        1
ARTEMIS-2687_ARTEMIS-2704    1
DATALAB-213_DATALAB-461      1
LUCENE-1465_LUCENE-1542      1
Name: name, Length: 248533, dtype: int64

In [24]:
cl_link_df[cl_link_df['name']=='IMPALA-4835_IMPALA-6594']

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
209312,IMPALA-4835_IMPALA-6594,Regression,IMPALA-4835,IMPALA-6594,"['IMPALA-4835', 'IMPALA-6594']"
209313,IMPALA-4835_IMPALA-6594,dependent,IMPALA-4835,IMPALA-6594,"['IMPALA-4835', 'IMPALA-6594']"
210015,IMPALA-4835_IMPALA-6594,Subtask,IMPALA-4835,IMPALA-6594,"['IMPALA-4835', 'IMPALA-6594']"


In [25]:
cl_link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple links between them')

Left with 246641 links after removing issue-pairs with multiple links between them


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cl_link_df.drop_duplicates(subset=['name'], keep=False, inplace=True)


In [34]:
# in case the name is the otherway around, like issue-1_issue-2 and issue-2_issue-1
doublelinks = (cl_link_df.issues.value_counts()>1).rename_axis('doubles').reset_index(name='valid')
valid_doubles = set(doublelinks[doublelinks['valid']==True]['doubles'])

In [35]:
valid_doubles

{"['SVN-3401', 'SVN-3818']",
 "['LANG-1124', 'LANG-1237']",
 "['YARN-3587', 'YARN-3598']",
 "['HADOOP-12693', 'HADOOP-16509']",
 "['CASSANDRA-2848', 'CASSANDRA-5683']",
 "['HIVE-16489', 'HIVE-17237']",
 "['SVN-1327', 'SVN-4240']",
 "['SVN-3504', 'SVN-3709']",
 "['HIVE-2935', 'HIVE-3122']",
 "['HUDI-305', 'HUDI-907']",
 "['SVN-1284', 'SVN-3357']",
 "['DRILL-2459', 'DRILL-3216']",
 "['REEF-1117', 'REEF-989']",
 "['FLINK-8592', 'FLINK-9653']",
 "['SVN-1374', 'SVN-1718']",
 "['WHIRR-598', 'WHIRR-673']",
 "['INFRA-13078', 'INFRA-13227']",
 "['HDFS-7826', 'HDFS-7937']",
 "['SPARK-10346', 'SPARK-12235']",
 "['THRIFT-1366', 'THRIFT-1388']",
 "['SVN-2754', 'SVN-2821']",
 "['PHOENIX-3181', 'PHOENIX-3252']",
 "['THRIFT-3301', 'THRIFT-3954']",
 "['PIG-4266', 'PIG-4304']",
 "['THRIFT-487', 'THRIFT-488']",
 "['OFBIZ-7073', 'OFBIZ-7538']",
 "['SVN-1974', 'SVN-3629']",
 "['MSITE-443', 'MSITE-444']",
 "['NIFI-2014', 'NIFI-2024']",
 "['SLING-4575', 'SLING-5698']",
 "['SPARK-27216', 'SPARK-27530']",
 "['

In [36]:
cl_link_df[(cl_link_df['issue_id_1'] == 'SVN-3401') | (cl_link_df['issue_id_2'] == 'SVN-3401')]

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
183828,LANG-1237_LANG-1124,Reference,LANG-1237,LANG-1124,"['LANG-1124', 'LANG-1237']"
183829,LANG-1124_LANG-1237,Reference,LANG-1124,LANG-1237,"['LANG-1124', 'LANG-1237']"


In [37]:
for i in valid_doubles:
    if len(set(cl_link_df[cl_link_df['issues']==i]['linktype']))>1:
        cl_link_df = cl_link_df[cl_link_df.issues != i]
    else:
        print(i)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with multiple link types between them')

  0%|          | 6/2145 [00:00<01:42, 20.87it/s]

['LANG-1124', 'LANG-1237']


  1%|          | 19/2145 [00:00<01:18, 27.10it/s]

['WHIRR-598', 'WHIRR-673']
['HDFS-7826', 'HDFS-7937']
['SPARK-10346', 'SPARK-12235']


  1%|          | 25/2145 [00:01<01:27, 24.14it/s]

['THRIFT-3301', 'THRIFT-3954']


  2%|▏         | 40/2145 [00:01<01:14, 28.26it/s]

['KAFKA-5141', 'KAFKA-8126']
['TUSCANY-2781', 'TUSCANY-2785']
['MNG-4792', 'WAGON-304']
['TIKA-1835', 'TIKA-1937']


  2%|▏         | 47/2145 [00:01<01:18, 26.79it/s]

['LOG4J2-407', 'LOG4J2-457']
['NIFI-1095', 'NIFI-1097']
['PDFBOX-2779', 'PDFBOX-3338']


  3%|▎         | 65/2145 [00:02<01:06, 31.08it/s]

['HIVE-774', 'HIVE-778']
['BIGTOP-1089', 'BIGTOP-1212']
['HADOOP-12855', 'HADOOP-12946']
['CB-6700', 'CB-6859']
['MESOS-2299', 'MESOS-2301']


  4%|▎         | 77/2145 [00:02<01:12, 28.49it/s]

['HIVE-21301', 'HIVE-21501']
['ACCUMULO-3297', 'ACCUMULO-97']
['SPARK-2947', 'SPARK-3224']
['USERGRID-536', 'USERGRID-541']


  4%|▍         | 92/2145 [00:03<01:10, 29.17it/s]

['KUDU-2966', 'KUDU-3240']
['MECLIPSE-156', 'MECLIPSE-636']
['DRILL-6439', 'DRILL-6559']


  5%|▍         | 99/2145 [00:03<01:17, 26.49it/s]

['LUCENE-1183', 'LUCENE-691']
['JUDDI-725', 'JUDDI-726']


  5%|▌         | 111/2145 [00:04<01:21, 24.98it/s]

['EDGENT-32', 'EDGENT-35']
['HBASE-6073', 'HBASE-8774']
['SOLR-162', 'SOLR-182']
['SENTRY-1706', 'SENTRY-1710']


  6%|▌         | 122/2145 [00:04<01:13, 27.56it/s]

['NUTCH-2752', 'NUTCH-2780']
['OFBIZ-4849', 'OFBIZ-5070']
['MESOS-2831', 'MESOS-2857']
['DERBY-1491', 'DERBY-396']


  6%|▌         | 127/2145 [00:04<01:04, 31.51it/s]

['HIVE-16844', 'HIVE-16908']
['HDFS-14042', 'HDFS-15556']


  6%|▋         | 135/2145 [00:05<01:13, 27.30it/s]

['KYLIN-850', 'KYLIN-873']
['GEODE-3958', 'GEODE-3959']


  6%|▋         | 139/2145 [00:05<01:12, 27.63it/s]

['HIVE-3403', 'HIVE-3784']


  8%|▊         | 161/2145 [00:06<01:20, 24.79it/s]

['KYLIN-2515', 'KYLIN-742']


  8%|▊         | 168/2145 [00:06<01:18, 25.24it/s]

['CAMEL-8789', 'CAMEL-9890']


  8%|▊         | 181/2145 [00:06<01:14, 26.40it/s]

['UIMA-5904', 'UIMA-5905']
['IO-380', 'IO-381']
['SPARK-2429', 'SPARK-2966']


  9%|▉         | 195/2145 [00:07<01:13, 26.48it/s]

['MAPREDUCE-6523', 'YARN-4283']


 10%|▉         | 213/2145 [00:08<01:19, 24.33it/s]

['CB-4036', 'CB-4490']


 11%|█         | 233/2145 [00:08<01:12, 26.32it/s]

['HDFS-14997', 'HDFS-15651']
['MRELEASE-812', 'MRELEASE-875']


 11%|█         | 240/2145 [00:09<01:07, 28.37it/s]

['PIG-4174', 'PIG-4206']
['VFS-119', 'VFS-614']


 12%|█▏        | 258/2145 [00:10<01:24, 22.45it/s]

['HDFS-5215', 'HDFS-8045']


 13%|█▎        | 271/2145 [00:10<01:16, 24.41it/s]

['IGNITE-801', 'IGNITE-803']


 13%|█▎        | 281/2145 [00:10<01:14, 25.04it/s]

['KYLIN-211', 'KYLIN-459']


 14%|█▎        | 291/2145 [00:11<01:11, 25.82it/s]

['SPARK-23929', 'SPARK-24324']
['SPARK-19941', 'SPARK-20628']
['IGNITE-6347', 'IGNITE-6356']


 14%|█▍        | 298/2145 [00:11<01:08, 27.10it/s]

['HIVE-3652', 'HIVE-3784']
['TEZ-1838', 'TEZ-2229']


 14%|█▍        | 306/2145 [00:11<01:06, 27.47it/s]

['IMPALA-4863', 'IMPALA-5311']


 15%|█▍        | 319/2145 [00:12<01:07, 27.14it/s]

['IOTDB-984', 'IOTDB-990']
['LENS-118', 'LENS-25']


 15%|█▌        | 326/2145 [00:12<01:03, 28.61it/s]

['SOLR-10321', 'SOLR-10993']
['OAK-2713', 'OAK-2967']


 16%|█▌        | 336/2145 [00:12<01:02, 29.05it/s]

['IMPALA-4923', 'IMPALA-5302']
['CTAKES-246', 'CTAKES-251']
['KAFKA-8063', 'KAFKA-8126']


 16%|█▋        | 351/2145 [00:13<00:56, 31.53it/s]

['MINIFICPP-137', 'MINIFICPP-159']
['CAMEL-9444', 'CAMEL-9573']
['PIG-1914', 'PIG-2641']
['TS-3667', 'TS-3788']
['HADOOP-13518', 'HADOOP-13830']


 17%|█▋        | 359/2145 [00:13<01:02, 28.50it/s]

['ISIS-788', 'ISIS-794']
['HIVE-3995', 'HIVE-3999']


 17%|█▋        | 370/2145 [00:14<01:08, 25.82it/s]

['CAMEL-1276', 'CAMEL-1537']
['JSPWIKI-159', 'JSPWIKI-850']


 18%|█▊        | 386/2145 [00:14<01:11, 24.67it/s]

['OAK-164', 'OAK-165']


 18%|█▊        | 390/2145 [00:15<01:07, 25.86it/s]

['LOG4J2-407', 'LOG4J2-442']
['JS2-1336', 'JS2-1357']


 19%|█▊        | 398/2145 [00:15<01:01, 28.62it/s]

['SOLR-9602', 'SOLR-9603']
['BIGTOP-1470', 'MAHOUT-1329']


 19%|█▉        | 405/2145 [00:15<01:03, 27.57it/s]

['MESOS-9309', 'MESOS-9310']
['ARTEMIS-1055', 'ARTEMIS-925']


 19%|█▉        | 413/2145 [00:15<01:01, 28.11it/s]

['MRM-1005', 'MRM-902']


 20%|█▉        | 426/2145 [00:16<01:06, 25.81it/s]

['HADOOP-7347', 'HDFS-2181']


 20%|██        | 436/2145 [00:16<01:05, 25.97it/s]

['DRILL-1125', 'DRILL-1447']


 21%|██        | 448/2145 [00:17<01:10, 24.24it/s]

['FLINK-6078', 'FLINK-6192']
['OAK-2801', 'OAK-2967']


 21%|██▏       | 458/2145 [00:17<01:04, 26.16it/s]

['AMQ-4234', 'AMQ-4235']
['SENTRY-2230', 'SENTRY-2273']
['HADOOP-15555', 'HADOOP-15556']


 22%|██▏       | 469/2145 [00:18<01:02, 26.86it/s]

['HIVE-1750', 'HIVE-1769']


 23%|██▎       | 488/2145 [00:18<01:07, 24.63it/s]

['TUSCANY-2782', 'TUSCANY-2785']


 23%|██▎       | 496/2145 [00:19<01:01, 26.61it/s]

['LUCENE-3920', 'LUCENE-4955']
['MESOS-2829', 'MESOS-2857']


 23%|██▎       | 503/2145 [00:19<01:01, 26.51it/s]

['HDFS-4163', 'YARN-207']
['HAWQ-288', 'HAWQ-289']
['HIVE-5775', 'HIVE-9132']
['HADOOP-16168', 'HDFS-14118']


 24%|██▍       | 514/2145 [00:19<01:05, 25.01it/s]

['CASSANDRA-3114', 'CASSANDRA-3186']


 24%|██▍       | 524/2145 [00:20<01:00, 26.92it/s]

['SPARK-26199', 'SPARK-31517']
['LOG4J2-407', 'LOG4J2-438']
['OOZIE-2317', 'OOZIE-2533']


 25%|██▌       | 544/2145 [00:21<01:06, 24.20it/s]

['KARAF-2765', 'KARAF-2816']
['HBASE-2428', 'HBASE-2479']


 26%|██▌       | 555/2145 [00:21<00:57, 27.84it/s]

['KYLIN-168', 'KYLIN-371']
['OAK-161', 'OAK-164']
['GROOVY-1628', 'GROOVY-3088']


 27%|██▋       | 575/2145 [00:22<01:00, 25.85it/s]

['AVRO-1188', 'AVRO-983']
['ISIS-800', 'ISIS-807']
['LOG4J2-457', 'LOG4J2-489']


 28%|██▊       | 595/2145 [00:23<01:03, 24.40it/s]

['MAPREDUCE-5830', 'MAPREDUCE-5857']


 28%|██▊       | 601/2145 [00:23<01:04, 23.87it/s]

['DERBY-5232', 'DERBY-5995']
['SOLR-8674', 'SOLR-9225']


 29%|██▉       | 617/2145 [00:23<00:52, 29.26it/s]

['HIVE-5020', 'HIVE-7282']
['HBASE-1353', 'HBASE-1710']
['HADOOP-7254', 'HDFS-1874']
['DRILL-5694', 'DRILL-5740']


 29%|██▉       | 631/2145 [00:24<00:54, 27.57it/s]

['DERBY-1490', 'DERBY-396']
['XBEAN-276', 'XBEAN-286']
['HADOOP-10557', 'HADOOP-9705']
['HADOOP-11505', 'HADOOP-11665']
['GROOVY-5808', 'GROOVY-6309']
['KYLIN-2094', 'KYLIN-2104']
['OAK-161', 'OAK-165']


 30%|███       | 644/2145 [00:24<00:51, 29.32it/s]

['TAJO-747', 'TAJO-748']
['KAFKA-5141', 'KAFKA-8063']


 31%|███       | 655/2145 [00:25<00:49, 30.13it/s]

['LUCENE-8004', 'LUCENE-8264']
['DERBY-5760', 'DERBY-6289']
['COUCHDB-103', 'COUCHDB-925']


 31%|███       | 663/2145 [00:25<00:51, 28.82it/s]

['HARMONY-1635', 'HARMONY-1810']
['DERBY-2594', 'DERBY-827']


 31%|███       | 670/2145 [00:25<00:53, 27.72it/s]

['OAK-3620', 'OAK-3649']


 32%|███▏      | 687/2145 [00:26<00:50, 28.91it/s]

['ARROW-7702', 'ARROW-8447']
['CAMEL-12114', 'CAMEL-12735']
['TAP5-1778', 'TAP5-2219']


 32%|███▏      | 691/2145 [00:26<00:46, 30.96it/s]

['CB-148', 'CB-316']
['AXIS2-3971', 'AXIS2-5204']


 33%|███▎      | 699/2145 [00:26<00:51, 28.24it/s]

['LUCENE-9435', 'LUCENE-9475']
['SPARK-11611', 'SPARK-11944']


 33%|███▎      | 707/2145 [00:27<00:50, 28.23it/s]

['IMPALA-1575', 'IMPALA-6153']
['AIRFLOW-3132', 'AIRFLOW-516']


 34%|███▍      | 725/2145 [00:27<00:55, 25.45it/s]

['SPARK-10602', 'SPARK-10641']
['GROOVY-1018', 'GROOVY-732']
['LOG4J2-438', 'LOG4J2-489']
['AVRO-1134', 'AVRO-1206']


 34%|███▍      | 739/2145 [00:28<00:53, 26.39it/s]

['NUTCH-1005', 'NUTCH-809']
['HBASE-7009', 'HBASE-7109']
['SPARK-10925', 'SPARK-14948']
['BUILDR-287', 'BUILDR-292']
['SPARK-4300', 'SPARK-9844']


 35%|███▌      | 761/2145 [00:29<00:54, 25.48it/s]

['IGNITE-5558', 'IGNITE-6235']
['HDFS-1125', 'HDFS-1290']
['NIFI-1011', 'NIFI-972']
['TS-4924', 'TS-4960']


 36%|███▌      | 769/2145 [00:29<00:48, 28.57it/s]

['TUSCANY-2781', 'TUSCANY-2784']
['TIKA-2804', 'TIKA-2854']
['DERBY-1515', 'DERBY-396']


 36%|███▌      | 777/2145 [00:29<00:42, 32.32it/s]

['SENTRY-2010', 'SENTRY-2011']
['CB-8996', 'CB-8999']
['HIVE-1006', 'HIVE-1008']


 37%|███▋      | 785/2145 [00:30<00:42, 32.23it/s]

['SOLR-13249', 'SOLR-13255']
['MESOS-1865', 'MESOS-3841']
['BEAM-191', 'BEAM-230']


 38%|███▊      | 810/2145 [00:31<00:47, 27.99it/s]

['XALANC-772', 'XERCESC-2086']
['HADOOP-6304', 'HADOOP-7110']
['YARN-6501', 'YARN-6502']


 38%|███▊      | 816/2145 [00:31<00:47, 27.70it/s]

['AXIS2-2275', 'AXIS2-2276']
['HIVE-4766', 'THRIFT-2046']


 38%|███▊      | 822/2145 [00:31<00:53, 24.85it/s]

['SOLR-10588', 'SOLR-10605']
['KAFKA-6824', 'KAFKA-7988']


 39%|███▊      | 828/2145 [00:31<00:52, 25.22it/s]

['HBASE-11747', 'HBASE-13825']


 39%|███▉      | 834/2145 [00:31<00:51, 25.31it/s]

['TAJO-67', 'TAJO-735']


 39%|███▉      | 844/2145 [00:32<00:50, 25.58it/s]

['CAMEL-1873', 'CAMEL-1874']


 40%|███▉      | 857/2145 [00:32<00:51, 24.93it/s]

['NUTCH-1076', 'NUTCH-1483']


 40%|████      | 863/2145 [00:33<00:50, 25.19it/s]

['KAFKA-1194', 'KAFKA-9458']
['SPARK-2242', 'SPARK-2244']


 41%|████      | 876/2145 [00:33<00:48, 26.38it/s]

['CAMEL-13314', 'CAMEL-8362']
['FLINK-7198', 'FLINK-8358']
['XALANJ-2219', 'XALANJ-2302']


 41%|████▏     | 890/2145 [00:34<00:47, 26.36it/s]

['OAK-3842', 'OAK-3919']


 42%|████▏     | 900/2145 [00:34<00:50, 24.84it/s]

['ISIS-383', 'ISIS-666']


 43%|████▎     | 912/2145 [00:35<00:53, 23.15it/s]

['HBASE-8049', 'HBASE-8144']


 43%|████▎     | 922/2145 [00:35<00:48, 24.96it/s]

['SPARK-18838', 'SPARK-18975']


 43%|████▎     | 931/2145 [00:35<00:47, 25.32it/s]

['LOG4J2-438', 'LOG4J2-442']
['NIFI-1377', 'NIFI-5169']


 44%|████▍     | 942/2145 [00:36<00:44, 26.93it/s]

['MNG-1753', 'MNG-4516']
['HBASE-8140', 'HCATALOG-623']


 44%|████▍     | 949/2145 [00:36<00:45, 26.32it/s]

['HADOOP-10768', 'HADOOP-14558']
['HBASE-23169', 'HBASE-23205']


 45%|████▍     | 956/2145 [00:36<00:43, 27.39it/s]

['TEZ-1160', 'TEZ-1534']
['KARAF-2434', 'KARAF-2435']
['DIRSERVER-309', 'DIRSERVER-711']


 45%|████▍     | 964/2145 [00:37<00:42, 27.55it/s]

['MNG-1753', 'MNG-3106']
['AXIS2-3964', 'AXIS2-4090']


 46%|████▌     | 977/2145 [00:37<00:44, 26.21it/s]

['SOLR-7525', 'SOLR-8125']
['HTTPCLIENT-1108', 'HTTPCLIENT-1109']


 46%|████▌     | 989/2145 [00:38<00:41, 27.88it/s]

['SOLR-7571', 'SOLR-7573']
['HBASE-12266', 'HBASE-13090']
['WW-2244', 'WW-2245']


 46%|████▋     | 997/2145 [00:38<00:39, 29.11it/s]

['HDFS-15359', 'HDFS-8999']
['HADOOP-8645', 'HADOOP-8900']


 47%|████▋     | 1014/2145 [00:38<00:36, 30.76it/s]

['FLUME-247', 'FLUME-6']
['HDFS-11751', 'HDFS-12834']
['MNG-3808', 'MSITE-402']
['IGNITE-6699', 'IGNITE-8447']


 48%|████▊     | 1040/2145 [00:39<00:43, 25.48it/s]

['FLINK-13344', 'FLINK-13888']


 49%|████▉     | 1050/2145 [00:40<00:43, 25.11it/s]

['OFBIZ-293', 'OFBIZ-4949']
['NIFI-1255', 'NIFI-1463']


 49%|████▉     | 1061/2145 [00:40<00:38, 28.09it/s]

['TUSCANY-2783', 'TUSCANY-2785']
['HADOOP-2835', 'HADOOP-8494']


 50%|█████     | 1075/2145 [00:41<00:34, 30.96it/s]

['BEAM-1345', 'BEAM-911']
['ISIS-1219', 'ISIS-2074']
['LUCENE-7407', 'LUCENE-7835']
['VFS-301', 'VFS-614']


 51%|█████     | 1090/2145 [00:41<00:36, 28.95it/s]

['HAWQ-288', 'HAWQ-290']
['OAK-2800', 'OAK-2967']
['TUSCANY-2782', 'TUSCANY-2783']
['SPARK-12423', 'SPARK-12979']


 51%|█████     | 1098/2145 [00:42<00:33, 31.16it/s]

['AIRFLOW-1131', 'AIRFLOW-966']
['AVRO-1965', 'HIVE-15316']
['SOLR-7572', 'SOLR-7573']


 52%|█████▏    | 1106/2145 [00:42<00:34, 30.49it/s]

['HARMONY-2039', 'HARMONY-2482']
['ATTIC-124', 'INFRA-7811']


 52%|█████▏    | 1114/2145 [00:42<00:34, 29.47it/s]

['LOG4J2-407', 'LOG4J2-489']
['HADOOP-9008', 'HDFS-4163']
['DISPATCH-1003', 'DISPATCH-1004']


 52%|█████▏    | 1122/2145 [00:42<00:33, 30.13it/s]

['MESOS-2815', 'MESOS-2831']
['ODFTOOLKIT-414', 'ODFTOOLKIT-455']


 52%|█████▏    | 1126/2145 [00:42<00:31, 31.90it/s]

['THRIFT-2157', 'THRIFT-2988']
['DERBY-5147', 'DERBY-5305']


 53%|█████▎    | 1138/2145 [00:43<00:33, 29.88it/s]

['HBASE-13109', 'PHOENIX-1731']
['LANG-1252', 'LANG-992']
['LENS-35', 'LENS-85']
['NIFI-731', 'NIFI-744']


 53%|█████▎    | 1146/2145 [00:43<00:38, 25.82it/s]

['DRILL-5504', 'DRILL-5526']


 54%|█████▎    | 1150/2145 [00:43<00:35, 28.25it/s]

['SOLR-4757', 'SOLR-4758']
['MESOS-2198', 'MESOS-4737']
['TAJO-580', 'TAJO-581']


 54%|█████▍    | 1158/2145 [00:44<00:35, 28.07it/s]

['MESOS-7079', 'MESOS-7175']


 54%|█████▍    | 1164/2145 [00:44<00:39, 24.85it/s]

['AMBARI-15538', 'AMBARI-17285']
['BATIK-1043', 'XGC-83']


 55%|█████▍    | 1175/2145 [00:44<00:33, 29.29it/s]

['KARAF-3324', 'KARAF-3334']
['SOLR-5007', 'SOLR-7289']
['HADOOP-1298', 'HADOOP-1701']
['SPARK-15156', 'SPARK-15157']


 55%|█████▌    | 1187/2145 [00:45<00:31, 30.36it/s]

['IMPALA-4039', 'IMPALA-5398']
['YARN-321', 'YARN-374']
['TUSCANY-2781', 'TUSCANY-2782']


 56%|█████▌    | 1197/2145 [00:45<00:35, 26.73it/s]

['HADOOP-7230', 'HDFS-1844']
['HDFS-3486', 'HDFS-3627']


 56%|█████▋    | 1211/2145 [00:46<00:32, 28.75it/s]

['KYLIN-1095', 'KYLIN-999']
['DERBY-6945', 'DERBY-6980']
['ATLAS-2012', 'ATLAS-2084']
['IMPALA-1604', 'IMPALA-1650']


 57%|█████▋    | 1222/2145 [00:46<00:34, 26.77it/s]

['OAK-2692', 'OAK-2967']


 57%|█████▋    | 1233/2145 [00:46<00:31, 29.32it/s]

['FLINK-8543', 'FLINK-8939']
['DRILL-5470', 'DRILL-5590']
['TUSCANY-2784', 'TUSCANY-2785']
['SPARK-10709', 'SPARK-11102']


 58%|█████▊    | 1245/2145 [00:47<00:29, 30.95it/s]

['NUTCH-1785', 'NUTCH-2032']
['HBASE-7334', 'HBASE-7386']
['CAMEL-507', 'CAMEL-872']


 59%|█████▉    | 1267/2145 [00:48<00:29, 29.50it/s]

['HIVE-10698', 'HIVE-9897']
['HARMONY-2066', 'HARMONY-2519']
['SOLR-4562', 'SOLR-4564']
['FLUME-27', 'FLUME-73']


 60%|█████▉    | 1278/2145 [00:48<00:29, 29.51it/s]

['IGNITE-2294', 'IGNITE-4268']
['SPARK-4521', 'SPARK-6607']
['GROOVY-8776', 'GROOVY-8777']


 60%|█████▉    | 1286/2145 [00:48<00:31, 27.60it/s]

['IGNITE-2294', 'IGNITE-4269']
['IMPALA-8526', 'IMPALA-8527']
['DRILL-4876', 'DRILL-4948']


 60%|██████    | 1294/2145 [00:49<00:26, 31.68it/s]

['HADOOP-6541', 'ZOOKEEPER-364']
['HIVE-16908', 'HIVE-17369']
['AIRFLOW-1805', 'AIRFLOW-1819']


 61%|██████    | 1306/2145 [00:49<00:27, 30.75it/s]

['NIFI-5622', 'NIFI-5623']
['MESOS-2815', 'MESOS-2829']
['MESOS-5332', 'MESOS-5361']
['HADOOP-10097', 'HIVE-5583']


 61%|██████▏   | 1316/2145 [00:49<00:31, 26.01it/s]

['BIGTOP-713', 'BIGTOP-834']
['NUMBERS-30', 'NUMBERS-54']


 62%|██████▏   | 1323/2145 [00:50<00:30, 27.31it/s]

['INFRA-16446', 'INFRA-16455']


 62%|██████▏   | 1334/2145 [00:50<00:27, 29.07it/s]

['PIG-3015', 'PIG-3111']
['RANGER-1738', 'RANGER-1935']
['KUDU-1535', 'KUDU-2604']


 63%|██████▎   | 1344/2145 [00:50<00:31, 25.77it/s]

['KAFKA-7957', 'KAFKA-7988']


 63%|██████▎   | 1354/2145 [00:51<00:31, 24.84it/s]

['LOG4J2-442', 'LOG4J2-457']


 64%|██████▎   | 1364/2145 [00:51<00:28, 27.47it/s]

['MESOS-3470', 'MESOS-3949']
['NETBEANS-2413', 'NETBEANS-53']
['NIFI-376', 'NIFI-609']


 65%|██████▍   | 1386/2145 [00:52<00:33, 22.85it/s]

['CASSANDRA-10095', 'CASSANDRA-10166']


 65%|██████▌   | 1396/2145 [00:53<00:29, 25.04it/s]

['HBASE-5305', 'HBASE-5443']
['AXIOM-66', 'AXIS2-1937']


 65%|██████▌   | 1404/2145 [00:53<00:25, 29.47it/s]

['JCLOUDS-765', 'JCLOUDS-774']
['SPARK-1121', 'SPARK-1441']
['ARTEMIS-687', 'ARTEMIS-699']
['IMPALA-1621', 'IMPALA-2241']


 66%|██████▌   | 1418/2145 [00:53<00:27, 26.03it/s]

['LENS-256', 'LENS-299']
['DRILL-3234', 'DRILL-3263']


 66%|██████▋   | 1424/2145 [00:54<00:27, 26.21it/s]

['HIVE-74', 'HIVE-824']
['HBASE-14175', 'HBASE-18844']


 67%|██████▋   | 1442/2145 [00:54<00:24, 28.58it/s]

['HIVE-20346', 'HIVE-20398']
['REEF-43', 'REEF-47']
['SPARK-20964', 'SPARK-26215']


 68%|██████▊   | 1449/2145 [00:54<00:25, 27.15it/s]

['NIFI-810', 'NIFI-891']


 68%|██████▊   | 1455/2145 [00:55<00:27, 25.45it/s]

['HIVE-2955', 'HIVE-4386']


 68%|██████▊   | 1465/2145 [00:55<00:26, 25.33it/s]

['KAFKA-7976', 'KAFKA-7988']


 69%|██████▉   | 1478/2145 [00:56<00:26, 25.24it/s]

['HIVE-645', 'HIVE-810']
['TAP5-1915', 'TAP5-1916']


 69%|██████▉   | 1487/2145 [00:56<00:22, 29.75it/s]

['HIVE-750', 'HIVE-815']
['MYFACES-1964', 'MYFACES-1965']
['CASSANDRA-6151', 'CASSANDRA-6238']
['MESOS-3057', 'MESOS-4930']


 70%|██████▉   | 1493/2145 [00:56<00:24, 27.12it/s]

['HIVE-3938', 'HIVE-4004']
['ACCUMULO-4000', 'ACCUMULO-4004']


 70%|██████▉   | 1501/2145 [00:56<00:22, 28.14it/s]

['OPENEJB-1065', 'OPENEJB-1066']
['CASSANDRA-13323', 'CASSANDRA-9289']


 70%|███████   | 1511/2145 [00:57<00:22, 27.71it/s]

['ATLAS-286', 'ATLAS-287']
['MESOS-2829', 'MESOS-2831']


 71%|███████   | 1518/2145 [00:57<00:23, 26.69it/s]

['HAWQ-289', 'HAWQ-290']


 71%|███████   | 1525/2145 [00:57<00:21, 28.67it/s]

['FALCON-1515', 'FALCON-1541']
['SPARK-10979', 'SPARK-9318']
['HARMONY-2530', 'HARMONY-3148']


 72%|███████▏  | 1536/2145 [00:58<00:21, 27.76it/s]

['DRILL-6167', 'DRILL-6168']
['NETBEANS-1474', 'NETBEANS-4192']


 72%|███████▏  | 1546/2145 [00:58<00:22, 26.08it/s]

['ARTEMIS-1425', 'ARTEMIS-1480']


 73%|███████▎  | 1562/2145 [00:59<00:21, 27.55it/s]

['SPARK-5403', 'SPARK-926']
['SPARK-34365', 'SPARK-34378']
['AMBARI-17107', 'AMBARI-17339']
['OAK-7728', 'OAK-7837']
['BEAM-6025', 'BEAM-6104']


 73%|███████▎  | 1574/2145 [00:59<00:19, 29.36it/s]

['PDFBOX-1668', 'PDFBOX-1726']
['HIVE-1681', 'HIVE-1710']


 74%|███████▎  | 1578/2145 [00:59<00:18, 31.25it/s]

['THRIFT-3207', 'THRIFT-3736']
['MAPREDUCE-6304', 'YARN-796']


 74%|███████▍  | 1585/2145 [01:00<00:20, 27.17it/s]

['HIVE-15489', 'HIVE-16336']
['DIRSTUDIO-452', 'DIRSTUDIO-453']
['GEODE-725', 'GEODE-768']
['HARMONY-2219', 'HARMONY-2366']


 74%|███████▍  | 1598/2145 [01:00<00:18, 29.92it/s]

['WICKET-6221', 'WICKET-6222']
['SPARK-17312', 'SPARK-17313']


 75%|███████▌  | 1609/2145 [01:00<00:18, 29.48it/s]

['DRILL-600', 'DRILL-746']
['HIVE-19064', 'HIVE-23172']
['SOLR-10079', 'SOLR-10354']


 76%|███████▌  | 1621/2145 [01:01<00:19, 26.71it/s]

['IMPALA-3189', 'IMPALA-5020']
['GROOVY-8008', 'GROOVY-8505']
['FLINK-18639', 'FLINK-18806']
['CASSANDRA-7507', 'CASSANDRA-7579']


 76%|███████▋  | 1637/2145 [01:01<00:18, 27.89it/s]

['NETBEANS-4071', 'NETBEANS-4134']
['DERBY-6019', 'DERBY-6174']


 77%|███████▋  | 1646/2145 [01:02<00:15, 33.24it/s]

['OFBIZ-7016', 'OFBIZ-7970']
['SOLR-8674', 'SOLR-9222']
['RATIS-11', 'RATIS-259']
['HADOOP-6562', 'HDFS-1466']
['SPARK-11137', 'SPARK-11139']
['SPARK-27296', 'SPARK-30423']


 77%|███████▋  | 1654/2145 [01:02<00:15, 30.80it/s]

['CAMEL-5599', 'CAMEL-6840']


 78%|███████▊  | 1665/2145 [01:02<00:18, 26.09it/s]

['THRIFT-1921', 'THRIFT-1927']


 78%|███████▊  | 1675/2145 [01:03<00:18, 25.20it/s]

['TIKA-2804', 'TIKA-2824']
['CRAIL-103', 'CRAIL-104']


 78%|███████▊  | 1683/2145 [01:03<00:17, 27.07it/s]

['CB-4341', 'CB-4348']
['MYRIAD-37', 'MYRIAD-83']


 79%|███████▊  | 1689/2145 [01:03<00:17, 25.59it/s]

['NUTCH-422', 'NUTCH-809']
['FLINK-13202', 'FLINK-13242']


 79%|███████▉  | 1700/2145 [01:04<00:16, 26.42it/s]

['CXF-6900', 'SANTUARIO-442']
['GROOVY-8299', 'GROOVY-8989']
['BEAM-7127', 'BEAM-7141']


 80%|███████▉  | 1707/2145 [01:04<00:15, 27.46it/s]

['SPARK-2290', 'SPARK-2454']
['GEODE-8567', 'GEODE-8573']
['LANG-1038', 'LANG-1252']


 80%|████████  | 1717/2145 [01:04<00:12, 34.38it/s]

['CXF-4592', 'CXF-4599']
['DERBY-1492', 'DERBY-396']
['YARN-4833', 'YARN-5484']
['FLUME-2286', 'FLUME-2921']


 81%|████████  | 1729/2145 [01:05<00:15, 27.72it/s]

['JCR-1713', 'TUSCANY-2533']


 81%|████████  | 1736/2145 [01:05<00:15, 26.90it/s]

['CURATOR-115', 'CURATOR-116']


 81%|████████▏ | 1744/2145 [01:05<00:13, 29.64it/s]

['MNG-5623', 'MNG-5626']
['AMQ-5646', 'AMQ-5661']
['LOG4J2-442', 'LOG4J2-489']


 82%|████████▏ | 1752/2145 [01:05<00:11, 33.06it/s]

['DERBY-1624', 'DERBY-681']
['THRIFT-4550', 'THRIFT-4766']
['SOLR-7571', 'SOLR-7572']
['DAEMON-280', 'DAEMON-307']
['HBASE-6572', 'HDFS-4672']


 82%|████████▏ | 1760/2145 [01:06<00:11, 32.26it/s]

['MPMD-40', 'MPMD-80']
['CAMEL-2421', 'CAMEL-2459']


 82%|████████▏ | 1768/2145 [01:06<00:11, 32.73it/s]

['DRILL-220', 'DRILL-456']
['HDFS-2576', 'HDFS-4778']
['FLUME-454', 'FLUME-503']


 83%|████████▎ | 1779/2145 [01:06<00:14, 25.72it/s]

['ZEPPELIN-1674', 'ZEPPELIN-1791']


 83%|████████▎ | 1791/2145 [01:07<00:14, 24.45it/s]

['SPARK-17296', 'SPARK-17384']


 84%|████████▍ | 1797/2145 [01:07<00:14, 24.85it/s]

['PDFBOX-1094', 'PDFBOX-1466']
['DRILL-3954', 'DRILL-3955']


 84%|████████▍ | 1809/2145 [01:08<00:12, 27.56it/s]

['DRILL-1487', 'DRILL-1908']
['HDFS-1150', 'HDFS-1326']
['TS-3082', 'TS-4468']


 85%|████████▍ | 1818/2145 [01:08<00:13, 24.20it/s]

['TAP5-2228', 'TAP5-2230']


 85%|████████▌ | 1824/2145 [01:08<00:13, 24.09it/s]

['OOZIE-2338', 'OOZIE-3432']


 86%|████████▌ | 1834/2145 [01:09<00:12, 25.14it/s]

['NETBEANS-2888', 'NETBEANS-2951']
['HADOOP-9008', 'YARN-207']
['TAVERNA-384', 'TAVERNA-488']


 86%|████████▌ | 1842/2145 [01:09<00:11, 27.02it/s]

['CAMEL-1276', 'CAMEL-2565']


 86%|████████▋ | 1852/2145 [01:09<00:10, 28.24it/s]

['INLONG-156', 'INLONG-78']
['LOG4J2-438', 'LOG4J2-457']


 87%|████████▋ | 1859/2145 [01:09<00:09, 29.50it/s]

['FLINK-11911', 'FLINK-15904']
['ASTERIXDB-1082', 'ASTERIXDB-1089']


 87%|████████▋ | 1867/2145 [01:10<00:10, 27.59it/s]

['INFRA-10304', 'INFRA-8147']
['YARN-5992', 'YARN-6000']
['MECLIPSE-576', 'MECLIPSE-621']
['TS-4893', 'TS-4894']
['SOLR-914', 'SOLR-924']


 88%|████████▊ | 1877/2145 [01:10<00:08, 31.50it/s]

['TUSCANY-2782', 'TUSCANY-2784']
['TS-5040', 'TS-5058']


 88%|████████▊ | 1892/2145 [01:11<00:09, 27.35it/s]

['AXIS2C-1156', 'AXIS2C-1220']


 88%|████████▊ | 1898/2145 [01:11<00:09, 25.43it/s]

['ASTERIXDB-2476', 'ASTERIXDB-2498']
['ARROW-6821', 'ARROW-7501']
['MNG-3391', 'MNG-3483']


 89%|████████▉ | 1907/2145 [01:11<00:07, 31.51it/s]

['AURORA-1014', 'AURORA-1824']
['HIVE-1750', 'HIVE-1770']
['IMPALA-4923', 'IMPALA-5150']
['NUTCH-769', 'NUTCH-770']


 90%|████████▉ | 1929/2145 [01:12<00:07, 28.92it/s]

['SOLR-5287', 'SOLR-791']
['NUTCH-1916', 'NUTCH-1931']
['IOTDB-519', 'IOTDB-684']


 90%|█████████ | 1936/2145 [01:12<00:06, 29.95it/s]

['HADOOP-9008', 'MAPREDUCE-4780']
['SAMZA-560', 'SAMZA-723']
['HDFS-1047', 'MAPREDUCE-1613']
['KAFKA-1554', 'KAFKA-2012']


 91%|█████████ | 1944/2145 [01:12<00:06, 30.85it/s]

['HDFS-4163', 'MAPREDUCE-4780']


 91%|█████████▏| 1959/2145 [01:13<00:06, 27.36it/s]

['AIRFLOW-2650', 'AIRFLOW-455']
['DERBY-1537', 'DERBY-645']
['TUSCANY-2781', 'TUSCANY-2783']
['KARAF-5628', 'KARAF-6377']


 92%|█████████▏| 1967/2145 [01:13<00:06, 27.62it/s]

['KAFKA-6127', 'KAFKA-6446']


 92%|█████████▏| 1976/2145 [01:14<00:07, 23.30it/s]

['AMQ-3573', 'AMQ-3574']


 93%|█████████▎| 1992/2145 [01:14<00:06, 23.62it/s]

['ZOOKEEPER-1676', 'ZOOKEEPER-2432']
['OPENJPA-2304', 'OPENJPA-2320']


 93%|█████████▎| 2004/2145 [01:15<00:04, 28.32it/s]

['HBASE-2514', 'HBASE-2681']
['GIRAPH-211', 'GIRAPH-262']
['MAPREDUCE-4780', 'YARN-207']
['PHOENIX-3554', 'PHOENIX-4373']


 94%|█████████▍| 2015/2145 [01:15<00:04, 26.66it/s]

['HBASE-5360', 'HBASE-6223']


 94%|█████████▍| 2024/2145 [01:16<00:04, 24.54it/s]

['AVRO-1241', 'PIG-3015']
['JUDDI-569', 'JUDDI-89']


 95%|█████████▍| 2033/2145 [01:16<00:04, 24.16it/s]

['AMBARI-15484', 'AMBARI-21484']


 95%|█████████▌| 2046/2145 [01:16<00:03, 27.44it/s]

['OAK-2202', 'OAK-2305']
['MPIR-290', 'MPIR-332']


 97%|█████████▋| 2070/2145 [01:17<00:02, 30.32it/s]

['CASSANDRA-13757', 'CASSANDRA-14284']
['HIVE-13063', 'HIVE-14581']
['BIGTOP-1493', 'BIGTOP-1497']
['FELIX-3277', 'FELIX-3324']
['AIRFLOW-129', 'AIRFLOW-1527']
['MYRIAD-156', 'MYRIAD-188']


 97%|█████████▋| 2080/2145 [01:18<00:01, 33.64it/s]

['KARAF-4935', 'KARAF-4967']
['HDFS-1973', 'HDFS-2292']
['HADOOP-7853', 'ZOOKEEPER-1373']
['CAMEL-6565', 'CAMEL-7092']


 97%|█████████▋| 2088/2145 [01:18<00:02, 28.31it/s]

['DERBY-5045', 'DERBY-5124']


 98%|█████████▊| 2096/2145 [01:18<00:01, 29.11it/s]

['SOLR-8674', 'SOLR-9229']
['JUDDI-653', 'JUDDI-664']
['LOGCXX-409', 'LOGCXX-416']


 98%|█████████▊| 2104/2145 [01:19<00:01, 28.94it/s]

['LANG-332', 'LANG-503']


 99%|█████████▊| 2116/2145 [01:19<00:01, 25.35it/s]

['TUSCANY-2783', 'TUSCANY-2784']
['LANG-1040', 'LANG-1252']


 99%|█████████▉| 2124/2145 [01:19<00:00, 27.30it/s]

['HIVE-1534', 'HIVE-1621']
['TS-1489', 'TS-1840']
['NIFI-271', 'NIFI-587']


100%|█████████▉| 2135/2145 [01:20<00:00, 28.89it/s]

['MDEPLOY-57', 'MSHARED-36']
['NIFI-1933', 'NIFI-1934']


100%|██████████| 2145/2145 [01:20<00:00, 26.65it/s]

['SPARK-11628', 'SPARK-9685']
['HDFS-14796', 'HDFS-14797']
['NETBEANS-2494', 'NETBEANS-2978']
Left with 243295 links after removing issue-pairs with multiple link types between them





In [38]:
cl_link_df.issues.value_counts()

['HADOOP-12855', 'HADOOP-12946']     2
['CAMEL-9444', 'CAMEL-9573']         2
['MESOS-2299', 'MESOS-2301']         2
['AVRO-1965', 'HIVE-15316']          2
['HADOOP-9008', 'MAPREDUCE-4780']    2
                                    ..
['FLINK-14574', 'FLINK-15216']       1
['SENTRY-1869', 'SENTRY-872']        1
['ISIS-188', 'ISIS-52']              1
['SPARK-20073', 'SPARK-6231']        1
['FOP-1891', 'FOP-2745']             1
Name: issues, Length: 242823, dtype: int64

In [40]:
cl_link_df[(cl_link_df['issue_id_1'] == 'HADOOP-12855') | (cl_link_df['issue_id_2'] == 'HADOOP-12855')]

Unnamed: 0,name,linktype,issue_id_1,issue_id_2,issues
288833,HADOOP-12946_HADOOP-12855,Reference,HADOOP-12946,HADOOP-12855,"['HADOOP-12855', 'HADOOP-12946']"
288835,HADOOP-12855_HADOOP-12946,Reference,HADOOP-12855,HADOOP-12946,"['HADOOP-12855', 'HADOOP-12946']"
289007,HADOOP-12908_HADOOP-12855,Reference,HADOOP-12908,HADOOP-12855,"['HADOOP-12855', 'HADOOP-12908']"


In [41]:
#Multiple links complete remove
cl_link_df.drop_duplicates(subset=['issues'], inplace=True)
print(f'Left with {len(cl_link_df)} links after removing issue-pairs with the same link type with multiple entries')

cl_link_df.reset_index(inplace=True, drop=True)

Left with 242823 links after removing issue-pairs with the same link type with multiple entries
