### Retrieve repo info from the crawler config file

In [None]:
# Crawler config file. Each line holds four whitespace-separated fields:
# path, key, tag and url (this is how get_config_repos unpacks a line).
CONFIG_FILE = 'icse_repos.config'

def get_config_repos():
    """Parse the crawler config file into a list of repo records.

    Each non-blank line of ``CONFIG_FILE`` must hold exactly four
    whitespace-separated fields: path, key, tag and url.  Blank or
    whitespace-only lines (e.g. a trailing empty line) are skipped instead
    of aborting the parse.

    Returns:
        A list of dicts with the keys ``'path'``, ``'key'``, ``'tag'`` and
        ``'url'``, in file order.

    Raises:
        ValueError: if a non-blank line does not have exactly four fields.
    """
    config_repos = []
    with open(CONFIG_FILE, 'r') as config_file:
        for line in config_file:
            fields = line.split()
            if not fields:
                # Nothing but whitespace on this line; not a repo entry.
                continue
            path, key, tag, url = fields
            config_repos.append({
                'path': path,
                'key': key,
                'tag': tag,
                'url': url,
            })
    return config_repos

def writeout_config(config_repos):
    """Overwrite the crawler config file with the given repo records.

    Every record becomes one line holding its path, key, tag and url
    separated by tabs; records are written in the order given.
    """
    field_names = ('path', 'key', 'tag', 'url')
    with open(CONFIG_FILE, 'w') as out:
        for entry in config_repos:
            row = '%s\t%s\t%s\t%s' % tuple(entry[f] for f in field_names)
            out.write(row + '\n')

### Update tags of the config file to a (good) guess in batch
**NOTE: Back up the original config file before executing this cell!**

In [None]:
import re

config_repos = get_config_repos()

# Collect every repo path mentioned at the start of a line in crawl.errors.
# The dots of the '../../' prefix are escaped so only the literal prefix
# matches; the match runs to the end of the line (newline excluded).
failed_paths = set()
with open('crawl.errors', 'r') as errors:
    for line in errors:
        m = re.match(r'\.\./\.\./dataset/icse/.+', line)
        if m:
            failed_paths.add(m.group())

# Set the tag of every config entry whose path was listed to the guess
# 'trunk'.  All entries with a matching path are updated.
for repo in config_repos:
    if repo['path'] in failed_paths:
        repo['tag'] = 'trunk'

writeout_config(config_repos)

## Check mismatches between git repos and issue dirs

In [None]:
import os

PATH = '../../dataset/icse'

# Only sub-directories of PATH are considered; plain files are ignored.
subdirs = [d for d in os.listdir(PATH)
           if os.path.isdir(os.path.join(PATH, d))]

# A '<name>-issues' directory is recorded under '<name>' so that the two
# sets can be compared directly; every other directory counts as a repo.
issues_dirs = {d[:-len('-issues')] for d in subdirs if d.endswith('-issues')}
repos = {d for d in subdirs if not d.endswith('-issues')}

print('# git repos = %d\t# issues dirs = %d' %
      (len(repos), len(issues_dirs)))
print('Git repos without issues:', repos - issues_dirs)
print('Issues without git repos:', issues_dirs - repos)

## Count the number of issues for each git repo
*Run the above cell first to construct the issues dir set.*

In [None]:
from sh.contrib import git
import os
import re

# Maps a repo directory name to its number of distinct issues; filled by the
# loop over `repos` further down in this cell.
dir_issue_count = { }

def count_issues(dir_name):
    """Return the number of distinct issue keys found for a repo.

    Lists the files in ``<PATH>/<dir_name>-issues`` and matches each file
    name against the module-level ``pattern``; group 1 of every match is
    taken as the issue key, and duplicates are counted once.
    """
    issues_dir = os.path.join(PATH, dir_name + '-issues')
    matches = (pattern.match(entry) for entry in os.listdir(issues_dir))
    return len({m.group(1) for m in matches if m})

def count_commits(dir_name):
    """Return the number of lines printed by ``git log --oneline`` for a repo.

    The repo is ``<PATH>/<dir_name>`` (with ``~`` expanded), and git is
    pointed at it through the ``-C`` option.
    """
    repo_dir = os.path.expanduser(os.path.join(PATH, dir_name))
    log_lines = git.bake("-C", repo_dir).log('--oneline')
    return sum(1 for _ in log_lines)

# Issue file names look like '<digits/a-f chars>-<KEY>-<number>.xml';
# group 1 captures the '<KEY>-<number>' issue key.
pattern = re.compile(r'[\dabcdef]+-([A-Z\d]+-[\d]+)\.xml')

for d in repos:
    try:
        n_issues = count_issues(d)
    except FileNotFoundError:
        # A repo without a '<name>-issues' directory would otherwise abort
        # the whole loop; report it and move on to the next repo.
        print('Warning: no issues dir for %s, skipped' % d)
        continue
    n_commits = count_commits(d)
    # Heuristic: very few issues compared to a long commit history suggests
    # the issues were not collected completely, so the repo is left out.
    if n_issues < 100 and n_commits > 1000 and n_issues * 10 < n_commits:
        print('Warning: missing issues? %s: %d/%d' %
              (d, n_issues, n_commits))
        continue
    dir_issue_count[d] = n_issues

dir_list = [ ]

# Print repos from most to fewest issues together with a running total.
total = 0
for d, n in sorted(dir_issue_count.items(), key=lambda x: x[1], reverse=True):
    total += n
    dir_list.append([d, n])
    print(d, n, total)

### DANGEROUS: Remove small projects

In [None]:
import os
import shutil

def remove(name):
    """Delete a project: its config entry, its repo dir and its issues dir.

    Args:
        name: directory name of the repo relative to ``PATH``.

    Raises:
        ValueError: if ``name`` is empty, absolute, or resolves to ``PATH``
            itself or above it.  Such a name would make ``shutil.rmtree``
            delete the whole dataset directory or something outside it.
        OSError: propagated from ``shutil.rmtree`` (e.g. a directory is
            missing).  The config file has already been rewritten by then.
    """
    # Validate before touching anything: this function is destructive.
    normalized = os.path.normpath(name) if name else ''
    if (not name
            or os.path.isabs(name)
            or normalized in (os.curdir, os.pardir)
            or normalized.startswith(os.pardir + os.sep)):
        raise ValueError('refusing to remove unsafe project name: %r' % (name,))

    repo_path = os.path.join(PATH, name)
    # Drop the entry whose path equals the repo path, keep all the others.
    config_repos = [x for x in get_config_repos() if x['path'] != repo_path]
    writeout_config(config_repos)
    shutil.rmtree(repo_path)
    shutil.rmtree(os.path.join(PATH, name + '-issues'))

In [None]:
# One-off manual removal of a single project via remove() defined above.
remove('manifoldcf-integration-solr-3.x')

In [None]:
THRESHOLD = 100  # Projects with fewer issues than this are removed

# Pick the projects below the threshold first, then remove them one by one.
small_projects = [d for d, n in dir_issue_count.items() if n < THRESHOLD]
for d in small_projects:
    remove(d)

### Check that no selected project was dropped by the removal above

In [None]:
import sys
sys.path.append('../')
from jira_stats.process_stats import get_issue_stats

ISSUE_STATS_FILE = '../jira_stats/selected_projects.csv'

# Project keys listed in the issue stats file.
issue_stats = get_issue_stats(ISSUE_STATS_FILE)
pre_keys = {project['key'] for project in issue_stats}

# Project keys still present in the crawler config file.
post_keys = {repo['key'] for repo in get_config_repos()}

# Keys in the stats file that are missing from the config.
print(pre_keys - post_keys)