# Scan Repositories

This notebook will scan repositories in order to collect details about `wontfix` label usage

In [35]:
import csv

import pandas as pd


In [36]:
# Load the repository list; the `id` column becomes the DataFrame index
df = pd.read_csv('../data/repos.csv', index_col='id')

# Convert these strings of semicolon-separated values into proper lists.
# A missing `topics` value (NaN) becomes an empty list instead of staying NaN.
df['labels'] = df['labels'].str.split(';')
df['topics'] = df['topics'].apply(lambda x: [] if pd.isna(x) else x.split(';'))

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0_level_0,name,isFork,commits,branches,releases,forks,mainLanguage,defaultBranch,license,homepage,...,isDisabled,isLocked,languages,labels,topics,wontfixType,updatedIssuesTotal,updatedPullsTotal,wontfixIssues,wontfixPulls
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2049,rzwitserloot/lombok,False,3243,3,3,1932,Java,master,Other,https://projectlombok.org/,...,False,False,"{""Java"":4263238,""HTML"":236155,""JavaScript"":158...","[accepted, android, aop, awaiting-fix-confirma...",[],wontfix,3112,518,146,0
29945,android-async-http/android-async-http,False,899,4,3,4194,Java,master,Apache License 2.0,https://github.com/android-async-http/android-...,...,False,False,"{""Java"":311001}","[critical, documentation, duplicate, feature-r...",[],wontfix,1091,287,90,5
49010,eclipse-vertx/vert.x,False,6050,58,0,2032,Java,master,Other,http://vertx.io,...,False,False,"{""Java"":6129111,""HTML"":203}","[blocker, bug, clustering, critical, deferred,...","[concurrency, event-loop, high-performance, ht...",wontfix,2683,2471,61,16


## Fetch from GitHub

Now we can go through each repo and fetch `wontfix` issues from GitHub

In [37]:
import requests, os

# GitHub API token read from the environment; this is None when GITHUB_TOKEN is unset.
# The fetch functions in this notebook send it in the `Authorization` request header.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN')

def fetch_issues_for_repo(repo_name: str, label: str = 'wontfix', per_page: int = 100, start_page: int = 1, issue_state: str = 'all') -> list:
    """
    Fetches issues from GitHub API for a specific repo with a given label. Paginates through requests until
    the response no longer advertises a next page in its ``Link`` header. Returns a list of all matched issues.
    The issues endpoint also returns pull requests, so the result can contain both kinds of item.
    If a request fails, the response body is printed and whatever was collected so far is returned.
    :param repo_name: repository in ``owner/name`` form, e.g. ``rzwitserloot/lombok``
    :param label: label name sent as the ``labels`` filter
    :param per_page: page size sent to the API (maximum is 100)
    :param start_page: first page to request
    :param issue_state: value of the ``state`` filter, e.g. ``all`` or ``open``
    :return: list of all_issues, as decoded from the JSON responses
    """
    headers = {'Accept': 'application/vnd.github.html+json'}

    # GitHub expects "<scheme> <token>" in the Authorization header. A bare token gets the
    # Bearer scheme; a value that already carries a scheme ("Bearer ..." / "token ...") is
    # sent unchanged. Without a token the header is left out (unauthenticated request).
    token = (GITHUB_TOKEN or '').strip()
    if token:
        headers['Authorization'] = token if ' ' in token else f'Bearer {token}'

    # GitHub API URL for fetching issues from the repository
    issues_url = f'https://api.github.com/repos/{repo_name}/issues'

    all_issues = []
    current_page = start_page

    while True:
        params = {
            'labels': label,
            'state': issue_state,
            'per_page': per_page,
            'page': current_page
        }

        # Send API Request
        response = requests.get(issues_url, headers=headers, params=params)

        # Stop on a failed request, keeping the pages gathered before the failure
        if not response.ok:
            print('Failed to fetch issues:', response.content)
            break

        all_issues += response.json()

        # The Link header carries rel="next" only while more pages remain
        if 'rel="next"' not in response.headers.get('Link', ''):
            break  # End of pages
        current_page += 1

    return all_issues

In [38]:
from typing import Tuple, List

def separate_issues_from_prs(items: list) -> Tuple[List[dict], List[dict]]:
    """
    Separate a returned list of issues into "proper" Issues (which are not PRs) and Pull Requests (PRs).
    Technically all PRs are issues, so the GitHub API returns PRs as an issue, and that is what necessitates this step.
    An item counts as a PR when it has a ``pull_request`` key. The input order is kept within each list.
    :param items: issue dicts as returned by the API
    :return: (Issues, PRs)
    """
    issues = []
    prs = []
    for item in items:
        # Route each item to the list matching its kind
        bucket = prs if 'pull_request' in item else issues
        bucket.append(item)
    return issues, prs


In [39]:
# Try the two helpers on a single repository: fetch its `wontfix` items, then split them
results = fetch_issues_for_repo('jacurtis/wontfix-research', per_page=100)
res_issues, res_prs = separate_issues_from_prs(results)
        
# Report how many of each kind came back
print(f"Issues: {len(res_issues)}")
print(f"PRs: {len(res_prs)}")

Issues: 0
PRs: 0


### Steps

1. Loop through each repo in master list (repos.csv)
2. Get issues for each repo 
    - Use: `fetch_issues_for_repo()`
3. Separate issues from PRs
    - Use: `separate_issues_from_prs()`
4. Record the total issues and wontfix issues on repos.csv
    - Use: `len(issues)` to get wontfix issues
    - Use: `fetch_total_issues_and_prs_for_repo()`
5. Record the total PRs and wontfix PRs on the repos.csv
    - Use: `len(pulls)` to get the wontfix pull request total
    - Use the pull request total returned by `fetch_total_issues_and_prs_for_repo()` in step 4
6. Record each issue onto the issues.csv
7. Record each PR onto the pull-requests.csv
8. Check if loop should continue, repeating as necessary

In [40]:
def extract_total_count(link_header: str) -> int:
    """
    Extracts the total number of pages from a link header on a GitHub API response.
    If you only use per_page=1, then this can easily map to the total number of issues/prs with a single request.
    The number is read from the ``page`` query parameter of the ``rel="last"`` link, wherever that
    parameter sits in the query string.
    :param link_header: value of the ``Link`` response header; may be None or empty
    :return total_count: page number of the last page, or 0 when there is no header or no ``rel="last"`` link
    :raises ValueError: if the ``rel="last"`` link has no usable ``page`` parameter
    """
    if not link_header:
        return 0

    for part in link_header.split(','):
        if 'rel="last"' not in part:
            continue

        # Each part looks like `<url>; rel="last"`: take the URL between the angle brackets
        url = part.partition('<')[2].partition('>')[0]
        query = url.partition('?')[2]

        # Match the parameter by exact name so `per_page` is never mistaken for `page`
        for pair in query.split('&'):
            key, _, value = pair.partition('=')
            if key == 'page':
                return int(value)

        raise ValueError(f'No page parameter in rel="last" link: {part.strip()}')

    return 0

def fetch_total_issues_and_prs_for_repo(repo_name: str) -> Tuple[int, int]:
    """
    Fetches the total number of issues and pull requests for a given repository.
    Each endpoint is asked for one item per page, so the number of the last page in the ``Link``
    header equals the number of items. The issues endpoint counts pull requests too, so the
    pull request total is subtracted to get the number of "proper" issues.
    A failed request is reported with a printed message and counted as 0.
    :param repo_name: repository in ``owner/name`` form, e.g. ``rzwitserloot/lombok``
    :return total_issues, total_pulls:
    """
    issues_url = f'https://api.github.com/repos/{repo_name}/issues'
    pulls_url = f'https://api.github.com/repos/{repo_name}/pulls'

    headers = {'Accept': 'application/vnd.github+json'}

    # GitHub expects "<scheme> <token>" in the Authorization header. A bare token gets the
    # Bearer scheme; a value that already carries a scheme is sent unchanged. Without a
    # token the header is left out (unauthenticated request).
    token = (GITHUB_TOKEN or '').strip()
    if token:
        headers['Authorization'] = token if ' ' in token else f'Bearer {token}'

    params = {
        'state': 'all',
        'per_page': 1
    }

    def _count_items(url: str) -> int:
        # Count the items behind one list endpoint using a single request
        response = requests.get(url, headers=headers, params=params)
        if not response.ok:
            print('Failed to fetch totals:', response.content)
            return 0

        link_header = response.headers.get('Link', None)
        if link_header:
            return extract_total_count(link_header)

        # No Link header: everything fit on this one page, so count the items themselves
        return len(response.json())

    total_pulls = _count_items(pulls_url)
    total_issues_and_pulls = _count_items(issues_url)

    # Guard against a negative count when only the issues request failed
    total_issues = max(total_issues_and_pulls - total_pulls, 0)

    return total_issues, total_pulls

In [41]:
# Example call: (total issues, total pull requests) for a single repository
fetch_total_issues_and_prs_for_repo('rzwitserloot/lombok')

(3112, 518)

In [42]:
def update_repos(start_line: int, end_line: int, source_filename: str = '../data/raw/wontfix.csv', dest_filename: str = '../data/repos.csv', wontfix_type: str = 'wontfix') -> None:
    """
    Given a start line and an end line (both inclusive), go down the source CSV file taking the repo from
    each line, look up its pull/issue totals and its labelled issues, and append the extended row to the
    destination CSV file.
    Five values are appended to each row, in this order: the wontfix label text, total issues, total pull
    requests, number of labelled issues, number of labelled pull requests.
    Note: Line numbers are zero-indexed, so line 1 is the 2nd line in the file. This works because the first line is the header.
    :param start_line: index of the first source row to process
    :param end_line: index of the last source row to process
    :param source_filename: CSV file to read repositories from; the repo name is taken from the second column
    :param dest_filename: CSV file the extended rows are appended to
    :param wontfix_type: label text to query for, also recorded on every written row
    :return None:
    """
    # newline='' is what the csv module asks for on both ends: it lets the reader see embedded
    # newlines intact and stops the writer from emitting blank rows on Windows.
    with open(source_filename, 'r', newline='') as source_file:
        source_reader = csv.reader(source_file)
        with open(dest_filename, 'a', newline='') as dest_file:
            dest_writer = csv.writer(dest_file)
            for i, row in enumerate(source_reader):
                if i > end_line:
                    break  # Past the requested range, no need to read the rest of the file
                if i < start_line:
                    continue

                repo_name = row[1] # Repo name is in the second column
                row.append(wontfix_type) # static value to indicate the text used for the wontfix label in this repo

                # Get the total issues and PRs for the repo
                total_issues, total_pulls = fetch_total_issues_and_prs_for_repo(repo_name)
                row.append(total_issues)
                row.append(total_pulls)

                # Get the issues for the repo
                results = fetch_issues_for_repo(repo_name, label=wontfix_type, per_page=100)
                res_issues, res_prs = separate_issues_from_prs(results)
                row.append(len(res_issues))
                row.append(len(res_prs))

                # Write the new row to the file
                dest_writer.writerow(row)

                # Write the issues and PRs to separate files
                # TODO: Write the issues to a file
                # TODO: Write the PRs to a file
                
# update_repos(1,3)