### Caleb Klinger
### Grand Canyon University
### CST-440
### Fabio Marcos De Abreu Santos
### Mar 10, 2024


## GitHub Pull Request Miner

### Overview

This Python application mines a GitHub repository to retrieve information on closed pull requests (PRs), including PR number, issue number, file names attached to the PR, date committed, author, title, body, and comments. It also attempts to predict who will close a PR based on the assumption that the last commenter is the person who closes it. The data is saved into a CSV file for further analysis.
Prerequisites

- Python 3.x installed on your machine
- pip for installing Python packages
- A GitHub account and a Personal Access Token (PAT) with at least repo scope permissions for accessing the GitHub API

In [16]:
import logging

# Setup logging configuration
logging.basicConfig(filename='app_activity.log',
                    filemode='a',
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.DEBUG)

logging.info('Application start')


In [17]:
import csv
import requests

# GitHub API setup
GITHUB_TOKEN = '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'
GITHUB_REPO = 'lencx/ChatGPT'  # Format: 'owner/repository_name'
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'}
PR_ENDPOINT = f'https://api.github.com/repos/{GITHUB_REPO}/pulls'
PARAMS = {'state': 'closed', 'per_page': 100}  # Fetch 100 PRs per API call

def fetch_prs():
    prs = []
    page = 1
    while len(prs) < 1000:
        logging.info(f'Fetching PRs page {page}')
        response = requests.get(PR_ENDPOINT, headers=HEADERS, params={**PARAMS, 'page': page})
        if response.status_code != 200:
            logging.error(f'Failed to fetch PRs, status code: {response.status_code}')
            break  # Stop if we encounter an error
        batch = response.json()
        if not batch:
            logging.info('No more PRs to fetch')
            break  # Stop if there are no more PRs to fetch
        prs.extend(batch)
        logging.debug(f'Fetched {len(batch)} PRs')
        page += 1
    return prs[:1000]  # Return the first 1000 PRs

def fetch_pr_details(pr):
    logging.debug(f'Fetching details for PR #{pr["number"]}')
    # Fetch additional details like comments and attached files
    files_response = requests.get(pr['url'] + '/files', headers=HEADERS)
    comments_response = requests.get(pr['url'] + '/comments', headers=HEADERS)
    
    files = [file['filename'] for file in files_response.json()]
    comments = [comment['body'] for comment in comments_response.json()]
    closed_at = pr.get('closed_at', 'Not Closed')  # Get 'closed_at', or use 'Not Closed' if none
    
    return {
        'PR Number': pr['number'],
        'Issue Number': pr['head']['ref'],  # This might not accurately reflect the issue number; adjust as needed.
        'Files': ', '.join(files),
        'Date Committed': pr['created_at'],
        'Date Closed': closed_at,  # Include the 'Date Closed' information
        'Author': pr['user']['login'],
        'Title': pr['title'],
        'Body': pr['body'],
        'Comments': ' | '.join(comments)  # Concatenate comments, separated by a pipe
    }


def save_to_csv(pr_details):
    keys = pr_details[0].keys()
    with open('pr_details_with_last_commenter.csv', 'w', newline='') as file:
        dict_writer = csv.DictWriter(file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(pr_details)

def main():
    prs = fetch_prs()
    pr_details = [fetch_pr_details(pr) for pr in prs]
    save_to_csv(pr_details)

if __name__ == '__main__':
    main()

In [18]:
logging.info('Application shutdown')


In [19]:
import csv
from datetime import datetime

def load_csv_data(file_path):
    data = []
    with open(file_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            data.append(row)
    return data

def find_most_common_author(data):
    authors_count = {}
    for row in data:
        author = row['Author']
        if author in authors_count:
            authors_count[author] += 1
        else:
            authors_count[author] = 1
    
    most_common_author = max(authors_count, key=authors_count.get)
    return most_common_author, authors_count[most_common_author]


In [20]:
def calculate_average_time_to_close(data):
    total_time = 0
    count = 0
    for row in data:
        try:
            created_at = datetime.strptime(row['Date Committed'], "%Y-%m-%dT%H:%M:%SZ")
            closed_at = datetime.strptime(row['Date Closed'], "%Y-%m-%dT%H:%M:%SZ")  # Adjusted to the correct column name
            total_time += (closed_at - created_at).total_seconds()
            count += 1
        except KeyError as e:
            print(f"Missing data for row: {e}")
        except ValueError as e:
            print(f"Error parsing dates for row: {e}")

    if count == 0:
        return 0
    average_time_seconds = total_time / count
    return average_time_seconds / 3600  # Convert to hours for readability


In [21]:
file_path = '/home/lettuce/SchoolCode/Analytics for Dynamic Social Network (440)/week 12/pr_details_with_last_commenter.csv'
data = load_csv_data(file_path)
most_common_author, count = find_most_common_author(data)
average_time_to_close = calculate_average_time_to_close(data)

from collections import defaultdict

def pr_distribution_over_time(data):
    monthly_pr_count = defaultdict(int)
    for row in data:
        date = datetime.strptime(row['Date Committed'], "%Y-%m-%dT%H:%M:%SZ")
        month_key = date.strftime('%Y-%m')  # Group by year and month
        monthly_pr_count[month_key] += 1
    return dict(monthly_pr_count)

# Assuming you've already loaded the data with load_csv_data
pr_monthly_distribution = pr_distribution_over_time(data)
for month, count in sorted(pr_monthly_distribution.items()):
    print(f"{month}: {count} PRs")


print(f"Most common PR author: {most_common_author} with {count} PRs")
print(f"Average time to close a PR: {average_time_to_close} hours")


2022-12: 50 PRs
2023-01: 31 PRs
2023-02: 30 PRs
2023-03: 11 PRs
2023-04: 3 PRs
2023-05: 3 PRs
2023-06: 1 PRs
2023-07: 4 PRs
2023-09: 1 PRs
2023-10: 2 PRs
2023-12: 1 PRs
2024-01: 1 PRs
2024-03: 2 PRs
Most common PR author: lencx with 2 PRs
Average time to close a PR: 79.61062698412698 hours


### Links
https://github.com/lencx/ChatGPT 
