# Repo Health Visualization Tool

In [None]:
# Repository to analyse. The two values are joined as "<user_name>/<repository_name>"
# when the repository object is requested from the GitHub API.
user_name = "vaadin" # Change to user name of your GitHub repository
repository_name = "flow" # Change to repository name of your GitHub repository

# Previously exported data set. If this path cannot be opened, the loading cell
# falls back to fetching everything from the GitHub API.
input_json_file = r'' # Add path to the .json file containing all issue and pull request data here

In [None]:
# Export settings.
# NOTE(review): neither constant is read in the visible part of the notebook -
# presumably they are consumed by the plotting cells further down; confirm.
# In which formats to export plot data
EXPORT_FORMAT = "both" # "csv", "json" or "both"
# If plots created using Matplotlib should be exported as .pngs
EXPORT_PNGS = True # True or False

## Imports

In [None]:
from github import Github, RateLimitExceededException # GNU licence - LGPL-3.0 and GPL-3.0 () # conda install PyGithub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import re
import json
import time
from datetime import datetime
import os
from dotenv import load_dotenv

from sklearn.model_selection import train_test_split
from simpletransformers.classification import MultiLabelClassificationModel
from transformers import AutoModel # What does this do?
from functools import partial
import sklearn 
from scipy import stats

from sklearn.feature_extraction.text import TfidfVectorizer
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from langdetect import detect
# import nltk

In [None]:
# import torch

# # Check if CUDA is available
# cuda_available = torch.cuda.is_available()
# print(f"CUDA Available: {cuda_available}")

# # Get the name of the GPU
# if cuda_available:
#     gpu_name = torch.cuda.get_device_name(0)
#     print(f"GPU: {gpu_name}")

### Notes

The microlevel metrics focus on the individual system components (programs, modules, and procedures) and require a detailed knowledge of their internal mechanisms. Examples of microlevel metrics are McCabe’s cyclomatic complexity metric [5] and Halstead’s software science measurements [6]. Macrolevel metrics, on the other hand, focus on the interconnection of the system components. Examples of these metrics include Myers qualitative coupling evaluation [4], Yin and Winchester’s interlevel measurements [7], the entropy measures of Channon [8], and the more recent technique of Henry and Kafura based on information flow [9,10].

## Variables

In [None]:
# input_pat_file = r"D:\Code\Github\private\pat.txt"
# input_pat_file = r"D:\Github\private\pat.txt"

# input_json_file = r'D:\Code\Github\SmartDelta\repo_data.json'
# input_json_file = r'D:\Code\Github\SmartDelta\repo_data_partial_20240607_103551.json'
# input_json_file = r'D:\Code\Github\SmartDelta\repo_data_partial_20240611_095503.json'
# input_json_file = r'D:\Code\Github\SmartDelta\repo_data_partial_20240702_111311.json'

# all files except the one below have reactions
# input_json_file = r'D:\Code\Github\SmartDelta\repo_data_partial_20240702_131051.json'
# input_json_file = r'D:\Github\Backups\SmartDelta\repo_data_partial_20240702_131051.json'
# input_json_file = r'D:\GitHub\SmartDelta\data\vaadin_flow_data.json'


# input_json_file = r'D:\Github\SmartDelta\repo_data.json'
# input_json_file = r'D:\GitHub\SmartDelta\repo_data_partial_20240612_090858.json'

# user_name = "vaadin"
# repository_name = "flow"

# Key under which every issue / pull request record stores its GitHub number
NUMBER = "number"

# Initialize data structures
issues_data = []  # one dict per issue, appended by fetch_issues()
pull_requests_data = []  # one dict per pull request, appended by fetch_pull_requests()
issue_pr_map = {}  # issue number (as matched by the "#<digits>" regex) -> list of PR numbers

# Totals; both start at 0 and are read by remove_duplicates()
amount_of_issues = 0
amount_of_pull_requests = 0

### Connect to Github API with a PAT

In [None]:
# Load environment variables from the .env file
load_dotenv()

# Get GITHUB_PAT from .env file
# Falls back to an empty string when the variable is not set.
GITHUB_PAT = os.getenv("GITHUB_PAT", "")

In [None]:
# with open(input_pat_file) as file:
#     GITHUB_PAT = file.read().strip()

# GITHUB_PAT defaults to "" when the environment variable is missing. An empty
# string is not a usable credential, so use an anonymous client in that case
# (much lower rate limit, but enough to look up a public repository).
if GITHUB_PAT:
    github_api = Github(GITHUB_PAT)
else:
    print("GITHUB_PAT is not set - using unauthenticated GitHub access (low rate limit).")
    github_api = Github()

# Repository handle used by fetch_issues() and fetch_pull_requests()
repo = github_api.get_repo(f"{user_name}/{repository_name}")

repo

## Methods to fetch issue and pull request data from Github repo

In [None]:
# Save all data to a .json file
def save_data(partial=False):
    """Write the collected issue / pull request data to a JSON file.

    The module-level ``issues_data``, ``pull_requests_data`` and
    ``issue_pr_map`` are written into the current working directory.

    Args:
        partial: When true, write to a timestamped
            ``repo_data_partial_<YYYYmmdd_HHMMSS>.json`` file instead of
            ``repo_data.json``.
    """
    if partial:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        target = f'repo_data_partial_{stamp}.json'
    else:
        target = 'repo_data.json'

    payload = {
        "issues": issues_data,
        "pull_requests": pull_requests_data,
        "issue_pr_map": issue_pr_map
    }

    with open(target, 'w') as output_file:
        json.dump(payload, output_file, indent=4)

    print(f"Data saved to {target}")

# Map issues to PRs and save the data to issue_pr_map
def map_issues_to_prs():
    """Link issues to the pull requests that mention them.

    Every ``#<digits>`` reference found in a pull request's title or
    description is recorded in the module-level ``issue_pr_map`` as
    ``issue number (str) -> [pull request numbers]``. The keys are the
    strings matched by the regex, not integers.

    The function is idempotent: a pull request is listed at most once per
    issue, even if it mentions the issue in both title and description, or
    if the function runs again on an already populated map.
    """
    for pr in pull_requests_data:
        pr_number = pr['number']
        for field in ('title', 'description'):
            text = pr[field]
            if not text:  # title/description may be None or empty
                continue
            for issue_number in re.findall(r'#(\d+)', text):
                linked_prs = issue_pr_map.setdefault(issue_number, [])
                if pr_number not in linked_prs:
                    linked_prs.append(pr_number)


# # Fetch reactions count for an object
# def get_reactions_count(obj):
#     try:
#         reactions = obj.get_reactions()
#         return sum(1 for _ in reactions)
#     except Exception as e:
#         print(f"Error fetching reactions for {obj}: {e}")
#         return -1

# Fetch all issues
def fetch_issues(issues_data=None):
    """Fetch all issues of ``repo`` that are not yet stored in ``issues_data``.

    New records are appended to ``issues_data`` in place. Issues whose number
    is already present are skipped, so the function can resume an interrupted
    run without creating duplicates or dropping lower-numbered issues that
    were never fetched.

    When the GitHub rate limit is hit, ``rate_limit_exceed_handling()`` is
    called (it saves partial data and sleeps) and the listing is then retried
    with the same ``issues_data``.

    Args:
        issues_data: Already fetched issue records, either a list of dicts or
            a dict of dicts. ``None`` starts from an empty list.

    Returns:
        The ``issues_data`` container including the newly fetched records.

    Raises:
        TypeError: If ``issues_data`` is non-empty and neither a list nor a dict.
    """
    global amount_of_issues  # publish the total so other cells can read it

    if issues_data is None:  # avoid a mutable default shared between calls
        issues_data = []

    # Numbers that are already stored and must not be fetched again
    if len(issues_data) < 1:
        known_numbers = set()  # No issue data found
    elif isinstance(issues_data, dict):  # Dict of dicts
        known_numbers = {issue[NUMBER] for issue in issues_data.values()}
    elif isinstance(issues_data, list):  # List of dicts
        known_numbers = {issue[NUMBER] for issue in issues_data}
    else:
        raise TypeError("issues_data must be either a dict or a list.")
    latest_issue = max(known_numbers, default=0)

    while True:
        try:
            issues = repo.get_issues(state="all")
            # totalCount avoids walking the whole paginated listing just to count it
            amount_of_issues = issues.totalCount

            print(f"Amount of issues: {amount_of_issues} | Length of issue data: {len(issues_data)} | Latest issue: {latest_issue}")
            print(f"Should fetch approximately {amount_of_issues - len(issues_data)} issues\n")

            for issue in issues:
                if issue.number in known_numbers:
                    continue  # already stored

                # NOTE(review): records are appended, which only works for a
                # list; a non-empty dict container would fail here, as it did
                # before this change.
                issues_data.append({
                    "number": issue.number,
                    "title": issue.title,
                    "state": issue.state,
                    "created_at": issue.created_at.isoformat(),
                    "closed_at": issue.closed_at.isoformat() if issue.closed_at else None,
                    "user": issue.user.login,
                    "comments": issue.comments,
                    "description": issue.body,
                    "description_length": len(issue.body) if issue.body else 0,
                    "labels": [label.name for label in issue.labels],
                    "issue_author_association": issue.raw_data.get("author_association", "NONE")#,
                    # "reactions": get_reactions_count(issue)
                })
                known_numbers.add(issue.number)
                latest_issue = max(latest_issue, issue.number)
            return issues_data
        except RateLimitExceededException as e:
            print(f"\n{e}\n")
            rate_limit_exceed_handling()
            # Loop again: the listing restarts, stored issues are skipped

# Fetch all pull requests
def fetch_pull_requests(pull_requests_data=None):
    """Fetch all pull requests of ``repo`` not yet stored in ``pull_requests_data``.

    For every new pull request its changed files and commits are fetched as
    well, and one record is appended to ``pull_requests_data`` in place. Pull
    requests whose number is already present are skipped, so an interrupted
    run can be resumed without duplicates.

    When the GitHub rate limit is hit, ``rate_limit_exceed_handling()`` is
    called (it saves partial data and sleeps) and the listing is then retried
    with the same ``pull_requests_data``.

    Args:
        pull_requests_data: Already fetched pull request records, either a
            list of dicts or a dict of dicts. ``None`` starts from an empty list.

    Returns:
        The ``pull_requests_data`` container including the new records.

    Raises:
        TypeError: If ``pull_requests_data`` is non-empty and neither a list
            nor a dict.
    """
    global amount_of_pull_requests  # publish the total so other cells can read it

    if pull_requests_data is None:  # avoid a mutable default shared between calls
        pull_requests_data = []

    # Numbers that are already stored and must not be fetched again
    if len(pull_requests_data) < 1:
        known_numbers = set()  # No pull request data found
    elif isinstance(pull_requests_data, dict):  # Dict of dicts
        known_numbers = {pr[NUMBER] for pr in pull_requests_data.values()}
    elif isinstance(pull_requests_data, list):  # List of dicts
        known_numbers = {pr[NUMBER] for pr in pull_requests_data}
    else:
        raise TypeError("pull_requests_data must be either a dict or a list.")
    latest_pr = max(known_numbers, default=0)

    while True:
        try:
            prs = repo.get_pulls(state="all")
            # totalCount avoids walking the whole paginated listing just to count it
            amount_of_pull_requests = prs.totalCount

            print(f"Amount of pull requests: {amount_of_pull_requests} | Length of pull requests: {len(pull_requests_data)} | Latest pull request: {latest_pr}")
            print(f"Should fetch approximately {amount_of_pull_requests - len(pull_requests_data)} pull requests\n")

            for pr in prs:
                if pr.number in known_numbers:
                    continue  # already stored

                files_changed = [
                    {"filename": f.filename, "additions": f.additions, "deletions": f.deletions}
                    for f in pr.get_files()
                ]
                commit_data = [{
                    "sha": commit.sha,
                    "author": commit.commit.author.name,
                    "date": commit.commit.author.date.isoformat(),
                    "message": commit.commit.message
                } for commit in pr.get_commits()]

                # NOTE(review): records are appended, which only works for a
                # list; a non-empty dict container would fail here, as it did
                # before this change.
                pull_requests_data.append({
                    "number": pr.number,
                    "title": pr.title,
                    "state": pr.state,
                    "created_at": pr.created_at.isoformat(),
                    "merged_at": pr.merged_at.isoformat() if pr.merged_at else None,
                    "closed_at": pr.closed_at.isoformat() if pr.closed_at else None,
                    "user": pr.user.login,
                    "comments": pr.comments,
                    "review_comments": pr.review_comments,
                    "description": pr.body,
                    "description_length": len(pr.body) if pr.body else 0,
                    "additions": pr.additions,
                    "deletions": pr.deletions,
                    "changed_files": pr.changed_files,
                    "files": files_changed,
                    "commits": commit_data,
                    "labels": [label.name for label in pr.labels],
                    "mergeable_state": pr.mergeable_state
                })
                known_numbers.add(pr.number)
                latest_pr = max(latest_pr, pr.number)
            return pull_requests_data
        except RateLimitExceededException as e:
            print(f"\n{e}\n")
            rate_limit_exceed_handling()
            # Loop again: the listing restarts, stored pull requests are skipped

# handle rate limit exceed exceptions
# Alternative solution would be to keep track of rate limits: https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api
def rate_limit_exceed_handling():
    """Save partial data, then block until the GitHub rate limit has reset.

    The wait is the time until ``github_api.rate_limiting_resettime`` plus a
    10 second buffer, and never negative. The caller is responsible for
    retrying its request afterwards.
    """
    print("Rate limit exceeded again. Saving partial data...")
    save_data(partial=True)
    print("Partial data saved. Waiting until reset...")

    # Seconds left until the limit resets, padded by a 10 second buffer
    pause = max(0, github_api.rate_limiting_resettime - time.time() + 10)
    print(f"Sleeping for {pause:.0f} seconds")
    time.sleep(pause)

    # Retry after sleep
    print("\nRetrying...")

#### Check how much longer to wait before next request

In [None]:
# Report how long the rate limit still needs to reset. This cell only prints
# the remaining time (including the 10 second buffer); it does not sleep.
reset_time = github_api.rate_limiting_resettime
sleep_time = max(0, reset_time - time.time() + 10)  # Adding a buffer of 10 seconds
print(f"Sleeping for {sleep_time:.0f} seconds")

In [None]:
from collections import defaultdict

def remove_duplicates(data, data_type):
    """Remove records with a repeated ``'number'`` from ``data`` in place.

    The first record seen for each number is kept and the original order is
    preserved. Details of every duplicated number are printed before removal.

    Args:
        data: List of issue / pull request dicts, each with a ``'number'`` key.
        data_type: Label used in the printed messages, e.g. ``"issue"`` or
            ``"pull request"``.

    Returns:
        None. ``data`` is modified in place.
    """
    # Group the records by number; dicts keep first-seen order
    data_tracker = defaultdict(list)
    for entry in data:
        data_tracker[entry['number']].append(entry)

    # Check for duplicates directly instead of comparing against a total count
    if len(data_tracker) == len(data):
        print("No duplicates to remove")
        return

    print(f"Number of {data_type}s before duplicate removal: {len(data)}")
    print(f"Removing duplicates {data_type}s ...")
    print(f"Number of unique {data_type}s: {len(data_tracker)}")
    print(f"Number of duplicate {data_type}s: {len(data) - len(data_tracker)}")

    # Identify duplicates
    duplicates = {number: entries for number, entries in data_tracker.items() if len(entries) > 1}
    print(f"Duplicates: {list(duplicates.keys())}")

    # Print every copy so differences between duplicates can be inspected
    print(f"\nDetails of duplicate {data_type}s:")
    for number, entries in duplicates.items():
        print(f"{data_type[:1].upper()}{data_type[1:]} number: {number}")
        for entry in entries:
            print(entry)

    # Keep only the first record per number; slice assignment updates the
    # caller's list object rather than rebinding a local name
    data[:] = [entries[0] for entries in data_tracker.values()]

    print(f"Done removing duplicate {data_type}s. New number of {data_type}s: {len(data)}")

## Load data from .json

In [None]:
# Load a previously exported data set; if the file does not exist, fetch
# everything from the GitHub API instead.
try:
    with open(input_json_file) as input_file:
        data = json.load(input_file)
except FileNotFoundError:
    print("No existing data file found. Fetching data from scratch. This can be very time consuming.\n")

    # Issues first ...
    print("\nFetching issues ...")
    fetch_issues(issues_data)
    print("Done fetching issues")
    remove_duplicates(issues_data, "issue")

    # ... then pull requests
    print("\nNow fetching pull requests ...")
    fetch_pull_requests(pull_requests_data)
    print("Done fetching pull requests")
    remove_duplicates(pull_requests_data, "pull request")

    # Map issues to pull requests
    print("\nMapping issues to pull requests...")
    map_issues_to_prs()
    print("Done mapping issues to pull requests")

    # Save the result under a timestamped name (never overwrites repo_data.json)
    # save_data()
    save_data(partial=True)

    # # Don't run with partial = False until fully tested that everything works correctly
    # print("Exporting data...")
    # save_data(partial=True)
else:
    # File loaded: take over its three top-level sections
    issues_data = data['issues']
    pull_requests_data = data['pull_requests']
    issue_pr_map = data['issue_pr_map']

## Fetch all data on issues and pull requests from Github Repo

In [None]:
# if not input_json_file:
#     # lines below fetch all data - Don't run this unless you want to fetch new data (last time it took 740min 59.7s) - better way is to only fetch new data, which should be accomplished by adding the length of the issues and PR data since the order of those should always be the same (at least it was the last time) - at least that's what I thought ... but when testing it it did fetch some issues that were already fetched ... since my duplicate cleanup further down reduced the number of issues from 19072 back to 19049 (the original amount on the day I first completed fetching the data ... as of today there are 19072 issues, but the new ones didn't get fetched, so I will have to investigate why that is the case under the assumption that I can't rely on the order for issues and PRs staying the same -> TODO: extract a list with all issue numbers/ PR numbers and then only fetch the ones that are not in the list yet) Also please tell me that new issues and PRs are added to the beginning as opposed to the end, since that would mean that my code should never have worked properly

#     print("\nFetching issues ...")
#     fetch_issues(issues_data)
#     print("Done fetching issues")
#     remove_duplicates(issues_data, "issue")


#     print("\nNow fetching pull requests ...")
#     fetch_pull_requests(pull_requests_data)
#     print("Done fetching pull requests")
#     remove_duplicates(pull_requests_data, "pull request")

#     # Map issues to pull requests
#     print("\nMapping issues to pull requests...")
#     map_issues_to_prs()
#     print("Done mapping issues to pull requests")

#     # Save final data if no exception occurred
#     # save_data()
#     save_data(partial=True)

#     # # Don't run with partial = False until fully tested that everything works correctly
#     # print("Exporting data...")
#     # save_data(partial=True)

In [None]:
# Quick sanity check: number of issue records, pull request records and issue -> PR mappings
len(issues_data), len(pull_requests_data), len(issue_pr_map)

##### CARE: Not well tested!

In [None]:
# try:
#     fetch_issues()
# except Exception as e:
#     print(f'Exception "{e}" occurred while fetching issues. Saving partial data...')
#     save_data(partial=True)
#     raise

# try:
#     fetch_pull_requests()
# except Exception as e:
#     print(f'Exception "{e}" occurred while fetching pull requests. Saving partial data...')
#     save_data(partial=True)
#     raise

## Remove duplicate issues

In [None]:
from collections import defaultdict

# print(f"Number of issues: {len(data['issues'])}")
print(f"Number of issues: {len(issues_data)}")

# Group all records by issue number; the keys are the unique issue numbers
issue_tracker = defaultdict(list)
for issue in issues_data:
    issue_tracker[issue['number']].append(issue)

issue_numbers = list(issue_tracker)
print(f"Number of unique issues: {len(issue_numbers)}")
print(f"Number of duplicate issues: {len(issues_data) - len(issue_numbers)}")

# Identify duplicates
duplicates = {number: issues for number, issues in issue_tracker.items() if len(issues) > 1}

# Print duplicates
print(f"Duplicates: {list(duplicates.keys())}")

# If there are duplicates, print the detailed information of these issues
if duplicates:
    print("\nDetails of duplicate issues:")
    for number, issues in duplicates.items():
        print(f"Issue number: {number}")
        for issue in issues:
            print(issue)

# Clean the duplicates under the assumption that, just like the last time, only
# the value for reactions differs: keep the copy with the highest 'reactions'.
# Records without a 'reactions' value count as 0, so data sets fetched without
# reactions no longer raise a KeyError; on a tie the first copy is kept.
cleaned_issues_data = [
    max(issues, key=lambda record: record.get('reactions') or 0)
    for issues in issue_tracker.values()
]

print(f"\nNumber of cleaned issues: {len(cleaned_issues_data)}")

# Print the cleaned issues data (all entries whose number is in the duplicates dict)
print("\nCleaned duplicate issues:")
for issue in cleaned_issues_data:
    if issue['number'] in duplicates:
        print(issue)

issue_numbers = [issue['number'] for issue in cleaned_issues_data]
print(f"\nNumber of issues: {len(cleaned_issues_data)}")
print(f"Number of unique issues: {len(set(issue_numbers))}")

if issue_numbers:
    print(f"Smallest issue number: {min(issue_numbers)}")
    print(f"Biggest issue number: {max(issue_numbers)}")
    difference = max(issue_numbers) - min(issue_numbers)
    print(f"Difference between biggest and smallest issue: {difference}")
    print(f"Number of unused issue numbers: {difference - len(issue_numbers) + 1}")
else:
    # min()/max() raise ValueError on an empty sequence
    print("No issues loaded; skipping issue number statistics.")

issues_data = cleaned_issues_data

### Map issues to pull requests

In [None]:
# Map issues to pull requests
# Populate the global issue_pr_map from "#<number>" references in the
# pull request titles and descriptions
print("Mapping issues to pull requests...")
map_issues_to_prs()
print("Done mapping issues to pull requests")

In [None]:
# # Save final data if no exception occurred
# save_data()

# Don't run with partial = False until fully tested that everything works correctly
# print("Exporting data...")
# save_data(partial=True)

#### Pull request test

In [None]:
# # Fetch a single pull request for testing
# pr_number = 19489
# try:
#     pr = repo.get_pull(pr_number)
#     issue = repo.get_issue(pr_number)  # Fetch the corresponding issue for reactions
#     pr_files = pr.get_files()
#     files_changed = [{"filename": f.filename, "additions": f.additions, "deletions": f.deletions} for f in pr_files]
#     pr_commits = pr.get_commits()
#     commit_data = [{
#         "sha": commit.sha,
#         "author": commit.commit.author.name,
#         "date": commit.commit.author.date.isoformat(),
#         "message": commit.commit.message
#     } for commit in pr_commits]
#     pull_requests_data = [{
#         "number": pr.number,
#         "title": pr.title,
#         "state": pr.state,
#         "created_at": pr.created_at.isoformat(),
#         "merged_at": pr.merged_at.isoformat() if pr.merged_at else None,
#         "closed_at": pr.closed_at.isoformat() if pr.closed_at else None,
#         "user": pr.user.login,
#         "comments": pr.comments,
#         "review_comments": pr.review_comments,
#         "description": pr.body,
#         "description_length": len(pr.body) if pr.body else 0,
#         "additions": pr.additions,
#         "deletions": pr.deletions,
#         "changed_files": pr.changed_files,
#         "files": files_changed,
#         "commits": commit_data,
#         "labels": [label.name for label in pr.labels],
#         "mergeable_state": pr.mergeable_state
#     }]
#     print("Pull request data collected successfully.")
# except UnknownObjectException:
#     print(f"Pull request #{pr_number} not found.")
# except RateLimitExceededException:
#     print("Rate limit exceeded. Please wait and try again later.")

## Data Processing

In [None]:
def calculate_time_difference(created_at, closed_at):
    """Return the number of seconds between two ISO 8601 timestamps.

    Accepts the strings produced by ``datetime.isoformat()``, with or without
    fractional seconds and with or without a UTC offset, as well as a
    trailing ``Z``.

    Args:
        created_at: Creation timestamp as an ISO 8601 string.
        closed_at: Closing timestamp as an ISO 8601 string, or ``None`` if the
            entry is still open.

    Returns:
        The difference ``closed_at - created_at`` in seconds as a float, or
        ``None`` when ``closed_at`` is ``None``.

    Raises:
        ValueError: If a timestamp is not a valid ISO 8601 string.
    """
    from datetime import timezone  # local import: the file only imports `datetime`

    def _parse(timestamp):
        # fromisoformat() only accepts a trailing "Z" from Python 3.11 on
        if timestamp.endswith("Z"):
            timestamp = timestamp[:-1] + "+00:00"
        parsed = datetime.fromisoformat(timestamp)
        # NOTE(review): timestamps without an offset are treated as UTC so that
        # offset-less and offset-carrying values can be subtracted - confirm
        # that all stored timestamps are indeed UTC.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed

    # Parse created_at first so a malformed value is reported even for open entries
    created_time = _parse(created_at)

    if closed_at is None:
        return None  # still open

    return (_parse(closed_at) - created_time).total_seconds()

In [None]:
def calculate_turnaround_time(data):
    """Add a ``'turnaround_time'`` value to every record in ``data``, in place.

    Closed records get the whole number of seconds between ``'created_at'``
    and ``'closed_at'`` (as computed by ``calculate_time_difference``); records
    without a ``'closed_at'`` value get ``None``.

    Args:
        data: Iterable of issue / pull request dicts with ``'created_at'`` and
            ``'closed_at'`` keys.
    """
    for record in data:
        # Default for open records; closed ones are overwritten below
        record['turnaround_time'] = None
        if not record['closed_at']:
            continue
        seconds = calculate_time_difference(record['created_at'], record['closed_at'])
        record['turnaround_time'] = int(seconds)

    # Verify all 'turnaround time' values are integers or None
    for record in data:
        elapsed = record['turnaround_time']
        if elapsed is None:
            continue
        assert isinstance(elapsed, int), f"Turnaround time is not an int: {elapsed}"

# Annotate every issue and pull request record in place with its
# 'turnaround_time' (seconds from creation to closing, None while open)
calculate_turnaround_time(issues_data)
calculate_turnaround_time(pull_requests_data)

# save_data(partial=True) # Don't want to overwrite the original data file until I am done with data processing and verification

In [None]:
# save_data(partial=True)

# Plotly example below

It isn't working as intended yet, but it can still give an idea of how it can be done.
It is currently commented out!

## Visualization 

## Plots configuration

In [None]:
# import json

# plot_configs = {
#     "issue_turnaround_time": {
#         "function": "plot_turnaround_time",
#         "plot_type": "line",
#         "x_axis": "Creation Date (resampled monthly)",
#         "y_axis": "Average Turnaround Time (days)",
#         "description": "Average turnaround time over time for Issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "pull request_turnaround_time": {
#         "function": "plot_turnaround_time",
#         "plot_type": "line",
#         "x_axis": "Creation Date (resampled monthly)",
#         "y_axis": "Average Turnaround Time (days)",
#         "description": "Average turnaround time over time for Pull Requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "issue_turnaround_time_scatter": {
#         "function": "plot_turnaround_time_scatter",
#         "plot_type": "scatter",
#         "x_axis": "Creation Date",
#         "y_axis": "Turnaround Time (days)",
#         "description": "Scatter plot of turnaround time over time for Issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "pull request_turnaround_time_scatter": {
#         "function": "plot_turnaround_time_scatter",
#         "plot_type": "scatter",
#         "x_axis": "Creation Date",
#         "y_axis": "Turnaround Time (days)",
#         "description": "Scatter plot of turnaround time over time for Pull Requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "issue_open_to_closed_ratio": {
#         "function": "plot_open_to_closed_ratio",
#         "plot_type": "line",
#         "x_axis": "Time (resampled monthly)",
#         "y_axis": "Open to Closed Ratio",
#         "description": "Ratio of cumulative open to closed issues for Issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "pull request_open_to_closed_ratio": {
#         "function": "plot_open_to_closed_ratio",
#         "plot_type": "line",
#         "x_axis": "Time (resampled monthly)",
#         "y_axis": "Open to Closed Ratio",
#         "description": "Ratio of cumulative open to closed issues for Pull Requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "issue_open_and_closed": {
#         "function": "plot_open_and_closed",
#         "plot_type": "line (multiple lines)",
#         "x_axis": "Time (resampled monthly)",
#         "y_axis": "Cumulative count of Issues",
#         "description": "Cumulative open and closed issues for Issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear", "log"]
#     },
#     "pull request_open_and_closed": {
#         "function": "plot_open_and_closed",
#         "plot_type": "line (multiple lines)",
#         "x_axis": "Time (resampled monthly)",
#         "y_axis": "Cumulative count of Pull Requests",
#         "description": "Cumulative open and closed pull requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear", "log"]
#     },
#     "issue_open_counts": {
#         "function": "plot_open_counts",
#         "plot_type": "bar",
#         "x_axis": "Creation Date (grouped by month)",
#         "y_axis": "Count of Open Issues",
#         "description": "Monthly count of open issues for Issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "pull request_open_counts": {
#         "function": "plot_open_counts",
#         "plot_type": "bar",
#         "x_axis": "Creation Date (grouped by month)",
#         "y_axis": "Count of Open Pull Requests",
#         "description": "Monthly count of open pull requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "commit_frequency_and_size": {
#         "function": "plot_commit_frequency_and_size",
#         "plot_type": "bar + line dual axis",
#         "x_axis": "Date (resampled monthly)",
#         "y_axis": {
#             "left": "Monthly Commits",
#             "right": "Average Commit Size (lines)"
#         },
#         "description": "Commit frequency and average commit size over time (unfiltered)",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear", "log"]
#     },
#     "commit_frequency_and_size_filtered": {
#         "function": "plot_commit_frequency_and_size",
#         "plot_type": "bar + line dual axis",
#         "x_axis": "Date (resampled monthly)",
#         "y_axis": {
#             "left": "Monthly Commits",
#             "right": "Average Commit Size (lines)"
#         },
#         "description": "Commit frequency and average commit size over time (filtered)",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear", "log"]
#     },
#     "requests_label_data": {
#         "function": "plot_label_group",
#         "plot_type": "line (multiple lines)",
#         "x_axis": "Time (grouped by month)",
#         "y_axis": "Count per Label",
#         "description": "Monthly counts for label group 'Requests'",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "impact_label_data": {
#         "function": "plot_label_group",
#         "plot_type": "line (multiple lines)",
#         "x_axis": "Time (grouped by month)",
#         "y_axis": "Count per Label",
#         "description": "Monthly counts for label group 'Impact'",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "severity_label_data": {
#         "function": "plot_label_group",
#         "plot_type": "line (multiple lines)",
#         "x_axis": "Time (grouped by month)",
#         "y_axis": "Count per Label",
#         "description": "Monthly counts for label group 'Severity'",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "code_quality_label_data": {
#         "function": "plot_label_group",
#         "plot_type": "line (multiple lines)",
#         "x_axis": "Time (grouped by month)",
#         "y_axis": "Count per Label",
#         "description": "Monthly counts for label group 'Code Quality'",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "commit_metrics": {
#         "function": "plot_commit_metrics_with_bugs",
#         "plot_type": "bar + line dual axis with third axis",
#         "x_axis": "Date (resampled monthly)",
#         "y_axis": {
#             "left": "Monthly Commits",
#             "right": "Average Commit Size (lines)"
#         },
#         "description": "Commit metrics (frequency and size) over time for pull requests (commit data)",
#         "export_formats": ["csv/json"],
#         "available_scales": ["linear"]
#     },
#     "bug_metrics": {
#         "function": "plot_commit_metrics_with_bugs",
#         "plot_type": "bar + line dual axis with third axis",
#         "x_axis": "Date (resampled monthly)",
#         "y_axis": "Bug Issues Count",
#         "description": "Monthly bug issues count (from issues with 'bug' label)",
#         "export_formats": ["csv/json"],
#         "available_scales": ["linear"]
#     }
# }

# with open('plot_configs.json', 'w') as f:
#     json.dump(plot_configs, f, indent=4)

# print("Plot configuration saved as plot_configs.json")

In [None]:
# import json

# plot_configs = {
#     "issue_turnaround_time": {
#         "function": "plot_turnaround_time",
#         "plot_type": "line",
#         "x": "created_at",           # column with creation date
#         "y": "turnaround_time",      # column with turnaround time
#         "x_axis": "Creation Date",
#         "y_axis": "Average Turnaround Time (days)",
#         "description": "Average turnaround time over time for Issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "pull request_turnaround_time": {
#         "function": "plot_turnaround_time",
#         "plot_type": "line",
#         "x": "created_at",
#         "y": "turnaround_time",
#         "x_axis": "Creation Date",
#         "y_axis": "Average Turnaround Time (days)",
#         "description": "Average turnaround time over time for Pull Requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "issue_turnaround_time_scatter": {
#         "function": "plot_turnaround_time_scatter",
#         "plot_type": "scatter",
#         # To swap the axes, we now explicitly specify which column goes where:
#         # "x": "turnaround_time",
#         # "y": "created_at",
#         "x": "created_at",
#         "y": "turnaround_time",
#         "x_axis": "Turnaround Time (days)",
#         "y_axis": "Creation Date",
#         "description": "Scatter plot of turnaround time vs. creation date for Issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "pull request_turnaround_time_scatter": {
#         "function": "plot_turnaround_time_scatter",
#         "plot_type": "scatter",
#         # "x": "turnaround_time",
#         # "y": "created_at",
#         "x": "created_at",
#         "y": "turnaround_time",
#         "x_axis": "Turnaround Time (days)",
#         "y_axis": "Creation Date",
#         "description": "Scatter plot of turnaround time vs. creation date for Pull Requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "issue_open_to_closed_ratio": {
#         "function": "plot_open_to_closed_ratio",
#         "plot_type": "line",
#         "x": "time",                   # assuming the resampled data has a 'time' column or use index
#         "y": "open_to_closed_ratio",   # ratio values
#         "x_axis": "Time (resampled monthly)",
#         "y_axis": "Open to Closed Ratio",
#         "description": "Cumulative ratio of open to closed issues for Issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "pull request_open_to_closed_ratio": {
#         "function": "plot_open_to_closed_ratio",
#         "plot_type": "line",
#         "x": "time",
#         "y": "open_to_closed_ratio",
#         "x_axis": "Time (resampled monthly)",
#         "y_axis": "Open to Closed Ratio",
#         "description": "Cumulative ratio of open to closed pull requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "issue_open_and_closed": {
#         "function": "plot_open_and_closed",
#         "plot_type": "line (multiple lines)",
#         "x": "time",  
#         "y": { "left": "cumulative_open", "right": "cumulative_closed" },
#         "x_axis": "Time (resampled monthly)",
#         "y_axis": { "left": "Cumulative Open", "right": "Cumulative Closed" },
#         "description": "Cumulative open and closed issues for Issues (linear scale)",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear", "log"]
#     },
#     "pull request_open_and_closed": {
#         "function": "plot_open_and_closed",
#         "plot_type": "line (multiple lines)",
#         "x": "time",
#         "y": { "left": "cumulative_open", "right": "cumulative_closed" },
#         "x_axis": "Time (resampled monthly)",
#         "y_axis": { "left": "Cumulative Open", "right": "Cumulative Closed" },
#         "description": "Cumulative open and closed pull requests (linear scale)",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear", "log"]
#     },
#     "issue_open_counts": {
#         "function": "plot_open_counts",
#         "plot_type": "bar",
#         "x": "created_at",  # after grouping, this is the month column
#         "y": "count",       # the count of open issues per month
#         "x_axis": "Creation Date (grouped by month)",
#         "y_axis": "Count of Open Issues",
#         "description": "Monthly count of open issues",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "pull request_open_counts": {
#         "function": "plot_open_counts",
#         "plot_type": "bar",
#         "x": "created_at",
#         "y": "count",
#         "x_axis": "Creation Date (grouped by month)",
#         "y_axis": "Count of Open Pull Requests",
#         "description": "Monthly count of open pull requests",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "commit_frequency_and_size": {
#         "function": "plot_commit_frequency_and_size",
#         "plot_type": "bar + line dual axis",
#         "x": "date",  # column with the resampled date
#         "y": { "left": "monthly_commits", "right": "avg_size" },
#         "x_axis": "Date (resampled monthly)",
#         "y_axis": { "left": "Monthly Commits", "right": "Average Commit Size (lines)" },
#         "description": "Commit frequency and average commit size over time (unfiltered)",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear", "log"]
#     },
#     "commit_frequency_and_size_filtered": {
#         "function": "plot_commit_frequency_and_size",
#         "plot_type": "bar + line dual axis",
#         "x": "date",
#         "y": { "left": "monthly_commits", "right": "avg_size" },
#         "x_axis": "Date (resampled monthly)",
#         "y_axis": { "left": "Monthly Commits", "right": "Average Commit Size (lines)" },
#         "description": "Commit frequency and average commit size over time (filtered)",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear", "log"]
#     },
#     "requests_label_data": {
#         "function": "plot_label_group",
#         "plot_type": "line (multiple lines)",
#         "x": "created_at",  # column with the date
#         "y": "count",       # count per label after grouping
#         "x_axis": "Time (grouped by month)",
#         "y_axis": "Count per Label",
#         "description": "Monthly counts for label group 'Requests'",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "impact_label_data": {
#         "function": "plot_label_group",
#         "plot_type": "line (multiple lines)",
#         "x": "created_at",
#         "y": "count",
#         "x_axis": "Time (grouped by month)",
#         "y_axis": "Count per Label",
#         "description": "Monthly counts for label group 'Impact'",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "severity_label_data": {
#         "function": "plot_label_group",
#         "plot_type": "line (multiple lines)",
#         "x": "created_at",
#         "y": "count",
#         "x_axis": "Time (grouped by month)",
#         "y_axis": "Count per Label",
#         "description": "Monthly counts for label group 'Severity'",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "code_quality_label_data": {
#         "function": "plot_label_group",
#         "plot_type": "line (multiple lines)",
#         "x": "created_at",
#         "y": "count",
#         "x_axis": "Time (grouped by month)",
#         "y_axis": "Count per Label",
#         "description": "Monthly counts for label group 'Code Quality'",
#         "export_formats": ["csv/json", "png"],
#         "available_scales": ["linear"]
#     },
#     "commit_metrics": {
#         "function": "plot_commit_metrics_with_bugs",
#         "plot_type": "bar + line dual axis with third axis",
#         "x": "date",
#         "y": { "left": "monthly_commits", "right": "avg_size" },
#         "x_axis": "Date (resampled monthly)",
#         "y_axis": { "left": "Monthly Commits", "right": "Average Commit Size (lines)" },
#         "description": "Commit metrics (frequency and size) over time for pull requests (commit data)",
#         "export_formats": ["csv/json"],
#         "available_scales": ["linear"]
#     },
#     "bug_metrics": {
#         "function": "plot_commit_metrics_with_bugs",
#         "plot_type": "bar + line dual axis with third axis",
#         "x": "date",
#         "y": "bug_issues",  # single series for bug issues
#         "x_axis": "Date (resampled monthly)",
#         "y_axis": "Bug Issues Count",
#         "description": "Monthly bug issues count (from issues with 'bug' label)",
#         "export_formats": ["csv/json"],
#         "available_scales": ["linear"]
#     }
# }

# with open('plot_configs.json', 'w') as f:
#     json.dump(plot_configs, f, indent=4)

# print("Plot configuration saved as plot_configs.json")

## Visualization using Plotly

In [None]:
# import json
# import pandas as pd
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# # Load the adapted configuration dictionary from JSON file
# with open('plot_configs.json', 'r') as f:
#     plot_configs = json.load(f)

# # Helper: load exported data from CSV; fallback to JSON if CSV not found.
# def load_exported_data(key):
#     try:
#         df = pd.read_csv(f"{key}.csv", index_col=0)
#         return df
#     except Exception as e:
#         try:
#             df = pd.read_json(f"{key}.json")
#             return df
#         except Exception as e:
#             print(f"Could not load data for {key}: {e}")
#             return None

# # Iterate over each plot configuration and generate the visualization for each scale option.
# for key, config in plot_configs.items():
#     df = load_exported_data(key)
#     if df is None:
#         continue  # Skip if data not found.
    
#     # Determine which scales to produce; default to "linear" if not specified.
#     scale_options = config.get("available_scales", ["linear"])
    
#     for scale in scale_options:
#         print(f"Visualizing {key} with {scale} scale: {config['description']}")
        
#         # Determine the x-axis data.
#         if config.get("x") and config["x"] in df.columns:
#             x_data = df[config["x"]]
#         else:
#             x_data = df.index  # Fallback to index if "x" key not found in columns.
        
#         # If config["y"] is not a dict, we use a simple single-axis plot.
#         if not isinstance(config.get("y"), dict):
#             if config.get("y") and config["y"] in df.columns:
#                 y_data = df[config["y"]]
#             else:
#                 y_data = df[df.columns[0]]  # Fallback: first column.
            
#             if config["plot_type"].lower() in ["line", "scatter"]:
#                 if config["plot_type"].lower() == "line":
#                     fig = px.line(df, x=x_data, y=y_data,
#                                   title=f"{config['description']} ({scale} scale)",
#                                   labels={"x": config.get("x_axis", "X Axis"),
#                                           "y": config.get("y_axis", "Y Axis")})
#                 else:  # scatter
#                     fig = px.scatter(df, x=x_data, y=y_data,
#                                      title=f"{config['description']} ({scale} scale)",
#                                      labels={"x": config.get("x_axis", "X Axis"),
#                                              "y": config.get("y_axis", "Y Axis")})
#             elif config["plot_type"].lower() == "bar":
#                 fig = px.bar(df, x=x_data, y=y_data,
#                              title=f"{config['description']} ({scale} scale)",
#                              labels={"x": config.get("x_axis", "X Axis"),
#                                      "y": config.get("y_axis", "Y Axis")})
#             elif "multiple lines" in config["plot_type"].lower():
#                 # Plot all columns as separate lines.
#                 fig = px.line(df, x=x_data, y=df.columns,
#                               title=f"{config['description']} ({scale} scale)",
#                               labels={"x": config.get("x_axis", "X Axis"), "y": "Values"})
#             else:
#                 # Default fallback to a line plot.
#                 fig = px.line(df, x=x_data, y=y_data,
#                               title=f"{config['description']} ({scale} scale)",
#                               labels={"x": config.get("x_axis", "X Axis"),
#                                       "y": config.get("y_axis", "Y Axis")})
#             if scale == "log":
#                 fig.update_yaxes(type="log")
        
#         # Dual-axis case: config["y"] is a dict.
#         else:
#             # Create a subplot with secondary y-axis.
#             fig = make_subplots(specs=[[{"secondary_y": True}]])
#             y_left_key = config["y"].get("left")
#             y_right_key = config["y"].get("right")
#             if y_left_key not in df.columns or y_right_key not in df.columns:
#                 print(f"Missing columns for dual-axis plot in {key}")
#                 continue
#             # Left trace: use a bar plot.
#             fig.add_trace(
#                 go.Bar(x=x_data, y=df[y_left_key],
#                        name=config["y_axis"].get("left", "Left Y")),
#                 secondary_y=False
#             )
#             # Right trace: use a line plot.
#             fig.add_trace(
#                 go.Scatter(x=x_data, y=df[y_right_key],
#                            name=config["y_axis"].get("right", "Right Y")),
#                 secondary_y=True
#             )
#             fig.update_layout(title_text=f"{config['description']} ({scale} scale)")
#             fig.update_xaxes(title_text=config.get("x_axis", "X Axis"))
#             fig.update_yaxes(title_text=config["y_axis"].get("left", "Left Y"), secondary_y=False)
#             fig.update_yaxes(title_text=config["y_axis"].get("right", "Right Y"), secondary_y=True, type=scale)
        
#         fig.show()

In [None]:
# import json
# import pandas as pd
# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# # Load the adapted configuration dictionary from JSON file
# with open('plot_configs.json', 'r') as f:
#     plot_configs = json.load(f)

# # Helper: load exported data from CSV; fallback to JSON if CSV not found.
# def load_exported_data(key):
#     try:
#         df = pd.read_csv(f"{key}.csv", index_col=0)
#         return df
#     except Exception as e:
#         try:
#             df = pd.read_json(f"{key}.json")
#             return df
#         except Exception as e:
#             print(f"Could not load data for {key}: {e}")
#             return None

# # Iterate over each plot configuration and generate the visualization for each scale option.
# for key, config in plot_configs.items():
#     df = load_exported_data(key)
#     if df is None:
#         continue  # Skip if data not found.
    
#     # Determine which scales to produce; default to "linear" if not specified.
#     scale_options = config.get("available_scales", ["linear"])
    
#     for scale in scale_options:
#         print(f"Visualizing {key} with {scale} scale: {config['description']}")
        
#         # Determine x-axis: use the column name from config if available; else fallback to index.
#         x_col = config.get("x")
#         if x_col and x_col in df.columns:
#             x_data = x_col  # use column name for Plotly Express
#         else:
#             x_data = None  # fallback to index
        
#         # Single-axis case: config["y"] is not a dict.
#         if not isinstance(config.get("y"), dict):
#             y_col = config.get("y")
#             if y_col and y_col in df.columns:
#                 y_data = y_col  # use column name
#             else:
#                 y_data = df.columns[0]  # fallback: first column
            
#             # For Plotly Express, if x_data is None, it will use the DataFrame index.
#             if config["plot_type"].lower() in ["line", "scatter"]:
#                 if config["plot_type"].lower() == "line":
#                     fig = px.line(df, x=x_data, y=y_data,
#                                   title=f"{config['description']} ({scale} scale)",
#                                   labels={x_data if x_data else "index": config.get("x_axis", "X Axis"),
#                                           y_data: config.get("y_axis", "Y Axis")})
#                 else:  # scatter
#                     fig = px.scatter(df, x=x_data, y=y_data,
#                                      title=f"{config['description']} ({scale} scale)",
#                                      labels={x_data if x_data else "index": config.get("x_axis", "X Axis"),
#                                              y_data: config.get("y_axis", "Y Axis")})
#             elif config["plot_type"].lower() == "bar":
#                 fig = px.bar(df, x=x_data, y=y_data,
#                              title=f"{config['description']} ({scale} scale)",
#                              labels={x_data if x_data else "index": config.get("x_axis", "X Axis"),
#                                      y_data: config.get("y_axis", "Y Axis")})
#             elif "multiple lines" in config["plot_type"].lower():
#                 # Plot all columns as separate lines.
#                 fig = px.line(df, x=x_data, y=df.columns,
#                               title=f"{config['description']} ({scale} scale)",
#                               labels={x_data if x_data else "index": config.get("x_axis", "X Axis"), 
#                                       "value": "Values"})
#             else:
#                 # Default fallback to a line plot.
#                 fig = px.line(df, x=x_data, y=y_data,
#                               title=f"{config['description']} ({scale} scale)",
#                               labels={x_data if x_data else "index": config.get("x_axis", "X Axis"),
#                                       y_data: config.get("y_axis", "Y Axis")})
#             if scale == "log":
#                 fig.update_yaxes(type="log")
        
#         # Dual-axis case: config["y"] is a dict.
#         else:
#             fig = make_subplots(specs=[[{"secondary_y": True}]])
#             y_left_key = config["y"].get("left")
#             y_right_key = config["y"].get("right")
#             if y_left_key not in df.columns or y_right_key not in df.columns:
#                 print(f"Missing columns for dual-axis plot in {key}")
#                 continue
            
#             # Use x_data if available; otherwise, use the index.
#             if x_col and x_col in df.columns:
#                 x_vals = df[x_col]
#             else:
#                 x_vals = df.index
            
#             # Left trace: bar plot.
#             fig.add_trace(
#                 go.Bar(x=x_vals, y=df[y_left_key],
#                        name=config["y_axis"].get("left", "Left Y")),
#                 secondary_y=False
#             )
#             # Right trace: line plot.
#             fig.add_trace(
#                 go.Scatter(x=x_vals, y=df[y_right_key],
#                            name=config["y_axis"].get("right", "Right Y")),
#                 secondary_y=True
#             )
#             fig.update_layout(title_text=f"{config['description']} ({scale} scale)")
#             fig.update_xaxes(title_text=config.get("x_axis", "X Axis"))
#             fig.update_yaxes(title_text=config["y_axis"].get("left", "Left Y"), secondary_y=False)
#             fig.update_yaxes(title_text=config["y_axis"].get("right", "Right Y"), secondary_y=True, type=scale)
        
#         fig.show()

# Plotly example above

## Script continues below

In [None]:
def export_df(df, filename, export_format=EXPORT_FORMAT):
    """Write a DataFrame to disk as JSON, CSV, or both, and print a confirmation.

    Args:
        df: pandas DataFrame to export.
        filename: Output path without extension; '.json' and/or '.csv' is
            appended to it.
        export_format: 'json', 'csv' or 'both'. Any other value writes
            nothing and prints nothing.
    """
    # Build the target paths once so every branch writes to exactly the
    # files it reports in its confirmation message.
    json_path = f'{filename}.json'
    csv_path = f'{filename}.csv'

    # NOTE(review): orient='records' leaves the DataFrame index out of the
    # JSON output, so frames indexed by date lose that information in the
    # .json file (the .csv keeps it). Left as-is to keep the file format.
    if export_format == 'both':
        df.to_json(json_path, orient='records', date_format='iso')
        df.to_csv(csv_path)
        print(f"Data exported as {export_format} to '{json_path}' and '{csv_path}'.")
    elif export_format == 'json':
        df.to_json(json_path, orient='records', date_format='iso')
        print(f"Data exported as {export_format} to '{json_path}'.")
    elif export_format == 'csv':
        df.to_csv(csv_path)
        print(f"Data exported as {export_format} to '{csv_path}'.")

In [None]:
def calculate_turnaround_time(data):
    """Collect creation dates and turnaround times (in days) into a DataFrame.

    Args:
        data: Iterable of dict-like entries holding a 'created_at' string in
            "%Y-%m-%dT%H:%M:%S" format and a 'turnaround_time' in seconds
            (or None).

    Returns:
        DataFrame with a datetime column 'created_at' and a column
        'turnaround_time' expressed in days. Entries whose turnaround time
        is None or otherwise falsy (e.g. 0) are left out.
    """
    seconds_per_day = 60 * 60 * 24
    rows = []

    for item in data:
        elapsed = item['turnaround_time']
        if elapsed is None:
            continue

        # Parsed ahead of the truthiness check below, so a malformed date
        # is still reported for entries that end up being dropped.
        opened = datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S")

        # TODO: Should entries that are still open really be ignored? They
        # could instead get (current date - created_at) as turnaround time,
        # or be plotted separately to show long-open issues / PRs.
        if not elapsed:
            continue

        rows.append((opened, elapsed / seconds_per_day))

    frame = pd.DataFrame(rows, columns=['created_at', 'turnaround_time'])

    # Make sure the date column has a datetime dtype, also for empty input
    frame['created_at'] = pd.to_datetime(frame['created_at'])

    return frame

def plot_turnaround_time_yearly(data, data_type):
    """Plot the average turnaround time per creation month as a line chart.

    Despite the function name, values are averaged per calendar month,
    not per year.

    Args:
        data: Entries accepted by calculate_turnaround_time.
        data_type: Label used in the plot title, e.g. "Issue".
    """
    df = calculate_turnaround_time(data)

    # Index by creation date so the frame can be resampled over time
    df.set_index('created_at', inplace=True)

    # Average per month. A MonthEnd offset object is passed instead of the
    # 'M' string alias, which pandas deprecated in 2.2 in favour of 'ME';
    # the offset selects the same month-end bins on older and newer releases.
    df_resampled = df.resample(pd.offsets.MonthEnd()).mean()

    # Plotting the data
    plt.figure(figsize=(12, 6))
    plt.plot(df_resampled.index, df_resampled['turnaround_time'], marker='o', linestyle='-')

    # Formatting the plot
    plt.title(f'Average {data_type} Turnaround Time Over Time')
    plt.xlabel('Creation Date')
    plt.ylabel('Average Turnaround Time (days)')
    plt.grid(True)

    # Show tick labels as year-month and rotate them for readability
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.gcf().autofmt_xdate()

    # Display the plot
    plt.show()

# Render the averaged turnaround-time chart once per dataset.
# issues_data / pull_requests_data are not defined in this cell; they are
# presumably loaded by an earlier cell - verify before running in isolation.
print("Issues: ")
plot_turnaround_time_yearly(issues_data, "Issue")
print("Pull Requests:")
plot_turnaround_time_yearly(pull_requests_data, "Pull Request")

In [None]:
def plot_turnaround_time(data, data_type):
    """Plot the average turnaround time per creation month on a wide chart.

    Args:
        data: Entries accepted by calculate_turnaround_time.
        data_type: Label used in the plot title, e.g. "Issue".
    """
    df = calculate_turnaround_time(data)

    # Index by creation date so the frame can be resampled over time
    df.set_index('created_at', inplace=True)

    # Average per month. A MonthEnd offset object is passed instead of the
    # 'M' string alias, which pandas deprecated in 2.2 in favour of 'ME';
    # the offset selects the same month-end bins on older and newer releases.
    df_resampled = df.resample(pd.offsets.MonthEnd()).mean()

    # Number of months between two labelled ticks on the x-axis
    months_per_label = 3

    # Plotting the data
    plt.figure(figsize=(18, 8))  # Make the plot wider
    plt.plot(df_resampled.index, df_resampled['turnaround_time'], marker='o', linestyle='-')

    # Formatting the plot
    plt.title(f'Average {data_type} Turnaround Time Over Time')
    plt.xlabel('Creation Date')
    plt.ylabel('Average Turnaround Time (days)')
    plt.grid(True)

    # One major tick every `months_per_label` months, labelled as year-month
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.gcf().autofmt_xdate()  # Rotate date labels

    # Display the plot
    plt.show()

# Render the monthly average turnaround-time chart once per dataset.
# issues_data / pull_requests_data are not defined in this cell; they are
# presumably loaded by an earlier cell - verify before running in isolation.
print("Issues:")
plot_turnaround_time(issues_data, "Issue")
print("Pull Requests:")
plot_turnaround_time(pull_requests_data, "Pull Request")

In [None]:
import pandas as pd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

def plot_turnaround_time(data, data_type, export_format=EXPORT_FORMAT):
    """Export and plot the average turnaround time per creation month.

    The monthly averages are handed to export_df under the base name
    '<data_type lower-cased>_turnaround_time'; when EXPORT_PNGS is truthy
    the figure is also saved as a .png with the same base name.

    Args:
        data: Entries accepted by calculate_turnaround_time.
        data_type: Label used in the plot title and (lower-cased) in the
            export file names, e.g. "Issue".
        export_format: Forwarded to export_df.
    """
    # Calculate the turnaround time
    df = calculate_turnaround_time(data)
    # Index by creation date so the frame can be resampled over time
    df.set_index('created_at', inplace=True)

    # Average per month. A MonthEnd offset object is passed instead of the
    # 'M' string alias, which pandas deprecated in 2.2 in favour of 'ME';
    # the offset selects the same month-end bins on older and newer releases.
    df_resampled = df.resample(pd.offsets.MonthEnd()).mean()

    export_df(df_resampled, f'{data_type.lower()}_turnaround_time', export_format)

    # Plotting the data
    plt.figure(figsize=(18, 8))  # Make the plot wider
    plt.plot(df_resampled.index, df_resampled['turnaround_time'], marker='o', linestyle='-')

    # Formatting the plot
    plt.title(f'Average {data_type} Turnaround Time Over Time')
    plt.xlabel('Creation Date')
    plt.ylabel('Average Turnaround Time (days)')
    plt.grid(True)

    # One major tick every `months_per_label` months, labelled as year-month
    months_per_label = 3  # Adjust as desired
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.gcf().autofmt_xdate()

    # Save before show(): the figure must still be the current one
    if EXPORT_PNGS:
        plt.savefig(f"{data_type.lower()}_turnaround_time.png")

    # Display the plot
    plt.show()

# Example calls with data export: each call exports the monthly averages
# and renders the chart for one dataset.
# issues_data / pull_requests_data are not defined in this cell; they are
# presumably loaded by an earlier cell - verify before running in isolation.
print("Issues:")
plot_turnaround_time(issues_data, "Issue", export_format=EXPORT_FORMAT)
print("Pull Requests:")
plot_turnaround_time(pull_requests_data, "Pull Request", export_format=EXPORT_FORMAT)

In [None]:
def plot_turnaround_time_scatter(data, data_type):
    """Scatter each entry's turnaround time (days) against its creation date.

    Args:
        data: Entries accepted by calculate_turnaround_time.
        data_type: Label used in the plot title, e.g. "Issue".
    """
    points = calculate_turnaround_time(data)

    # Number of months between two labelled ticks on the x-axis
    tick_spacing_months = 3

    # Wide figure with a single axes; one semi-transparent dot per entry
    fig, ax = plt.subplots(figsize=(18, 8))
    ax.scatter(points['created_at'], points['turnaround_time'], alpha=0.5)

    # Title, axis labels and grid
    ax.set_title(f'{data_type} Turnaround Time Over Time')
    ax.set_xlabel('Creation Date')
    ax.set_ylabel('Turnaround Time (days)')
    ax.grid(True)

    # Year-month tick labels at the chosen spacing, rotated for readability
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=tick_spacing_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    plt.show()

# Render the per-entry turnaround-time scatter plot once per dataset.
# issues_data / pull_requests_data are not defined in this cell; they are
# presumably loaded by an earlier cell - verify before running in isolation.
print("Issues: ")
plot_turnaround_time_scatter(issues_data, "Issue")
print("Pull Requests:")
plot_turnaround_time_scatter(pull_requests_data, "Pull Request")

In [None]:
import pandas as pd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

def plot_turnaround_time_scatter(data, data_type, export_format=EXPORT_FORMAT):
    """Export and scatter-plot each entry's turnaround time over creation date.

    The per-entry data is handed to export_df under the base name
    '<data_type lower-cased>_turnaround_time_scatter'; when EXPORT_PNGS is
    truthy the figure is also saved as a .png with the same base name.

    Args:
        data: Entries accepted by calculate_turnaround_time.
        data_type: Label used in the plot title and (lower-cased) in the
            export file names, e.g. "Issue".
        export_format: Forwarded to export_df.
    """
    base_name = f'{data_type.lower()}_turnaround_time_scatter'

    points = calculate_turnaround_time(data)
    export_df(points, base_name, export_format)

    # Wide figure with a single axes; one semi-transparent dot per entry
    fig, ax = plt.subplots(figsize=(18, 8))
    ax.scatter(points['created_at'], points['turnaround_time'], alpha=0.5)

    # Title, axis labels and grid
    ax.set_title(f'{data_type} Turnaround Time Over Time')
    ax.set_xlabel('Creation Date')
    ax.set_ylabel('Turnaround Time (days)')
    ax.grid(True)

    # Year-month tick labels every few months, rotated for readability
    tick_spacing_months = 3  # Adjust as desired
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=tick_spacing_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    # Save before the figure is shown
    if EXPORT_PNGS:
        fig.savefig(f"{base_name}.png")

    plt.show()

# Example calls with data export: each call exports the per-entry data and
# renders the scatter plot for one dataset.
# issues_data / pull_requests_data are not defined in this cell; they are
# presumably loaded by an earlier cell - verify before running in isolation.
print("Issues:")
plot_turnaround_time_scatter(issues_data, "Issue", export_format=EXPORT_FORMAT)
print("Pull Requests:")
plot_turnaround_time_scatter(pull_requests_data, "Pull Request", export_format=EXPORT_FORMAT)

### Careful!

The plots above seem to indicate that the "health" of the repository improved over time. However, the turnaround-time calculation completely ignores issues and pull requests that are still open, which skews the picture!

In [None]:
# # Find out the number of issues that have been opened/ closed
# open_issues = 0
# closed_issues = 0
# for issue in issues_data:
#     if issue['state'] == 'open':
#         open_issues += 1
#     elif issue['state'] == 'closed':
#         closed_issues += 1
#     else:
#         print(f"Issue #{issue['number']} has an unknown state: {issue['state']}")
# print(f"Number of open issues: {open_issues}")
# print(f"Number of closed issues: {closed_issues}")

In [None]:
def calculate_open_to_closed_ratio(data):
    """Aggregate entries per creation month and derive a cumulative open/closed ratio.

    Both counts are bucketed by the month in which an entry was *created*:
    'open_issues' counts entries whose state is 'open', 'closed_issues'
    counts entries that have a closing date.

    Args:
        data: Iterable of dict-like entries with 'created_at' (string in
            "%Y-%m-%dT%H:%M:%S" format), 'closed_at' (same format, or a
            falsy value when not closed) and 'state'.

    Returns:
        DataFrame indexed by month end with the columns 'open_issues',
        'closed_issues', 'cumulative_open', 'cumulative_closed' and
        'open_to_closed_ratio'. While 'cumulative_closed' is still 0 the
        ratio is a division by zero and comes out as inf or NaN.
    """
    # Extract relevant data
    plot_data = []
    for entry in data:
        created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
        closed_at = entry['closed_at']
        if closed_at:
            closed_at = datetime.strptime(closed_at, "%Y-%m-%dT%H:%M:%S")
        plot_data.append((created_at, closed_at, entry['state']))

    # Create a DataFrame
    df = pd.DataFrame(plot_data, columns=['created_at', 'closed_at', 'state'])

    # Convert dates to datetime format (missing closing dates become NaT)
    df['created_at'] = pd.to_datetime(df['created_at'])
    df['closed_at'] = pd.to_datetime(df['closed_at'])

    # Resample by month. A MonthEnd offset object is passed instead of the
    # 'M' string alias, which pandas deprecated in 2.2 in favour of 'ME';
    # the offset selects the same month-end bins on older and newer releases.
    df.set_index('created_at', inplace=True)
    monthly_data = df.resample(pd.offsets.MonthEnd()).apply({
        'state': lambda x: x.value_counts().get('open', 0),
        'closed_at': lambda x: x.notnull().sum()
    }).rename(columns={'state': 'open_issues', 'closed_at': 'closed_issues'})

    # Calculate cumulative sums
    monthly_data['cumulative_open'] = monthly_data['open_issues'].cumsum()
    monthly_data['cumulative_closed'] = monthly_data['closed_issues'].cumsum()

    # Calculate the ratio
    monthly_data['open_to_closed_ratio'] = monthly_data['cumulative_open'] / monthly_data['cumulative_closed']

    return monthly_data

def plot_open_to_closed_ratio(data, data_type):
    """Plot the cumulative open-to-closed ratio per month as a line chart.

    Args:
        data: Entries accepted by calculate_open_to_closed_ratio.
        data_type: Label used in the title and y-axis label, e.g. 'Issue'.
    """
    ratios = calculate_open_to_closed_ratio(data)

    # Number of months between two labelled ticks on the x-axis
    tick_spacing_months = 3

    # Wide figure with a single axes holding the ratio line
    fig, ax = plt.subplots(figsize=(18, 8))
    ax.plot(ratios.index, ratios['open_to_closed_ratio'], marker='o', linestyle='-')

    # Title, axis labels and grid
    ax.set_title(f'Open vs. Closed {data_type} Ratio Over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel(f'Open to Closed {data_type} Ratio')
    ax.grid(True)

    # Year-month tick labels at the chosen spacing, rotated for readability
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=tick_spacing_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    plt.show()

# Render the cumulative open-to-closed ratio chart once per dataset.
# issues_data / pull_requests_data are not defined in this cell; they are
# presumably loaded by an earlier cell - verify before running in isolation.
plot_open_to_closed_ratio(issues_data, 'Issue')
plot_open_to_closed_ratio(pull_requests_data, 'Pull Request')

In [None]:
import pandas as pd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

def calculate_open_to_closed_ratio(data):
    """Build a monthly table of open/closed counts and their cumulative ratio.

    Entries are bucketed by the month of their ``created_at`` timestamp, so
    ``closed_issues`` counts entries *created* in a month that have since been
    closed, not entries closed during that month.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format), ``closed_at`` (same format, or a
            falsy value when the entry is not closed) and ``state``.

    Returns:
        DataFrame indexed by month end (index name ``created_at``) with the
        columns ``open_issues``, ``closed_issues``, ``cumulative_open``,
        ``cumulative_closed`` and ``open_to_closed_ratio``. The frame is
        empty when ``data`` is empty.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    columns = [
        'open_issues',
        'closed_issues',
        'cumulative_open',
        'cumulative_closed',
        'open_to_closed_ratio',
    ]

    records = []
    for entry in data:
        created_at = datetime.strptime(entry['created_at'], timestamp_format)
        closed_raw = entry['closed_at']
        # Any falsy value (None, '') means "not closed".
        closed_at = datetime.strptime(closed_raw, timestamp_format) if closed_raw else None
        records.append((created_at, closed_at, entry['state']))

    if not records:
        # An empty frame has no DatetimeIndex, so resampling it would raise.
        return pd.DataFrame(columns=columns, index=pd.DatetimeIndex([], name='created_at'))

    frame = pd.DataFrame(records, columns=['created_at', 'closed_at', 'state'])

    # 0/1 indicator columns let the monthly aggregation be a plain sum.
    flags = pd.DataFrame(
        {
            'open_issues': (frame['state'] == 'open').to_numpy(dtype='int64'),
            'closed_issues': frame['closed_at'].notna().to_numpy(dtype='int64'),
        },
        index=pd.DatetimeIndex(frame['created_at'], name='created_at'),
    )

    # An explicit MonthEnd offset avoids the 'M' string alias, which has been
    # deprecated since pandas 2.2 in favour of 'ME'.
    monthly_data = flags.resample(pd.offsets.MonthEnd()).sum()

    monthly_data['cumulative_open'] = monthly_data['open_issues'].cumsum()
    monthly_data['cumulative_closed'] = monthly_data['closed_issues'].cumsum()

    # While no closed entry has been seen yet the divisor is 0, which yields
    # inf (or NaN for 0/0) rather than raising.
    monthly_data['open_to_closed_ratio'] = (
        monthly_data['cumulative_open'] / monthly_data['cumulative_closed']
    )

    return monthly_data

def plot_open_to_closed_ratio(data, data_type, export_format=EXPORT_FORMAT):
    """Export and plot the cumulative open-to-closed ratio over time.

    Args:
        data: Entries accepted by ``calculate_open_to_closed_ratio``.
        data_type: Human-readable name (e.g. ``'Issue'``); used in the plot
            labels and, lower-cased, in the export and PNG file names.
        export_format: Forwarded unchanged to ``export_df``.
    """
    ratio_frame = calculate_open_to_closed_ratio(data)

    # Hand the table to the export helper before any plotting happens.
    export_df(ratio_frame, f'{data_type.lower()}_open_to_closed_ratio', export_format)

    # Wide figure so that long project histories stay readable.
    fig = plt.figure(figsize=(18, 8))
    ax = fig.gca()
    ax.plot(
        ratio_frame.index,
        ratio_frame['open_to_closed_ratio'],
        marker='o',
        linestyle='-',
    )

    # Titles, axis labels and grid.
    ax.set_title(f'Open vs. Closed {data_type} Ratio Over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel(f'Open to Closed {data_type} Ratio')
    ax.grid(True)

    # One year-month tick label every `tick_spacing_months` months.
    tick_spacing_months = 3
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=tick_spacing_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    # Save before show(), while the figure is still the active one.
    if EXPORT_PNGS:
        fig.savefig(f"{data_type.lower()}_open_to_closed_ratio.png")

    plt.show()

# Export and plot the ratio for issues and for pull requests. The second
# argument labels the plot and, lower-cased, prefixes the output file names.
plot_open_to_closed_ratio(issues_data, 'Issue', export_format=EXPORT_FORMAT)
plot_open_to_closed_ratio(pull_requests_data, 'Pull Request', export_format=EXPORT_FORMAT)


In [None]:
def calculate_open_and_closed(data):
    """Aggregate entries into monthly and cumulative open/closed counts.

    Entries are bucketed by the month of their ``created_at`` timestamp, so
    ``closed_issues`` counts entries *created* in a month that have since been
    closed, not entries closed during that month.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format), ``closed_at`` (same format, or a
            falsy value when the entry is not closed) and ``state``.

    Returns:
        DataFrame indexed by month end (index name ``created_at``) with the
        columns ``open_issues``, ``closed_issues``, ``cumulative_open`` and
        ``cumulative_closed``. The frame is empty when ``data`` is empty.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    columns = ['open_issues', 'closed_issues', 'cumulative_open', 'cumulative_closed']

    records = []
    for entry in data:
        created_at = datetime.strptime(entry['created_at'], timestamp_format)
        closed_raw = entry['closed_at']
        # Any falsy value (None, '') means "not closed".
        closed_at = datetime.strptime(closed_raw, timestamp_format) if closed_raw else None
        records.append((created_at, closed_at, entry['state']))

    if not records:
        # Nothing to resample; return an empty table with the usual columns.
        return pd.DataFrame(columns=columns, index=pd.DatetimeIndex([], name='created_at'))

    frame = pd.DataFrame(records, columns=['created_at', 'closed_at', 'state'])

    # 0/1 indicator columns let the monthly aggregation be a plain sum.
    flags = pd.DataFrame(
        {
            'open_issues': (frame['state'] == 'open').to_numpy(dtype='int64'),
            'closed_issues': frame['closed_at'].notna().to_numpy(dtype='int64'),
        },
        index=pd.DatetimeIndex(frame['created_at'], name='created_at'),
    )

    # An explicit MonthEnd offset avoids the 'M' string alias, which has been
    # deprecated since pandas 2.2 in favour of 'ME'.
    monthly_data = flags.resample(pd.offsets.MonthEnd()).sum()

    monthly_data['cumulative_open'] = monthly_data['open_issues'].cumsum()
    monthly_data['cumulative_closed'] = monthly_data['closed_issues'].cumsum()

    return monthly_data

def plot_open_and_closed(data, data_type, log_scale=False):
    """Plot the cumulative open and cumulative closed counts over time.

    Args:
        data: Entries accepted by ``calculate_open_and_closed``.
        data_type: Human-readable singular name (e.g. ``'Issue'``) used in
            the title and the y-axis label.
        log_scale: When true, the y-axis is logarithmic and labelled as such.
    """
    counts = calculate_open_and_closed(data)

    # One x-axis tick label every this many months.
    tick_spacing_months = 3

    fig = plt.figure(figsize=(18, 8))
    ax = fig.gca()
    series_styles = (
        ('cumulative_open', 'Cumulative Open', 'o'),
        ('cumulative_closed', 'Cumulative Closed', 'x'),
    )
    for column, legend_text, marker in series_styles:
        ax.plot(counts.index, counts[column], label=legend_text, marker=marker, linestyle='-')

    # Titles, axis labels, legend and grid.
    ax.set_title(f'Cumulative Open and Closed {data_type}s Over Time')
    ax.set_xlabel('Time')
    if log_scale:
        ax.set_yscale('log')
        ax.set_ylabel(f'Number of {data_type}s (Log Scale)')
    else:
        ax.set_ylabel(f'Number of {data_type}s')
    ax.legend()
    ax.grid(True)

    # Year-month tick labels at the chosen spacing, rotated for legibility.
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=tick_spacing_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    plt.show()

# Plot cumulative open/closed counts for issues and for pull requests, both
# with a logarithmic y-axis.
plot_open_and_closed(issues_data, 'Issue', log_scale=True)
plot_open_and_closed(pull_requests_data, 'Pull Request', log_scale=True)

In [None]:
import pandas as pd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

def calculate_open_and_closed(data):
    """Aggregate entries into monthly and cumulative open/closed counts.

    Entries are bucketed by the month of their ``created_at`` timestamp, so
    ``closed_issues`` counts entries *created* in a month that have since been
    closed, not entries closed during that month.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format), ``closed_at`` (same format, or a
            falsy value when the entry is not closed) and ``state``.

    Returns:
        DataFrame indexed by month end (index name ``created_at``) with the
        columns ``open_issues``, ``closed_issues``, ``cumulative_open`` and
        ``cumulative_closed``. The frame is empty when ``data`` is empty.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    columns = ['open_issues', 'closed_issues', 'cumulative_open', 'cumulative_closed']

    records = []
    for entry in data:
        created_at = datetime.strptime(entry['created_at'], timestamp_format)
        closed_raw = entry['closed_at']
        # Any falsy value (None, '') means "not closed".
        closed_at = datetime.strptime(closed_raw, timestamp_format) if closed_raw else None
        records.append((created_at, closed_at, entry['state']))

    if not records:
        # An empty frame has no DatetimeIndex, so resampling it would raise.
        return pd.DataFrame(columns=columns, index=pd.DatetimeIndex([], name='created_at'))

    frame = pd.DataFrame(records, columns=['created_at', 'closed_at', 'state'])

    # 0/1 indicator columns let the monthly aggregation be a plain sum.
    flags = pd.DataFrame(
        {
            'open_issues': (frame['state'] == 'open').to_numpy(dtype='int64'),
            'closed_issues': frame['closed_at'].notna().to_numpy(dtype='int64'),
        },
        index=pd.DatetimeIndex(frame['created_at'], name='created_at'),
    )

    # An explicit MonthEnd offset avoids the 'M' string alias, which has been
    # deprecated since pandas 2.2 in favour of 'ME'.
    monthly_data = flags.resample(pd.offsets.MonthEnd()).sum()

    monthly_data['cumulative_open'] = monthly_data['open_issues'].cumsum()
    monthly_data['cumulative_closed'] = monthly_data['closed_issues'].cumsum()

    return monthly_data

def plot_open_and_closed(data, data_type, log_scale=False, export_format=EXPORT_FORMAT):
    """Export and plot the cumulative open and closed counts over time.

    Args:
        data: Entries accepted by ``calculate_open_and_closed``.
        data_type: Human-readable singular name (e.g. ``'Issue'``); used in
            the plot labels and, lower-cased, in the output file names.
        log_scale: When true, the y-axis is logarithmic and the PNG file name
            gets a ``_log`` suffix.
        export_format: Forwarded unchanged to ``export_df``.
    """
    counts = calculate_open_and_closed(data)

    # Hand the table to the export helper before any plotting happens.
    export_df(counts, f'{data_type.lower()}_open_and_closed', export_format)

    fig = plt.figure(figsize=(18, 8))
    ax = fig.gca()
    series_styles = (
        ('cumulative_open', 'Cumulative Open', 'o'),
        ('cumulative_closed', 'Cumulative Closed', 'x'),
    )
    for column, legend_text, marker in series_styles:
        ax.plot(counts.index, counts[column], label=legend_text, marker=marker, linestyle='-')

    # Titles, axis labels, legend and grid.
    ax.set_title(f'Cumulative Open and Closed {data_type}s Over Time')
    ax.set_xlabel('Time')
    if log_scale:
        ax.set_yscale('log')
        ax.set_ylabel(f'Number of {data_type}s (Log Scale)')
    else:
        ax.set_ylabel(f'Number of {data_type}s')
    ax.legend()
    ax.grid(True)

    # One year-month tick label every `tick_spacing_months` months.
    tick_spacing_months = 3
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=tick_spacing_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    # Linear and log renderings go to different files so neither overwrites
    # the other.
    if EXPORT_PNGS:
        png_name = (
            f"{data_type.lower()}_open_and_closed_log.png"
            if log_scale
            else f"{data_type.lower()}_open_and_closed.png"
        )
        fig.savefig(png_name)

    plt.show()

# Export and plot cumulative open/closed counts for issues and pull requests,
# both with a logarithmic y-axis (so the PNGs get the "_log" suffix).
plot_open_and_closed(issues_data, 'Issue', log_scale=True, export_format=EXPORT_FORMAT)
plot_open_and_closed(pull_requests_data, 'Pull Request', log_scale=True, export_format=EXPORT_FORMAT)

In [None]:
# Extract relevant data
def extract_age(data, now=None):
    """Return the creation time and age in whole days of every open entry.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format) and ``state``.
        now: Reference time the ages are measured against. Defaults to
            ``datetime.now()``, read once so that all entries share the same
            reference. Pass an explicit value for reproducible results.

    Returns:
        List of ``(created_at, age_in_days)`` tuples, one per entry whose
        state is ``'open'``, in input order.
    """
    # NOTE(review): datetime.now() is naive local time; if the stored
    # timestamps are UTC the ages can be off by the UTC offset - confirm.
    reference = datetime.now() if now is None else now

    ages = []
    for entry in data:
        created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
        if entry['state'] == 'open':
            ages.append((created_at, (reference - created_at).days))
    return ages

def plot_age(data, data_type):
    """Draw a histogram of how old the currently open entries are.

    Args:
        data: Entries accepted by ``extract_age``.
        data_type: Human-readable singular name (e.g. ``'Issue'``) used in
            the title and the x-axis label.
    """
    age_frame = pd.DataFrame(extract_age(data), columns=['created_at', 'age'])
    age_frame['created_at'] = pd.to_datetime(age_frame['created_at'])

    # Oldest-created entries first.
    age_frame = age_frame.sort_values('created_at')

    fig = plt.figure(figsize=(12, 6))
    ax = fig.gca()
    ax.hist(age_frame['age'], bins=20, edgecolor='black')

    # Titles, axis labels and grid.
    ax.set_title(f'Distribution of Ages of Open {data_type}s')
    ax.set_xlabel(f'Age of Open {data_type}s (days)')
    ax.set_ylabel('Count')
    ax.grid(True)

    plt.show()

# Age histograms for the open issues and the open pull requests.
plot_age(issues_data, 'Issue')
plot_age(pull_requests_data, 'Pull Request')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Extract relevant data
def extract_age(data, now=None):
    """Return the creation time and age in whole days of every open entry.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format) and ``state``.
        now: Reference time the ages are measured against. Defaults to
            ``datetime.now()``, read once so that all entries share the same
            reference. Pass an explicit value for reproducible exports.

    Returns:
        List of ``(created_at, age_in_days)`` tuples, one per entry whose
        state is ``'open'``, in input order.
    """
    # NOTE(review): datetime.now() is naive local time; if the stored
    # timestamps are UTC the ages can be off by the UTC offset - confirm.
    reference = datetime.now() if now is None else now

    ages = []
    for entry in data:
        created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
        if entry['state'] == 'open':
            ages.append((created_at, (reference - created_at).days))
    return ages

def plot_age(data, data_type, export_format=EXPORT_FORMAT):
    """Export and plot a histogram of the ages of currently open entries.

    Args:
        data: Entries accepted by ``extract_age``.
        data_type: Human-readable singular name (e.g. ``'Issue'``); used in
            the plot labels and, lower-cased, in the output file names.
        export_format: Forwarded unchanged to ``export_df``.
    """
    # One row per open entry: creation time and age in days.
    age_frame = pd.DataFrame(extract_age(data), columns=['created_at', 'age'])
    age_frame['created_at'] = pd.to_datetime(age_frame['created_at'])

    # Hand the table to the export helper before any plotting happens.
    export_df(age_frame, f'{data_type.lower()}_ages', export_format)

    fig = plt.figure(figsize=(12, 6))
    ax = fig.gca()
    ax.hist(age_frame['age'], bins=20, edgecolor='black')

    # Titles, axis labels and grid.
    ax.set_title(f'Distribution of Ages of Open {data_type}s')
    ax.set_xlabel(f'Age of Open {data_type}s (days)')
    ax.set_ylabel('Count')
    ax.grid(True)

    # Save before show(), while the figure is still the active one.
    if EXPORT_PNGS:
        fig.savefig(f"{data_type.lower()}_ages.png")

    plt.show()

# Export and plot the age distribution of open issues and open pull requests.
plot_age(issues_data, 'Issue', export_format=EXPORT_FORMAT)
plot_age(pull_requests_data, 'Pull Request', export_format=EXPORT_FORMAT)

In [None]:
# Extract relevant data
def extract_open_counts(data):
    """Collect the creation timestamps of all entries that are still open.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format) and ``state``.

    Returns:
        List of ``datetime`` objects, one per entry whose state is
        ``'open'``, in input order.
    """
    # Every timestamp is parsed, including those of closed entries, so a
    # malformed value is reported regardless of the entry's state.
    parsed = (
        (datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S"), item['state'])
        for item in data
    )
    return [opened_on for opened_on, status in parsed if status == 'open']

def plot_open_counts(data, data_type):
    """Draw a bar chart of still-open entries grouped by creation month.

    Args:
        data: Entries accepted by ``extract_open_counts``.
        data_type: Human-readable singular name (e.g. ``'Issue'``) used in
            the title and the y-axis label.
    """
    opened = pd.DataFrame(extract_open_counts(data), columns=['created_at'])
    opened['created_at'] = pd.to_datetime(opened['created_at'])

    # Count entries per calendar month, then turn the monthly periods back
    # into timestamps so they can be used as bar positions.
    creation_month = opened['created_at'].dt.to_period('M')
    per_month = opened.groupby(creation_month).size().reset_index(name='count')
    per_month['created_at'] = per_month['created_at'].dt.to_timestamp()

    fig = plt.figure(figsize=(18, 8))
    ax = fig.gca()
    ax.bar(per_month['created_at'], per_month['count'], width=20, align='center')

    # Titles, axis labels and grid.
    ax.set_title(f'Number of Open {data_type}s Over Time')
    ax.set_xlabel('Creation Date')
    ax.set_ylabel(f'Number of Open {data_type}s')
    ax.grid(True)

    # Counts are whole numbers, so restrict the y ticks to integers.
    ax.yaxis.get_major_locator().set_params(integer=True)

    # One year-month tick label every `tick_spacing_months` months.
    tick_spacing_months = 3
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=tick_spacing_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    plt.show()

# Bar charts of still-open issues and pull requests by creation month.
plot_open_counts(issues_data, 'Issue')
plot_open_counts(pull_requests_data, 'Pull Request')

In [None]:
import pandas as pd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from datetime import datetime

# Extract relevant data
def extract_open_counts(data):
    """Collect the creation timestamps of all entries that are still open.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format) and ``state``.

    Returns:
        List of ``datetime`` objects, one per entry whose state is
        ``'open'``, in input order.
    """
    # Every timestamp is parsed, including those of closed entries, so a
    # malformed value is reported regardless of the entry's state.
    parsed = (
        (datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S"), item['state'])
        for item in data
    )
    return [opened_on for opened_on, status in parsed if status == 'open']

def plot_open_counts(data, data_type, export_format=EXPORT_FORMAT):
    """Export and plot still-open entries grouped by creation month.

    Args:
        data: Entries accepted by ``extract_open_counts``.
        data_type: Human-readable singular name (e.g. ``'Issue'``); used in
            the plot labels and, lower-cased, in the output file names.
        export_format: Forwarded unchanged to ``export_df``.
    """
    opened = pd.DataFrame(extract_open_counts(data), columns=['created_at'])
    opened['created_at'] = pd.to_datetime(opened['created_at'])

    # Count entries per calendar month, then turn the monthly periods back
    # into timestamps so they can be used as bar positions.
    creation_month = opened['created_at'].dt.to_period('M')
    per_month = opened.groupby(creation_month).size().reset_index(name='count')
    per_month['created_at'] = per_month['created_at'].dt.to_timestamp()

    # Hand the table to the export helper before any plotting happens.
    export_df(per_month, f'{data_type.lower()}_open_counts', export_format)

    fig = plt.figure(figsize=(18, 8))
    ax = fig.gca()
    ax.bar(per_month['created_at'], per_month['count'], width=20, align='center')

    # Titles, axis labels and grid.
    ax.set_title(f'Number of Open {data_type}s Over Time')
    ax.set_xlabel('Creation Date')
    ax.set_ylabel(f'Number of Open {data_type}s')
    ax.grid(True)

    # Counts are whole numbers, so restrict the y ticks to integers.
    ax.yaxis.get_major_locator().set_params(integer=True)

    # One year-month tick label every `tick_spacing_months` months.
    tick_spacing_months = 3
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=tick_spacing_months))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    # Save before show(), while the figure is still the active one.
    if EXPORT_PNGS:
        fig.savefig(f"{data_type.lower()}_open_counts.png")

    plt.show()

# Export and plot the monthly counts of still-open issues and pull requests.
plot_open_counts(issues_data, 'Issue', export_format=EXPORT_FORMAT)
plot_open_counts(pull_requests_data, 'Pull Request', export_format=EXPORT_FORMAT)

In [None]:
# # Extract commit data from pull requests
# def extract_commit_data_from_prs(pull_requests_data):
#     commit_info = []
#     for pr in pull_requests_data:
#         for commit in pr['commits']:
#             commit_date = datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S")
#             changes = sum(file['additions'] + file['deletions'] for file in pr['files'])
#             commit_info.append((commit_date, changes))
#     return commit_info

# # Plot commit frequency and size
# def plot_commit_metrics(commit_info):
#     df = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
#     df['commit_date'] = pd.to_datetime(df['commit_date'])

#     # Group by day
#     daily_commits = df.groupby(df['commit_date'].dt.date).size()
#     daily_changes = df.groupby(df['commit_date'].dt.date)['changes'].mean()

#     fig, ax1 = plt.subplots(figsize=(18, 8))

#     # Bar plot for daily commits
#     ax1.bar(daily_commits.index, daily_commits.values, width=0.8, alpha=0.6, label='Daily Commits', color='blue')
#     ax1.set_xlabel('Date')
#     ax1.set_ylabel('Number of Commits')
#     ax1.legend(loc='upper left')
#     ax1.grid(True)

#     # Secondary axis for average commit size
#     ax2 = ax1.twinx()
#     ax2.plot(daily_changes.index, daily_changes.values, linestyle='-', color='red', label='Average Commit Size')
#     ax2.set_ylabel('Average Commit Size (lines)')
#     ax2.legend(loc='upper right')

#     plt.title('Commit Frequency and Average Commit Size Over Time')
#     plt.show()

# # Example usage:
# commit_info = extract_commit_data_from_prs(pull_requests_data)
# plot_commit_metrics(commit_info)


In [None]:
# def plot_commit_frequency_and_size(data, months_per_label=3):
#     commits = []

#     for pr in data:
#         for commit in pr['commits']:
#             commit_date = datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S")
#             commit_size = pr['additions'] + pr['deletions']
#             commits.append((commit_date, commit_size))

#     # Create a DataFrame
#     df_commits = pd.DataFrame(commits, columns=['date', 'size'])

#     # Convert dates to datetime format
#     df_commits['date'] = pd.to_datetime(df_commits['date'])

#     # Resample data by day and calculate the number of commits and average commit size
#     df_daily = df_commits.resample('D', on='date').agg({'size': ['count', 'mean']}).reset_index()
#     df_daily.columns = ['date', 'commit_count', 'avg_commit_size']

#     # Plotting the data
#     fig, ax1 = plt.subplots(figsize=(18, 8))

#     ax1.plot(df_daily['date'], df_daily['commit_count'], color='blue', label='Daily Commits')
#     ax1.set_xlabel('Date')
#     ax1.set_ylabel('Number of Commits', color='blue')
#     ax1.tick_params(axis='y', labelcolor='blue')

#     ax2 = ax1.twinx()
#     ax2.plot(df_daily['date'], df_daily['avg_commit_size'], color='red', label='Average Commit Size')
#     ax2.set_ylabel('Average Commit Size (lines)', color='red')
#     ax2.tick_params(axis='y', labelcolor='red')

#     fig.tight_layout()
#     plt.title('Commit Frequency and Average Commit Size Over Time')

#     # Improve the date formatting and set custom granularity
#     plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
#     plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
#     plt.gcf().autofmt_xdate()

#     fig.legend(loc='upper left', bbox_to_anchor=(0.1, 0.9))
#     plt.grid(True)
#     plt.show()

# plot_commit_frequency_and_size(pull_requests_data, months_per_label=3)

In [None]:
def plot_commit_frequency_and_size(data, log_scale=False):
    """Plot smoothed monthly commit counts and average commit size.

    Commit counts are drawn as bars on the left y-axis and the average size
    as a line on a second, right-hand y-axis. Both series are smoothed with a
    3-month rolling mean.

    Args:
        data: Iterable of pull-request dicts with the keys ``commits`` (an
            iterable of dicts with a ``date`` string in
            ``%Y-%m-%dT%H:%M:%S`` format), ``additions`` and ``deletions``.
        log_scale: When true, the size axis is logarithmic.
    """
    # NOTE: each commit is assigned its pull request's total
    # additions + deletions, so "size" is a PR-level figure repeated for every
    # commit of that PR rather than a per-commit diff size.
    commits = [
        (
            datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S"),
            pr['additions'] + pr['deletions'],
        )
        for pr in data
        for commit in pr['commits']
    ]

    df_commits = pd.DataFrame(commits, columns=['date', 'size']).set_index('date')

    # An explicit MonthEnd offset avoids the 'M' string alias, which has been
    # deprecated since pandas 2.2 in favour of 'ME'.
    df_monthly = df_commits.resample(pd.offsets.MonthEnd()).agg({'size': ['mean', 'count']})
    df_monthly.columns = ['avg_size', 'monthly_commits']

    # Months without commits have no mean; treat them as size 0.
    df_monthly = df_monthly.fillna(0)

    # 3-month rolling mean to smooth month-to-month noise.
    for column in ('monthly_commits', 'avg_size'):
        df_monthly[column] = df_monthly[column].rolling(window=3, min_periods=1).mean()

    fig, ax1 = plt.subplots(figsize=(18, 8))

    # Left axis: commit counts as bars (bar width is in days).
    ax1.bar(df_monthly.index, df_monthly['monthly_commits'], width=20, color='blue', alpha=0.5, label='Monthly Commits')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Commits', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Right axis: average size as a line sharing the same x-axis.
    ax2 = ax1.twinx()
    ax2.plot(df_monthly.index, df_monthly['avg_size'], color='red', label='Average Commit Size')
    ax2.set_ylabel('Average Commit Size (lines)', color='red')
    if log_scale:
        ax2.set_yscale('log')
    ax2.tick_params(axis='y', labelcolor='red')

    plt.title('Commit Frequency and Average Commit Size Over Time')
    fig.tight_layout()
    plt.show()

# Plot commit frequency and size twice: first with a linear size axis, then
# with a logarithmic one (second positional argument is log_scale).
plot_commit_frequency_and_size(pull_requests_data)
plot_commit_frequency_and_size(pull_requests_data, True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

def plot_commit_frequency_and_size(data, log_scale=False, export_format=EXPORT_FORMAT):
    """Export and plot smoothed monthly commit counts and average commit size.

    Commit counts are drawn as bars on the left y-axis and the average size
    as a line on a second, right-hand y-axis. Both series are smoothed with a
    3-month rolling mean before being exported and plotted.

    Args:
        data: Iterable of pull-request dicts with the keys ``commits`` (an
            iterable of dicts with a ``date`` string in
            ``%Y-%m-%dT%H:%M:%S`` format), ``additions`` and ``deletions``.
        log_scale: When true, the size axis is logarithmic and the PNG file
            name gets a ``_log`` suffix.
        export_format: Forwarded unchanged to ``export_df``.
    """
    # NOTE: each commit is assigned its pull request's total
    # additions + deletions, so "size" is a PR-level figure repeated for every
    # commit of that PR rather than a per-commit diff size.
    commits = [
        (
            datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S"),
            pr['additions'] + pr['deletions'],
        )
        for pr in data
        for commit in pr['commits']
    ]

    df_commits = pd.DataFrame(commits, columns=['date', 'size']).set_index('date')

    # An explicit MonthEnd offset avoids the 'M' string alias, which has been
    # deprecated since pandas 2.2 in favour of 'ME'.
    df_monthly = df_commits.resample(pd.offsets.MonthEnd()).agg({'size': ['mean', 'count']})
    df_monthly.columns = ['avg_size', 'monthly_commits']

    # Months without commits have no mean; treat them as size 0.
    df_monthly = df_monthly.fillna(0)

    # 3-month rolling mean to smooth month-to-month noise.
    for column in ('monthly_commits', 'avg_size'):
        df_monthly[column] = df_monthly[column].rolling(window=3, min_periods=1).mean()

    # Hand the smoothed table to the export helper before plotting.
    export_df(df_monthly, 'commit_frequency_and_size', export_format)

    fig, ax1 = plt.subplots(figsize=(18, 8))

    # Left axis: commit counts as bars (bar width is in days).
    ax1.bar(df_monthly.index, df_monthly['monthly_commits'], width=20, color='blue', alpha=0.5, label='Monthly Commits')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Commits', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Right axis: average size as a line sharing the same x-axis.
    ax2 = ax1.twinx()
    ax2.plot(df_monthly.index, df_monthly['avg_size'], color='red', label='Average Commit Size')
    ax2.set_ylabel('Average Commit Size (lines)', color='red')
    if log_scale:
        ax2.set_yscale('log')
    ax2.tick_params(axis='y', labelcolor='red')

    plt.title('Commit Frequency and Average Commit Size Over Time')
    fig.tight_layout()

    # Linear and log renderings go to different files. The log name was
    # previously misspelled "..._sized_log.png"; it now follows the
    # "<stem>_log.png" pattern used by the other plots.
    if EXPORT_PNGS:
        if log_scale:
            plt.savefig("commit_frequency_and_size_log.png")
        else:
            plt.savefig("commit_frequency_and_size.png")
    plt.show()

# Export and plot commit frequency and size, once with a linear and once with
# a logarithmic size axis.
plot_commit_frequency_and_size(pull_requests_data, log_scale=False, export_format=EXPORT_FORMAT)
plot_commit_frequency_and_size(pull_requests_data, log_scale=True, export_format=EXPORT_FORMAT)

In [None]:
def plot_commit_frequency_and_size(data, log_scale=False):
    """Plot monthly commit counts and average size, ignoring early outliers.

    Same chart as the unfiltered variant, except that among the 20 earliest
    commits those whose size lies outside the 1.5 * IQR fences (computed from
    those 20 commits only) are dropped before aggregating.

    Args:
        data: Iterable of pull-request dicts with the keys ``commits`` (an
            iterable of dicts with a ``date`` string in
            ``%Y-%m-%dT%H:%M:%S`` format), ``additions`` and ``deletions``.
        log_scale: When true, the size axis is logarithmic.
    """
    # NOTE: each commit is assigned its pull request's total
    # additions + deletions, so "size" is a PR-level figure repeated for every
    # commit of that PR rather than a per-commit diff size.
    commits = [
        (
            datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S"),
            pr['additions'] + pr['deletions'],
        )
        for pr in data
        for commit in pr['commits']
    ]

    df_commits = pd.DataFrame(commits, columns=['date', 'size']).sort_values('date')

    # Only the first 20 commits (by date) are candidates for removal.
    initial_commits = df_commits.head(20)
    remaining_commits = df_commits.iloc[20:]

    # Tukey fences: keep sizes within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
    q1 = initial_commits['size'].quantile(0.25)
    q3 = initial_commits['size'].quantile(0.75)
    iqr = q3 - q1
    within_fences = initial_commits['size'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    filtered_initial_commits = initial_commits[within_fences]

    df_commits_filtered = pd.concat([filtered_initial_commits, remaining_commits]).set_index('date')

    # An explicit MonthEnd offset avoids the 'M' string alias, which has been
    # deprecated since pandas 2.2 in favour of 'ME'.
    df_monthly = df_commits_filtered.resample(pd.offsets.MonthEnd()).agg({'size': ['mean', 'count']})
    df_monthly.columns = ['avg_size', 'monthly_commits']

    # Months without commits have no mean; treat them as size 0.
    df_monthly = df_monthly.fillna(0)

    # 3-month rolling mean to smooth month-to-month noise.
    for column in ('monthly_commits', 'avg_size'):
        df_monthly[column] = df_monthly[column].rolling(window=3, min_periods=1).mean()

    fig, ax1 = plt.subplots(figsize=(18, 8))

    # Left axis: commit counts as bars (bar width is in days).
    ax1.bar(df_monthly.index, df_monthly['monthly_commits'], width=20, color='blue', alpha=0.5, label='Monthly Commits')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Commits', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Right axis: average size as a line sharing the same x-axis.
    ax2 = ax1.twinx()
    ax2.plot(df_monthly.index, df_monthly['avg_size'], color='red', label='Average Commit Size')
    ax2.set_ylabel('Average Commit Size (lines)', color='red')
    if log_scale:
        ax2.set_yscale('log')
    ax2.tick_params(axis='y', labelcolor='red')

    plt.title('Commit Frequency and Average Commit Size Over Time')
    fig.tight_layout()
    plt.show()

# Plot the outlier-filtered variant defined in this cell, first with a linear
# size axis and then with a logarithmic one (second positional argument is
# log_scale).
plot_commit_frequency_and_size(pull_requests_data)
plot_commit_frequency_and_size(pull_requests_data, True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

def plot_commit_frequency_and_size(data, log_scale=False, export_format=EXPORT_FORMAT):
    """Export and plot monthly commit counts and size, ignoring early outliers.

    Among the 20 earliest commits, those whose size lies outside the
    1.5 * IQR fences (computed from those 20 commits only) are dropped. The
    rest is aggregated per month, smoothed with a 3-month rolling mean,
    exported and plotted (counts as bars, average size as a line on a second
    y-axis).

    Args:
        data: Iterable of pull-request dicts with the keys ``commits`` (an
            iterable of dicts with a ``date`` string in
            ``%Y-%m-%dT%H:%M:%S`` format), ``additions`` and ``deletions``.
        log_scale: When true, the size axis is logarithmic and the PNG file
            name gets a ``_log`` suffix.
        export_format: Forwarded unchanged to ``export_df``.
    """
    # NOTE: each commit is assigned its pull request's total
    # additions + deletions, so "size" is a PR-level figure repeated for every
    # commit of that PR rather than a per-commit diff size.
    commits = [
        (
            datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S"),
            pr['additions'] + pr['deletions'],
        )
        for pr in data
        for commit in pr['commits']
    ]

    df_commits = pd.DataFrame(commits, columns=['date', 'size']).sort_values('date')

    # Only the first 20 commits (by date) are candidates for removal.
    initial_commits = df_commits.head(20)
    remaining_commits = df_commits.iloc[20:]

    # Tukey fences: keep sizes within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR].
    q1 = initial_commits['size'].quantile(0.25)
    q3 = initial_commits['size'].quantile(0.75)
    iqr = q3 - q1
    within_fences = initial_commits['size'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    filtered_initial_commits = initial_commits[within_fences]

    df_commits_filtered = pd.concat([filtered_initial_commits, remaining_commits]).set_index('date')

    # An explicit MonthEnd offset avoids the 'M' string alias, which has been
    # deprecated since pandas 2.2 in favour of 'ME'.
    df_monthly = df_commits_filtered.resample(pd.offsets.MonthEnd()).agg({'size': ['mean', 'count']})
    df_monthly.columns = ['avg_size', 'monthly_commits']

    # Months without commits have no mean; treat them as size 0.
    df_monthly = df_monthly.fillna(0)

    # 3-month rolling mean to smooth month-to-month noise.
    for column in ('monthly_commits', 'avg_size'):
        df_monthly[column] = df_monthly[column].rolling(window=3, min_periods=1).mean()

    # Hand the smoothed table to the export helper before plotting.
    export_df(df_monthly, 'commit_frequency_and_size_filtered', export_format)

    fig, ax1 = plt.subplots(figsize=(18, 8))

    # Left axis: commit counts as bars (bar width is in days).
    ax1.bar(df_monthly.index, df_monthly['monthly_commits'], width=20, color='blue', alpha=0.5, label='Monthly Commits')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Commits', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # Right axis: average size as a line sharing the same x-axis.
    ax2 = ax1.twinx()
    ax2.plot(df_monthly.index, df_monthly['avg_size'], color='red', label='Average Commit Size')
    ax2.set_ylabel('Average Commit Size (lines)', color='red')
    if log_scale:
        ax2.set_yscale('log')
    ax2.tick_params(axis='y', labelcolor='red')

    plt.title('Commit Frequency and Average Commit Size Over Time (Filtered)')
    fig.tight_layout()

    # Linear and log renderings go to different files so neither overwrites
    # the other.
    if EXPORT_PNGS:
        if log_scale:
            plt.savefig("commit_frequency_and_size_filtered_log.png")
        else:
            plt.savefig("commit_frequency_and_size_filtered.png")
    plt.show()

# Export and plot the outlier-filtered commit metrics, once with a linear and
# once with a logarithmic size axis.
plot_commit_frequency_and_size(pull_requests_data, log_scale=False, export_format=EXPORT_FORMAT)
plot_commit_frequency_and_size(pull_requests_data, log_scale=True, export_format=EXPORT_FORMAT)

## Label Analysis

### Disclaimer: This is specific to the vaadin/flow repo and won't work for others, as label names often differ between repos

In [None]:
def collect_labels(data):
    """Count how many entries carry each label.

    Args:
        data: Iterable of dicts with a ``labels`` key holding an iterable of
            hashable labels; a falsy value (e.g. ``None`` or ``[]``) is
            treated as "no labels".

    Returns:
        Dict mapping each label to its number of occurrences, in order of
        first appearance.
    """
    tally = {}
    for item in data:
        # `or ()` skips entries whose label list is missing or empty.
        for name in item['labels'] or ():
            tally[name] = tally.get(name, 0) + 1
    return tally

# Tally label usage across all issues; the bare name on the last line makes
# the notebook display the resulting dict as the cell output.
labels = collect_labels(issues_data)
labels

In [None]:
# Label groups to chart: each key is a group name (used as the plot title) and
# each value lists the raw issue labels that belong to that group. The merging
# of 'enhancement' with 'feature request' is done in extract_label_data, not
# here.
label_groups = {
    'Requests': ['bug', 'enhancement', 'feature request', 'documentation', 'question', 'help wanted'],
    'Impact': ['Impact: High', 'Impact: Low'],
    'Severity': ['Severity: Blocker', 'Severity: Minor'],
    'Code Quality': ['code quality', 'refactor', 'breaking change']
}

# Extract relevant data
def extract_label_data(data, label_groups):
    """Collect ``(created_at, label)`` pairs for every configured label group.

    Some raw labels are reported under a shared display name:
    ``'feature request'`` becomes ``'enhancement'``, and ``'documentation'``,
    ``'question'`` and ``'help wanted'`` become ``'support/documentation'``.
    Group membership is always decided on the raw label, so a merged display
    name never leaks into a group that does not list the raw label.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format) and ``labels`` (iterable of labels;
            a falsy value is treated as "no labels").
        label_groups: Dict mapping a group name to the list of raw labels
            that belong to it.

    Returns:
        Dict mapping every group name to a list of
        ``(created_at, display_label)`` tuples in input order.
    """
    display_names = {
        'feature request': 'enhancement',
        'documentation': 'support/documentation',
        'question': 'support/documentation',
        'help wanted': 'support/documentation',
    }

    label_data = {group: [] for group in label_groups}

    for entry in data:
        created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
        # `or ()` tolerates entries whose labels are None, as collect_labels does.
        for label in entry['labels'] or ():
            display_label = display_names.get(label, label)
            for group, members in label_groups.items():
                if label in members:
                    label_data[group].append((created_at, display_label))

    return label_data

def plot_label_group(data, label_group, group_name, months_per_label=3):
    """Plot how often each label of one group was used per month.

    Args:
        data: List of ``(created_at, label)`` tuples for this group.
        label_group: Raw labels of the group. Currently unused; kept so that
            existing callers keep working.
        group_name: Group name, used as plot title and legend title.
        months_per_label: Spacing, in months, between x-axis tick labels.
    """
    df = pd.DataFrame(data, columns=['created_at', 'label'])

    # A group without any matching entries (likely for repositories that use
    # different label names) would make DataFrame.plot raise, so skip it.
    if df.empty:
        print(f'No entries found for label group "{group_name}"; skipping plot.')
        return

    df['created_at'] = pd.to_datetime(df['created_at'])
    df = df.set_index('created_at')

    # Count entries per (month, label) and pivot the labels into columns.
    # An explicit MonthEnd offset avoids the 'M' string alias, which has been
    # deprecated since pandas 2.2 in favour of 'ME'.
    by_month = pd.Grouper(freq=pd.offsets.MonthEnd())
    monthly_data = df.groupby([by_month, 'label']).size().unstack(fill_value=0)

    # One line per label.
    monthly_data.plot(figsize=(18, 8), marker='o')

    # Titles, axis labels, grid and legend.
    plt.title(f'{group_name} Over Time')
    plt.xlabel('Time')
    plt.ylabel('Count')
    plt.grid(True)
    plt.legend(title=group_name)

    # Year-month tick labels at the requested spacing, rotated for legibility.
    plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
    plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    plt.gcf().autofmt_xdate()

    plt.show()

# Collect the (created_at, label) pairs of all issues, one list per label group
label_data = extract_label_data(issues_data, label_groups)

# Draw one chart per label group, passing the group's raw label list and name
for group_name, data in label_data.items():
    plot_label_group(data, label_groups[group_name], group_name)

In [None]:
import pandas as pd
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
from datetime import datetime

# Label groups to chart: each key is a group name (used as the plot title and
# in the export file names) and each value lists the raw issue labels that
# belong to that group. The merging of 'enhancement' with 'feature request'
# is done in extract_label_data, not here.
label_groups = {
    'Requests': ['bug', 'enhancement', 'feature request', 'documentation', 'question', 'help wanted'],
    'Impact': ['Impact: High', 'Impact: Low'],
    'Severity': ['Severity: Blocker', 'Severity: Minor'],
    'Code Quality': ['code quality', 'refactor', 'breaking change']
}

# Extract relevant data
def extract_label_data(data, label_groups):
    """Collect ``(created_at, label)`` pairs for every configured label group.

    Some raw labels are reported under a shared display name:
    ``'feature request'`` becomes ``'enhancement'``, and ``'documentation'``,
    ``'question'`` and ``'help wanted'`` become ``'support/documentation'``.
    Group membership is always decided on the raw label, so a merged display
    name never leaks into a group that does not list the raw label.

    Args:
        data: Iterable of dicts with the keys ``created_at`` (string in
            ``%Y-%m-%dT%H:%M:%S`` format) and ``labels`` (iterable of labels;
            a falsy value is treated as "no labels").
        label_groups: Dict mapping a group name to the list of raw labels
            that belong to it.

    Returns:
        Dict mapping every group name to a list of
        ``(created_at, display_label)`` tuples in input order.
    """
    display_names = {
        'feature request': 'enhancement',
        'documentation': 'support/documentation',
        'question': 'support/documentation',
        'help wanted': 'support/documentation',
    }

    label_data = {group: [] for group in label_groups}

    for entry in data:
        created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
        # `or ()` tolerates entries whose labels are None, as collect_labels does.
        for label in entry['labels'] or ():
            display_label = display_names.get(label, label)
            for group, members in label_groups.items():
                if label in members:
                    label_data[group].append((created_at, display_label))

    return label_data

def plot_label_group(data, label_group, group_name, months_per_label=3, export_format=EXPORT_FORMAT):
    """Export and plot the monthly number of occurrences of each label in a group.

    Args:
        data: Iterable of ``(created_at, label)`` pairs.
        label_group: Labels belonging to the group. Currently unused; kept so
            existing callers keep working.
        group_name: Used for the plot title, the legend title and (lower-cased,
            spaces replaced by underscores) the export file name.
        months_per_label: Spacing, in months, between x-axis tick labels.
        export_format: Passed through to ``export_df``.

    Returns:
        None. The monthly counts are exported via ``export_df``, the figure is
        saved as a PNG when ``EXPORT_PNGS`` is truthy, and the plot is shown.
        When ``data`` is empty nothing is exported or plotted.
    """
    df = pd.DataFrame(data, columns=['created_at', 'label'])
    if df.empty:
        # A group whose labels never occur in the data yields no rows. An empty
        # frame cannot be plotted, so skip it instead of aborting the loop
        # over the remaining groups.
        print(f"No data for label group '{group_name}'; skipping export and plot.")
        return

    df['created_at'] = pd.to_datetime(df['created_at'])

    # Count occurrences of each label per calendar month (one column per label).
    # NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
    # 'ME' — confirm against the pandas version this notebook is pinned to.
    df.set_index('created_at', inplace=True)
    monthly_data = df.groupby([pd.Grouper(freq='M'), 'label']).size().unstack(fill_value=0)

    # Export the data if specified
    file_stem = f'{group_name.lower().replace(" ", "_")}_label_data'
    export_df(monthly_data, file_stem, export_format)

    # x_compat=True keeps the x-axis in Matplotlib date units, which the
    # mdates locator/formatter set below require. Without it pandas may switch
    # a regular monthly index to its own period-based axis units.
    ax = monthly_data.plot(figsize=(18, 8), marker='o', x_compat=True)

    # Formatting the plot
    ax.set_title(f'{group_name} Over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel('Count')
    ax.grid(True)
    ax.legend(title=group_name)

    # Improve the date formatting
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax.figure.autofmt_xdate()

    if EXPORT_PNGS:
        ax.figure.savefig(f"{file_stem}.png")

    # Display the plot
    plt.show()

# Extract label data: one list of (created_at, label) tuples per group name.
# NOTE(review): issues_data is loaded in an earlier cell — confirm it is
# available before running this one.
label_data = extract_label_data(issues_data, label_groups)

# Plot and export each label group. export_format is passed explicitly; it is
# the same EXPORT_FORMAT constant that plot_label_group uses as its default.
for group_name, data in label_data.items():
    plot_label_group(data, label_groups[group_name], group_name, export_format=EXPORT_FORMAT)

### Finding out what caused the spike in Bugs in 2019

In [None]:
# # Convert issues data to DataFrame
# issues_df = pd.DataFrame(issues_data)

# # Convert 'created_at' to datetime
# issues_df['created_at'] = pd.to_datetime(issues_df['created_at'])

# # Filter issues by date range
# start_date = '2019-06-16' # 2 weeks before the time interval of the spike started
# end_date = '2019-09-30' # end of the time interval of the spike
# filtered_issues = issues_df[(issues_df['created_at'] >= start_date) & (issues_df['created_at'] < end_date)]

# # Further filter by 'bug' label
# bug_issues = filtered_issues[filtered_issues['labels'].apply(lambda x: 'bug' in x)]

# # Print titles and descriptions of bug issues in the specified date range
# for index, issue in bug_issues.iterrows():
#     print(f"Issue #{issue['number']}: {issue['title']}")
#     print(f"Description: {issue['description']}\n")

# # Analyze the corresponding pull requests
# pr_numbers = [pr['number'] for pr in pull_requests_data if any(fixes_issue in issue['description'] for fixes_issue in bug_issues['number'].astype(str))]

# # Print the related pull requests
# for pr in pull_requests_data:
#     if pr['number'] in pr_numbers:
#         print(f"PR #{pr['number']}: {pr['title']}")
#         print(f"Description: {pr['description']}\n")


In [None]:
# # Define the time interval
# start_date = datetime(2019, 7, 16)
# end_date = datetime(2019, 8, 31)

# # Convert issues data to DataFrame
# issues_df = pd.DataFrame(issues_data)
# issues_df['created_at'] = pd.to_datetime(issues_df['created_at'])
# issues_df['closed_at'] = pd.to_datetime(issues_df['closed_at'])

# # Convert pull requests data to DataFrame
# prs_df = pd.DataFrame(pull_requests_data)
# prs_df['created_at'] = pd.to_datetime(prs_df['created_at'])
# prs_df['merged_at'] = pd.to_datetime(prs_df['merged_at'])
# prs_df['closed_at'] = pd.to_datetime(prs_df['closed_at'])

# # Filter data by the defined interval
# filtered_issues = issues_df[(issues_df['created_at'] >= start_date) & (issues_df['created_at'] <= end_date)]
# filtered_prs = prs_df[(prs_df['created_at'] >= start_date) & (prs_df['created_at'] <= end_date)]

# def plot_counts_over_time(issues_df, prs_df):
#     plt.figure(figsize=(18, 8))
    
#     # Plot issues
#     issues_counts = issues_df.groupby(issues_df['created_at'].dt.date).size()
#     plt.plot(issues_counts.index, issues_counts.values, linestyle='-', label='Issues')
    
#     # Plot pull requests
#     prs_counts = prs_df.groupby(prs_df['created_at'].dt.date).size()
#     plt.plot(prs_counts.index, prs_counts.values, linestyle='-', label='Pull Requests')
    
#     plt.title('Number of Issues and Pull Requests Over Time')
#     plt.xlabel('Date')
#     plt.ylabel('Count')
#     plt.legend()
#     plt.grid(True)
#     plt.show()

# plot_counts_over_time(filtered_issues, filtered_prs)

# def plot_turnaround_time(issues_df):
#     turnaround_times = issues_df[issues_df['closed_at'].notnull()].copy()
#     turnaround_times['turnaround_time'] = (turnaround_times['closed_at'] - turnaround_times['created_at']).dt.total_seconds() / 3600  # convert to hours

#     plt.figure(figsize=(18, 8))
#     plt.plot(turnaround_times['created_at'], turnaround_times['turnaround_time'], linestyle='-', label='Turnaround Time')

#     plt.title('Turnaround Time for Issues Over Time')
#     plt.xlabel('Creation Date')
#     plt.ylabel('Turnaround Time (hours)')
#     plt.legend()
#     plt.grid(True)
#     plt.show()

# plot_turnaround_time(filtered_issues)

# def plot_open_closed_ratio(issues_df):
#     issues_df['state'] = issues_df['closed_at'].apply(lambda x: 'closed' if pd.notnull(x) else 'open')
#     daily_open_issues = issues_df[issues_df['state'] == 'open'].groupby(issues_df['created_at'].dt.date).size().cumsum()
#     daily_closed_issues = issues_df[issues_df['state'] == 'closed'].groupby(issues_df['created_at'].dt.date).size().cumsum()

#     plt.figure(figsize=(18, 8))
#     plt.plot(daily_open_issues.index, daily_open_issues.values, linestyle='-', label='Open Issues')
#     plt.plot(daily_closed_issues.index, daily_closed_issues.values, linestyle='-', label='Closed Issues')

#     plt.title('Open vs. Closed Issues Over Time')
#     plt.xlabel('Date')
#     plt.ylabel('Cumulative Count')
#     plt.legend()
#     plt.grid(True)
#     plt.show()

# plot_open_closed_ratio(filtered_issues)


In [None]:
def extract_commit_data_from_prs(pull_requests_data):
    """Build one ``(commit_date, changes)`` tuple per commit of every pull request.

    ``changes`` is the sum of additions and deletions over all entries of the
    pull request's ``'files'`` list. It is a PR-wide figure, so every commit of
    the same pull request carries the same value.

    Args:
        pull_requests_data: Iterable of pull-request dicts with a ``'commits'``
            list (each commit has a ``'date'`` string in
            ``%Y-%m-%dT%H:%M:%S`` format) and a ``'files'`` list (each file has
            ``'additions'`` and ``'deletions'``).

    Returns:
        List of ``(datetime, int)`` tuples in input order.

    Raises:
        ValueError: If a commit date does not match the expected format.
    """
    commit_info = []
    for pr in pull_requests_data:
        commits = pr['commits']
        if commits:
            # The PR-wide size does not depend on the commit, so compute it
            # once per pull request rather than once per commit.
            changes = sum(file['additions'] + file['deletions'] for file in pr['files'])
            commit_info.extend(
                (datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S"), changes)
                for commit in commits
            )
    return commit_info

# Example usage: flatten every pull request into (commit_date, changes) tuples.
commit_info = extract_commit_data_from_prs(pull_requests_data)

# Filter commits by a specific time interval (if needed).
# Both bounds are inclusive, but end_date is 2019-08-31 00:00:00, so commits
# made later on Aug 31 fall outside the window.
# NOTE(review): confirm whether the whole of Aug 31 should be included.
start_date = datetime(2019, 7, 16)
end_date = datetime(2019, 8, 31)
filtered_commits = [commit for commit in commit_info if start_date <= commit[0] <= end_date]

def plot_commit_metrics(commit_info):
    """Plot daily commit counts (bars) and the daily mean ``changes`` value (line).

    Args:
        commit_info: Iterable of ``(commit_date, changes)`` pairs.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
    commits['commit_date'] = pd.to_datetime(commits['commit_date'])

    # A single grouping by calendar day feeds both series.
    per_day = commits.groupby(commits['commit_date'].dt.date)
    commits_per_day = per_day.size()
    mean_changes_per_day = per_day['changes'].mean()

    _, count_axis = plt.subplots(figsize=(18, 8))

    # Left axis: number of commits per day as bars.
    count_axis.bar(
        commits_per_day.index,
        commits_per_day.values,
        width=0.8,
        alpha=0.6,
        label='Daily Commits',
        color='blue',
    )
    count_axis.set_xlabel('Date')
    count_axis.set_ylabel('Number of Commits')
    count_axis.legend(loc='upper left')
    count_axis.grid(True)

    # Right axis: mean commit size per day as a line.
    size_axis = count_axis.twinx()
    size_axis.plot(
        mean_changes_per_day.index,
        mean_changes_per_day.values,
        marker='o',
        linestyle='-',
        color='red',
        label='Average Commit Size',
    )
    size_axis.set_ylabel('Average Commit Size (lines)')
    size_axis.legend(loc='upper right')

    plt.title('Commit Frequency and Average Commit Size Over Time')
    plt.show()

# Example usage: plot only the commits inside the start_date..end_date window
# selected earlier in this cell.
plot_commit_metrics(filtered_commits)

In [None]:
# # Define label groups and merge 'enhancement' and 'feature request'
# label_groups = {'Requests': ['enhancement', 'feature request', 'bug']}

# # Extract relevant data
# def extract_label_data(data, label_groups):
#     label_data = {group: [] for group in label_groups.keys()}
    
#     for entry in data:
#         created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
#         labels = entry['labels']
#         for label in labels:
#             for group, labels_list in label_groups.items():
#                 if label in labels_list:
#                     # Merge 'enhancement' and 'feature request' labels
#                     if label in ['enhancement', 'feature request']:
#                         label = 'enhancement/feature request'
#                     label_data[group].append((created_at, label))
    
#     return label_data

# def plot_label_group(data, label_group, group_name, months_per_label=3):
#     df = pd.DataFrame(data, columns=['created_at', 'label'])
    
#     # Convert dates to datetime format
#     df['created_at'] = pd.to_datetime(df['created_at'])
    
#     # Resample data by month
#     df.set_index('created_at', inplace=True)
#     monthly_data = df.groupby([pd.Grouper(freq='M'), 'label']).size().unstack(fill_value=0)
    
#     # Plotting the data
#     monthly_data.plot(figsize=(18, 8), marker='o')

#     # Formatting the plot
#     plt.title(f'{group_name} Over Time')
#     plt.xlabel('Time')
#     plt.ylabel('Count')
#     plt.grid(True)
#     plt.legend(title=group_name)

#     # Improve the date formatting
#     plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
#     plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
#     plt.gcf().autofmt_xdate()  # Rotate date labels

#     # Display the plot
#     plt.show()

# # Extract label data
# label_data = extract_label_data(issues_data, label_groups)

# # Extract commit data
# def extract_commit_data_from_prs(pull_requests_data):
#     commit_info = []
#     for pr in pull_requests_data:
#         for commit in pr['commits']:
#             commit_date = datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S")
#             # Calculate the changes as the sum of additions and deletions
#             changes = pr['additions'] + pr['deletions']
#             commit_info.append((commit_date, changes))
#     return commit_info

# commit_info = extract_commit_data_from_prs(pull_requests_data)

# # Filter commits by date and size to remove initial outliers
# def filter_initial_outliers(commit_info, threshold=0.01):
#     df_commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
#     initial_period_end = df_commits['commit_date'].min() + pd.Timedelta(days=30)
#     initial_commits = df_commits[df_commits['commit_date'] <= initial_period_end]
#     filtered_commits = df_commits[df_commits['commit_date'] > initial_period_end]

#     if not initial_commits.empty:
#         upper_limit = initial_commits['changes'].quantile(1 - threshold)
#         filtered_initial_commits = initial_commits[initial_commits['changes'] <= upper_limit]
#         return pd.concat([filtered_initial_commits, filtered_commits])
#     return df_commits

# filtered_commit_info = filter_initial_outliers(commit_info)

# def plot_commit_metrics_with_bugs(commit_info, bug_issues, start_date, end_date, log_scale=False):
#     # Convert commit info to DataFrame
#     df_commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
#     df_commits['commit_date'] = pd.to_datetime(df_commits['commit_date'])
    
#     # Filter by date
#     df_commits = df_commits[(df_commits['commit_date'] >= start_date) & (df_commits['commit_date'] <= end_date)]

#     # Group by day
#     daily_commits = df_commits.groupby(df_commits['commit_date'].dt.date).size()
#     daily_changes = df_commits.groupby(df_commits['commit_date'].dt.date)['changes'].mean()
    
#     # Convert bug issues to DataFrame and filter by date
#     df_bugs = pd.DataFrame(bug_issues, columns=['created_at', 'label'])
#     df_bugs['created_at'] = pd.to_datetime(df_bugs['created_at'])
#     df_bugs = df_bugs[(df_bugs['created_at'] >= start_date) & (df_bugs['created_at'] <= end_date)]
    
#     # Group bug issues by day
#     daily_bugs = df_bugs.groupby(df_bugs['created_at'].dt.date).size()
    
#     fig, ax1 = plt.subplots(figsize=(18, 8))

#     # Bar plot for daily commits
#     ax1.bar(daily_commits.index, daily_commits.values, width=0.8, alpha=0.6, label='Daily Commits', color='blue')
#     ax1.set_xlabel('Date')
#     ax1.set_ylabel('Number of Commits')
#     ax1.legend(loc='upper left')
#     ax1.grid(True)

#     # Plot bug issues
#     ax1.plot(daily_bugs.index, daily_bugs.values, 'g--', marker='x', label='Bug Issues')
    
#     # Secondary axis for average commit size
#     ax2 = ax1.twinx()
#     ax2.plot(daily_changes.index, daily_changes.values, marker='o', linestyle='-', color='red', label='Average Commit Size')
#     ax2.set_ylabel('Average Commit Size (lines)')
#     if log_scale: ax2.set_yscale('log')
#     ax2.legend(loc='upper right')

#     plt.title('Commit Frequency, Average Commit Size, and Bug Issues Over Time')
#     plt.show()

# # Define the time interval for the spike
# start_date = datetime(2019, 7, 16)
# end_date = datetime(2019, 8, 31)

# # Filter commits and bug issues
# filtered_commit_info = filter_initial_outliers(commit_info)
# bug_issues = [issue for issue in issues_data if 'bug' in issue['labels']]
# filtered_bug_issues = [issue for issue in bug_issues if start_date <= datetime.strptime(issue['created_at'], "%Y-%m-%dT%H:%M:%S") <= end_date]

# # Plot commit metrics with bug issues
# plot_commit_metrics_with_bugs(filtered_commit_info, filtered_bug_issues, start_date, end_date)
# plot_commit_metrics_with_bugs(filtered_commit_info, filtered_bug_issues, start_date, end_date, True)

In [None]:
# Single 'Requests' label group used in this cell. The dict only lists the raw
# labels; extract_label_data reports 'feature request' as 'enhancement' and
# 'documentation' / 'question' / 'help wanted' as 'support/documentation'.
label_groups = {'Requests': ['enhancement', 'feature request', 'bug', 'documentation', 'question', 'help wanted']}

# Extract relevant data
def extract_label_data(data, label_groups):
    """Return, per label group, the ``(created_at, label)`` pairs found in ``data``.

    Args:
        data: Iterable of issue dicts with a ``'created_at'`` string in
            ``%Y-%m-%dT%H:%M:%S`` format and a ``'labels'`` iterable of label
            names.
        label_groups: Mapping of group name to the raw label names it covers.

    Returns:
        Dict with one list of ``(datetime, label)`` tuples per group name.
        ``'feature request'`` is recorded as ``'enhancement'``, and
        ``'documentation'``, ``'question'`` and ``'help wanted'`` are recorded
        as ``'support/documentation'``.

    Raises:
        ValueError: If a ``'created_at'`` value does not match the format.
    """
    # Reporting name for labels that are merged; all others keep their own name.
    report_as = {
        'feature request': 'enhancement',
        'documentation': 'support/documentation',
        'question': 'support/documentation',
        'help wanted': 'support/documentation',
    }
    label_data = {group: [] for group in label_groups}

    for entry in data:
        created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
        for original_label in entry['labels']:
            for group, labels_list in label_groups.items():
                # Always match on the issue's original label. Overwriting the
                # loop variable with the merged name would make later groups
                # match a label the issue does not actually carry.
                if original_label in labels_list:
                    label_data[group].append(
                        (created_at, report_as.get(original_label, original_label))
                    )

    return label_data

def plot_label_group(data, label_group, group_name, months_per_label=3):
    """Plot the monthly number of occurrences of each label in a group.

    Args:
        data: Iterable of ``(created_at, label)`` pairs.
        label_group: Labels belonging to the group. Currently unused; kept so
            existing callers keep working.
        group_name: Used for the plot title and the legend title.
        months_per_label: Spacing, in months, between x-axis tick labels.

    Returns:
        None. The plot is shown with ``plt.show()``; nothing is drawn when
        ``data`` is empty.
    """
    df = pd.DataFrame(data, columns=['created_at', 'label'])
    if df.empty:
        # An empty frame cannot be plotted; skip the group instead of failing.
        print(f"No data for label group '{group_name}'; skipping plot.")
        return

    # Convert dates to datetime format
    df['created_at'] = pd.to_datetime(df['created_at'])

    # Count label occurrences per calendar month (one column per label)
    df.set_index('created_at', inplace=True)
    monthly_data = df.groupby([pd.Grouper(freq='M'), 'label']).size().unstack(fill_value=0)

    # x_compat=True keeps the x-axis in Matplotlib date units, which the
    # mdates locator/formatter set below require.
    ax = monthly_data.plot(figsize=(18, 8), marker='o', x_compat=True)

    # Formatting the plot
    ax.set_title(f'{group_name} Over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel('Count')
    ax.grid(True)
    ax.legend(title=group_name)

    # Improve the date formatting
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    ax.figure.autofmt_xdate()  # Rotate date labels

    # Display the plot
    plt.show()

# Extract label data for the 'Requests' group.
# NOTE(review): label_data is not read again in this cell — confirm whether
# this call is still needed here.
label_data = extract_label_data(issues_data, label_groups)

# Extract commit data
def extract_commit_data_from_prs(pull_requests_data):
    """Return one ``(commit_date, changes)`` tuple per commit of every pull request.

    ``changes`` is the pull request's ``'additions'`` plus ``'deletions'``, so
    all commits of one pull request share the same value. Commit dates are
    parsed from ``%Y-%m-%dT%H:%M:%S`` strings.
    """
    date_format = "%Y-%m-%dT%H:%M:%S"
    return [
        (datetime.strptime(commit['date'], date_format), pr['additions'] + pr['deletions'])
        for pr in pull_requests_data
        for commit in pr['commits']
    ]

# Flatten all pull requests into one list of (commit_date, changes) tuples.
# NOTE(review): pull_requests_data is loaded in an earlier cell — confirm it is
# available before running this one.
commit_info = extract_commit_data_from_prs(pull_requests_data)

# Filter commits by date and size to remove initial outliers
def filter_initial_outliers(commit_info, threshold=0.01):
    """Drop the largest commits from the first 30 days of the commit history.

    Args:
        commit_info: Iterable of ``(commit_date, changes)`` pairs.
        threshold: Upper-tail fraction; commits from the initial period whose
            ``changes`` exceed the ``1 - threshold`` quantile of that period
            are removed.

    Returns:
        DataFrame with columns ``commit_date`` and ``changes``: the surviving
        initial-period rows followed by all later rows. The unfiltered frame
        is returned when the initial period contains no rows.
    """
    commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])

    # The initial period spans 30 days starting at the earliest commit.
    cutoff = commits['commit_date'].min() + pd.Timedelta(days=30)
    early = commits[commits['commit_date'] <= cutoff]
    if early.empty:
        return commits

    later = commits[commits['commit_date'] > cutoff]
    size_cap = early['changes'].quantile(1 - threshold)
    return pd.concat([early[early['changes'] <= size_cap], later])

# Apply the outlier filter with its default threshold. The result is a
# DataFrame (columns commit_date / changes), not a list of tuples.
filtered_commit_info = filter_initial_outliers(commit_info)

def plot_commit_metrics_with_bugs(commit_info, bug_issues, start_date, end_date, log_scale=False):
    """Plot daily commits, mean commit size and daily bug issues on three y-axes.

    Args:
        commit_info: Data convertible to a DataFrame with columns
            ``commit_date`` and ``changes``.
        bug_issues: Data convertible to a DataFrame with a ``created_at``
            column. A ``label`` column is requested as well but not read.
        start_date: Inclusive lower bound applied to commits and bug issues.
        end_date: Inclusive upper bound applied to commits and bug issues.
        log_scale: When true, the commit-size axis uses a logarithmic scale.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Commits: restrict to the window, then aggregate per calendar day.
    commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
    commits['commit_date'] = pd.to_datetime(commits['commit_date'])
    commit_in_window = (commits['commit_date'] >= start_date) & (commits['commit_date'] <= end_date)
    commits = commits[commit_in_window]
    commits_by_day = commits.groupby(commits['commit_date'].dt.date)
    commit_counts = commits_by_day.size()
    mean_changes = commits_by_day['changes'].mean()

    # Bug issues: same window, counted per calendar day.
    bugs = pd.DataFrame(bug_issues, columns=['created_at', 'label'])
    bugs['created_at'] = pd.to_datetime(bugs['created_at'])
    bug_in_window = (bugs['created_at'] >= start_date) & (bugs['created_at'] <= end_date)
    bugs = bugs[bug_in_window]
    bug_counts = bugs.groupby(bugs['created_at'].dt.date).size()

    fig, commits_ax = plt.subplots(figsize=(18, 8))

    # First axis (left): daily commit counts as bars.
    commits_ax.bar(
        commit_counts.index,
        commit_counts.values,
        width=0.8,
        alpha=0.6,
        label='Daily Commits',
        color='green',
    )
    commits_ax.set_xlabel('Date')
    commits_ax.set_ylabel('Number of Commits', color='green')
    commits_ax.tick_params(axis='y', labelcolor='green')
    commits_ax.grid(True)

    # Second axis (right): mean commit size per day.
    size_ax = commits_ax.twinx()
    size_ax.plot(
        mean_changes.index,
        mean_changes.values,
        marker='o',
        linestyle='-',
        color='blue',
        label='Average Commit Size',
    )
    size_ax.set_ylabel('Average Commit Size (lines)', color='blue')
    if log_scale:
        size_ax.set_yscale('log')
    size_ax.tick_params(axis='y', labelcolor='blue')

    # Third axis: bug issues per day; its spine is pushed outward so it does
    # not sit on top of the second axis.
    bugs_ax = commits_ax.twinx()
    bugs_ax.spines['right'].set_position(('outward', 60))
    bugs_ax.plot(bug_counts.index, bug_counts.values, 'r-', marker='x', label='Bug Issues')
    bugs_ax.set_ylabel('Bug Issues', color='red')
    bugs_ax.tick_params(axis='y', labelcolor='red')
    # Bug counts are whole numbers, so restrict the ticks to integers.
    bugs_ax.yaxis.get_major_locator().set_params(integer=True)

    # Title and one legend for the whole figure.
    fig.suptitle('Commit Frequency, Average Commit Size, and Bug Issues Over Time')
    fig.tight_layout()
    fig.legend(loc='upper left', bbox_to_anchor=(0.1, 1))
    plt.show()

# Define the time interval for the spike.
# end_date is 2019-08-31 00:00:00, so with the inclusive comparisons used below
# anything created later on Aug 31 falls outside the window.
# NOTE(review): confirm whether the whole of Aug 31 should be included.
start_date = datetime(2019, 7, 16)
end_date = datetime(2019, 8, 31)

# Filter commits and bug issues.
# filtered_commit_info repeats the same call made earlier in this cell.
filtered_commit_info = filter_initial_outliers(commit_info)
bug_issues = [issue for issue in issues_data if 'bug' in issue['labels']]
filtered_bug_issues = [issue for issue in bug_issues if start_date <= datetime.strptime(issue['created_at'], "%Y-%m-%dT%H:%M:%S") <= end_date]

# Plot commit metrics with bug issues: first with a linear commit-size axis,
# then with log_scale=True (passed positionally as the last argument).
plot_commit_metrics_with_bugs(filtered_commit_info, filtered_bug_issues, start_date, end_date)
plot_commit_metrics_with_bugs(filtered_commit_info, filtered_bug_issues, start_date, end_date, True)

In [None]:
# # Define label groups and merge 'enhancement' and 'feature request'
# label_groups = {'Requests': ['enhancement', 'feature request', 'bug', 'documentation', 'question', 'help wanted']}

# # Extract relevant data
# def extract_label_data(data, label_groups):
#     label_data = {group: [] for group in label_groups.keys()}
    
#     for entry in data:
#         created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
#         labels = entry['labels']
#         for label in labels:
#             for group, labels_list in label_groups.items():
#                 if label in labels_list:
#                     # Merge 'enhancement' and 'feature request' labels
#                     if label in ['enhancement', 'feature request']:
#                         label = 'enhancement'
#                     if label in ['documentation', 'question', 'help wanted']:
#                         label = 'support/documentation'
#                     label_data[group].append((created_at, label))
    
#     return label_data

# def plot_label_group(data, label_group, group_name, months_per_label=3):
#     df = pd.DataFrame(data, columns=['created_at', 'label'])
    
#     # Convert dates to datetime format
#     df['created_at'] = pd.to_datetime(df['created_at'])
    
#     # Resample data by the specified number of months
#     resample_rule = f'{months_per_label}M'
#     df.set_index('created_at', inplace=True)
#     monthly_data = df.groupby([pd.Grouper(freq=resample_rule), 'label']).size().unstack(fill_value=0)
    
#     # Plotting the data
#     monthly_data.plot(figsize=(18, 8), marker='o')

#     # Formatting the plot
#     plt.title(f'{group_name} Over Time')
#     plt.xlabel('Time')
#     plt.ylabel('Count')
#     plt.grid(True)
#     plt.legend(title=group_name)

#     # Improve the date formatting
#     plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
#     plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
#     plt.gcf().autofmt_xdate()  # Rotate date labels

#     # Display the plot
#     plt.show()

# # Extract label data
# label_data = extract_label_data(issues_data, label_groups)

# # Extract commit data
# def extract_commit_data_from_prs(pull_requests_data):
#     commit_info = []
#     for pr in pull_requests_data:
#         for commit in pr['commits']:
#             commit_date = datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S")
#             # Calculate the changes as the sum of additions and deletions
#             changes = pr['additions'] + pr['deletions']
#             commit_info.append((commit_date, changes))
#     return commit_info

# commit_info = extract_commit_data_from_prs(pull_requests_data)

# # Filter commits by date and size to remove initial outliers
# def filter_initial_outliers(commit_info, threshold=0.01):
#     df_commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
#     initial_period_end = df_commits['commit_date'].min() + pd.Timedelta(days=30)
#     initial_commits = df_commits[df_commits['commit_date'] <= initial_period_end]
#     filtered_commits = df_commits[df_commits['commit_date'] > initial_period_end]

#     if not initial_commits.empty:
#         upper_limit = initial_commits['changes'].quantile(1 - threshold)
#         filtered_initial_commits = initial_commits[initial_commits['changes'] <= upper_limit]
#         return pd.concat([filtered_initial_commits, filtered_commits])
#     return df_commits

# filtered_commit_info = filter_initial_outliers(commit_info)

# def plot_commit_metrics_with_bugs(commit_info, bug_issues, months_per_label=3, log_scale=False):
#     # Convert commit info to DataFrame
#     df_commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
#     df_commits['commit_date'] = pd.to_datetime(df_commits['commit_date'])
    
#     # Group by the specified number of months
#     resample_rule = f'{months_per_label}M'
#     monthly_commits = df_commits.groupby([pd.Grouper(freq=resample_rule, key='commit_date')]).size()
#     monthly_changes = df_commits.groupby([pd.Grouper(freq=resample_rule, key='commit_date')])['changes'].mean()
    
#     # Convert bug issues to DataFrame
#     df_bugs = pd.DataFrame(bug_issues, columns=['created_at', 'label'])
#     df_bugs['created_at'] = pd.to_datetime(df_bugs['created_at'])
    
#     # Group bug issues by the specified number of months
#     monthly_bugs = df_bugs.groupby([pd.Grouper(freq=resample_rule, key='created_at')]).size()
    
#     fig, ax1 = plt.subplots(figsize=(18, 8))

#     # Bar plot for monthly commits
#     ax1.bar(monthly_commits.index, monthly_commits.values, width=20, alpha=0.6, label='Monthly Commits', color='green')
#     ax1.set_xlabel('Date')
#     ax1.set_ylabel('Number of Commits', color='green')
#     ax1.tick_params(axis='y', labelcolor='green')
#     ax1.grid(True)

#     # Secondary axis for average commit size
#     ax2 = ax1.twinx()
#     ax2.plot(monthly_changes.index, monthly_changes.values, marker='o', linestyle='-', color='blue', label='Average Commit Size')
#     ax2.set_ylabel('Average Commit Size (lines)', color='blue')
#     if log_scale: ax2.set_yscale('log')
#     ax2.tick_params(axis='y', labelcolor='blue')

#     # Third axis for bug issues
#     ax3 = ax1.twinx()
#     ax3.spines['right'].set_position(('outward', 60))
#     ax3.plot(monthly_bugs.index, monthly_bugs.values, 'r-', marker='x', label='Bug Issues')
#     ax3.set_ylabel('Bug Issues', color='red')
#     ax3.tick_params(axis='y', labelcolor='red')
#     # Set the y-scale for bug issues to display integers only
#     ax3.yaxis.get_major_locator().set_params(integer=True)

#     # Add title and legend
#     fig.suptitle('Commit Frequency, Average Commit Size, and Bug Issues Over Time')
#     fig.tight_layout()
#     fig.legend(loc='upper left', bbox_to_anchor=(0.1, 1))
#     plt.show()

# # Extract the whole time period
# start_date = datetime.strptime(issues_data[0]['created_at'], "%Y-%m-%dT%H:%M:%S")
# end_date = datetime.strptime(issues_data[-1]['created_at'], "%Y-%m-%dT%H:%M:%S")

# # Filter bug issues
# bug_issues = [issue for issue in issues_data if 'bug' in issue['labels']]

# # Plot commit metrics with bug issues for the whole period, grouped by 3 months (can be changed as needed)
# plot_commit_metrics_with_bugs(filtered_commit_info, bug_issues, months_per_label=3)
# plot_commit_metrics_with_bugs(filtered_commit_info, bug_issues, months_per_label=3, log_scale=True)

In [None]:
# # Define label groups and merge 'enhancement' and 'feature request'
# label_groups = {'Requests': ['enhancement', 'feature request', 'bug', 'documentation', 'question', 'help wanted']}

# # Extract relevant data
# def extract_label_data(data, label_groups):
#     label_data = {group: [] for group in label_groups.keys()}
    
#     for entry in data:
#         created_at = datetime.strptime(entry['created_at'], "%Y-%m-%dT%H:%M:%S")
#         labels = entry['labels']
#         for label in labels:
#             for group, labels_list in label_groups.items():
#                 if label in labels_list:
#                     # Merge 'enhancement' and 'feature request' labels
#                     if label in ['enhancement', 'feature request']:
#                         label = 'enhancement'
#                     if label in ['documentation', 'question', 'help wanted']:
#                         label = 'support/documentation'
#                     label_data[group].append((created_at, label))
    
#     return label_data

# def plot_label_group(data, label_group, group_name, months_per_label=3):
#     df = pd.DataFrame(data, columns=['created_at', 'label'])
    
#     # Convert dates to datetime format
#     df['created_at'] = pd.to_datetime(df['created_at'])
    
#     # Resample data by month
#     df.set_index('created_at', inplace=True)
#     monthly_data = df.groupby([pd.Grouper(freq='M'), 'label']).size().unstack(fill_value=0)
    
#     # Plotting the data
#     monthly_data.plot(figsize=(18, 8), marker='o')

#     # Formatting the plot
#     plt.title(f'{group_name} Over Time')
#     plt.xlabel('Time')
#     plt.ylabel('Count')
#     plt.grid(True)
#     plt.legend(title=group_name)

#     # Improve the date formatting
#     plt.gca().xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
#     plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
#     plt.gcf().autofmt_xdate()  # Rotate date labels

#     # Display the plot
#     plt.show()

# # Extract label data
# label_data = extract_label_data(issues_data, label_groups)

# # Extract commit data
# def extract_commit_data_from_prs(pull_requests_data):
#     commit_info = []
#     for pr in pull_requests_data:
#         for commit in pr['commits']:
#             commit_date = datetime.strptime(commit['date'], "%Y-%m-%dT%H:%M:%S")
#             # Calculate the changes as the sum of additions and deletions
#             changes = pr['additions'] + pr['deletions']
#             commit_info.append((commit_date, changes))
#     return commit_info

# commit_info = extract_commit_data_from_prs(pull_requests_data)

# # Filter initial outliers based on the condition provided
# def filter_initial_outliers(commit_info):
#     df_commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
#     mean_commit_size = df_commits['changes'].mean()
    
#     # Find the index of the first commit that is smaller than the mean of subsequent commits
#     for idx, changes in enumerate(df_commits['changes']):
#         if changes < mean_commit_size:
#             return df_commits.iloc[idx:]
#     return df_commits

# filtered_commit_info = filter_initial_outliers(commit_info)

# def plot_commit_metrics_with_bugs(commit_info, bug_issues, months_per_label=3, log_scale=False):
#     # Convert commit info to DataFrame
#     df_commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
#     df_commits['commit_date'] = pd.to_datetime(df_commits['commit_date'])

#     # Resample data by month
#     df_commits.set_index('commit_date', inplace=True)
#     df_commits_monthly = df_commits.resample('M').agg({'changes': ['mean', 'count']})
#     df_commits_monthly.columns = ['avg_size', 'monthly_commits']
    
#     # Fill NaN values with 0
#     df_commits_monthly.fillna(0, inplace=True)

#     # Smooth data with rolling average
#     df_commits_monthly['monthly_commits'] = df_commits_monthly['monthly_commits'].rolling(window=3, min_periods=1).mean()
#     df_commits_monthly['avg_size'] = df_commits_monthly['avg_size'].rolling(window=3, min_periods=1).mean()
    
#     # Convert bug issues to DataFrame and resample by month
#     df_bugs = pd.DataFrame(bug_issues, columns=['created_at', 'label'])
#     df_bugs['created_at'] = pd.to_datetime(df_bugs['created_at'])
#     df_bugs.set_index('created_at', inplace=True)
#     df_bugs_monthly = df_bugs.resample('M').size()
    
#     fig, ax1 = plt.subplots(figsize=(18, 8))

#     # Bar plot for monthly commits
#     ax1.bar(df_commits_monthly.index, df_commits_monthly['monthly_commits'], width=20, color='green', alpha=0.6, label='Monthly Commits')
#     ax1.set_xlabel('Date')
#     ax1.set_ylabel('Number of Commits', color='green')
#     ax1.tick_params(axis='y', labelcolor='green')
#     ax1.grid(True)

#     # Secondary axis for average commit size
#     ax2 = ax1.twinx()
#     ax2.plot(df_commits_monthly.index, df_commits_monthly['avg_size'], marker='o', linestyle='-', color='blue', label='Average Commit Size')
#     ax2.set_ylabel('Average Commit Size (lines)', color='blue')
#     if log_scale: ax2.set_yscale('log')
#     ax2.tick_params(axis='y', labelcolor='blue')

#     # Third axis for bug issues
#     ax3 = ax1.twinx()
#     ax3.spines['right'].set_position(('outward', 60))
#     ax3.plot(df_bugs_monthly.index, df_bugs_monthly.values, 'r-', marker='x', label='Bug Issues')
#     ax3.set_ylabel('Bug Issues', color='red')
#     ax3.tick_params(axis='y', labelcolor='red')
#     # Set the y-scale for bug issues to display integers only
#     ax3.yaxis.get_major_locator().set_params(integer=True)

#     # Add title and legend
#     fig.suptitle('Commit Frequency, Average Commit Size, and Bug Issues Over Time')
#     fig.tight_layout()
#     fig.legend(loc='upper left', bbox_to_anchor=(0.1, 1))
#     plt.show()

# # Define the time interval for the spike
# start_date = datetime(2019, 7, 16)
# end_date = datetime(2019, 8, 31)

# # Filter commits and bug issues
# filtered_commit_info = filter_initial_outliers(commit_info)
# bug_issues = [issue for issue in issues_data if 'bug' in issue['labels']]

# # Plot commit metrics with bug issues
# plot_commit_metrics_with_bugs(filtered_commit_info, bug_issues, months_per_label=3)
# plot_commit_metrics_with_bugs(filtered_commit_info, bug_issues, months_per_label=3, log_scale=True)

In [None]:
# Number of earliest commits (by commit_date) to drop before plotting; passed
# to filter_initial_commits in the next cell.
number_of_commits_to_filter_out = 5

In [None]:
# Extract relevant data
def extract_commit_data_from_prs(pull_requests_data):
    """Return one ``(commit_date, changes)`` tuple per commit of every pull request.

    Args:
        pull_requests_data: Iterable of pull-request dicts. Each one is read
            through its ``'commits'`` (iterable of dicts with a ``'date'``
            string), ``'additions'`` and ``'deletions'`` entries.

    Returns:
        A list of ``(datetime, changes)`` tuples in input order.

    Raises:
        ValueError: If a commit date does not match ``%Y-%m-%dT%H:%M:%S``.
        KeyError: If one of the keys listed above is missing.

    Note:
        ``changes`` is the pull request's additions + deletions, so every
        commit of the same pull request carries the same PR-level total.
    """
    timestamp_format = "%Y-%m-%dT%H:%M:%S"
    return [
        (
            datetime.strptime(entry['date'], timestamp_format),
            pull_request['additions'] + pull_request['deletions'],
        )
        for pull_request in pull_requests_data
        for entry in pull_request['commits']
    ]

# Flatten the commits of all pull requests into a list of (commit_date, changes) tuples
commit_info = extract_commit_data_from_prs(pull_requests_data)

# Filter out the first number_of_commits_to_filter_out commits
def filter_initial_commits(commit_info, n):
    """Sort the commits by date and drop the ``n`` earliest ones.

    Args:
        commit_info: Data accepted by ``pd.DataFrame`` that provides the
            columns ``commit_date`` and ``changes``.
        n: Number of leading (oldest) rows to leave out.

    Returns:
        A DataFrame with the columns ``commit_date`` and ``changes``, ordered
        by ``commit_date`` and keeping the original row labels.
    """
    ordered = pd.DataFrame(
        commit_info, columns=['commit_date', 'changes']
    ).sort_values(by='commit_date')
    # Positional slice: everything after the first n rows of the sorted frame
    return ordered.iloc[n:]

# Result is a DataFrame (sorted by commit_date) without the earliest
# `number_of_commits_to_filter_out` commits, not a list of tuples
filtered_commit_info = filter_initial_commits(commit_info, number_of_commits_to_filter_out)

def plot_commit_metrics_with_bugs(commit_info, bug_issues, months_per_label=3, log_scale=False):
    """Plot monthly commit count, mean commit size and bug-issue count on one figure.

    Commits per month are drawn as bars on the left y-axis. The monthly mean of
    ``changes`` and the number of bug issues per month are drawn as lines on
    two additional right-hand y-axes that share the same date x-axis.

    Args:
        commit_info: Data accepted by ``pd.DataFrame`` that provides the
            columns ``commit_date`` and ``changes`` (e.g. a list of
            ``(commit_date, changes)`` tuples or a DataFrame with those columns).
        bug_issues: Issue records accepted by ``pd.DataFrame``; only their
            ``created_at`` value is used, to count issues per month.
        months_per_label: Number of months between two x-axis tick labels.
            A falsy value (``0`` or ``None``) keeps Matplotlib's automatic ticks.
        log_scale: If true, the average-commit-size axis uses a log scale.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Convert commit info to DataFrame
    df_commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
    df_commits['commit_date'] = pd.to_datetime(df_commits['commit_date'])

    # Resample data by month ('M' = month-end bins): mean size and number of commits
    df_commits.set_index('commit_date', inplace=True)
    df_commits_monthly = df_commits.resample('M').agg({'changes': ['mean', 'count']})
    df_commits_monthly.columns = ['avg_size', 'monthly_commits']

    # Months without any commit have a NaN mean; show them as 0
    df_commits_monthly.fillna(0, inplace=True)

    # Group bug issues by month; only the row count per month is used
    df_bugs = pd.DataFrame(bug_issues, columns=['created_at', 'label'])
    df_bugs['created_at'] = pd.to_datetime(df_bugs['created_at'])
    df_bugs.set_index('created_at', inplace=True)
    df_bugs_monthly = df_bugs.resample('M').size()

    fig, ax1 = plt.subplots(figsize=(18, 8))

    # Bar plot for monthly commits (bar width is given in days on a date axis)
    ax1.bar(df_commits_monthly.index, df_commits_monthly['monthly_commits'], width=20, alpha=0.6, label='Monthly Commits', color='green')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Commits', color='green')
    ax1.tick_params(axis='y', labelcolor='green')
    ax1.grid(True)

    # Secondary axis for average commit size
    ax2 = ax1.twinx()
    ax2.plot(df_commits_monthly.index, df_commits_monthly['avg_size'], marker='o', linestyle='-', color='blue', label='Average Commit Size')
    ax2.set_ylabel('Average Commit Size (lines)', color='blue')
    if log_scale:
        ax2.set_yscale('log')
    ax2.tick_params(axis='y', labelcolor='blue')

    # Third axis for bug issues, shifted outward so it does not overlap ax2
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.plot(df_bugs_monthly.index, df_bugs_monthly.values, 'r-', marker='x', label='Bug Issues')
    ax3.set_ylabel('Bug Issues', color='red')
    ax3.tick_params(axis='y', labelcolor='red')
    # Set the y-scale for bug issues to display integers only
    ax3.yaxis.get_major_locator().set_params(integer=True)

    # Apply the requested label spacing. Done after all plotting so that a later
    # date-unit update on one of the twin axes cannot reset the shared locator.
    if months_per_label:
        ax1.xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
        ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        ax1.tick_params(axis='x', labelrotation=45)

    # Add title and legend
    fig.suptitle('Commit Frequency, Average Commit Size, and Bug Issues Over Time')
    fig.tight_layout()
    fig.legend(loc='upper left', bbox_to_anchor=(0.1, 1))
    plt.show()

# Define bug issues: keep every issue whose 'labels' value contains 'bug'
# (presumably 'labels' is a list of label-name strings; for a plain string this
# would be a substring test -- TODO confirm against the input JSON)
bug_issues = [issue for issue in issues_data if 'bug' in issue['labels']]

# Plot commit metrics with bug issues: first with a linear, then with a
# logarithmic average-commit-size axis
plot_commit_metrics_with_bugs(filtered_commit_info, bug_issues, months_per_label=3)
plot_commit_metrics_with_bugs(filtered_commit_info, bug_issues, months_per_label=3, log_scale=True)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

# Extract relevant data
def extract_commit_data_from_prs(pull_requests_data):
    """Collect a ``(commit_date, changes)`` tuple for each commit of each pull request.

    Args:
        pull_requests_data: Iterable of pull-request dicts providing
            ``'commits'`` (iterable of dicts with a ``'date'`` string),
            ``'additions'`` and ``'deletions'``.

    Returns:
        A list of ``(datetime, changes)`` tuples in input order.

    Raises:
        ValueError: If a commit date does not match ``%Y-%m-%dT%H:%M:%S``.
        KeyError: If one of the keys listed above is missing.

    Note:
        ``changes`` is computed from the pull request (additions + deletions),
        not from the individual commit, so all commits of one pull request
        share the same value.
    """
    date_pattern = "%Y-%m-%dT%H:%M:%S"
    return [
        (
            datetime.strptime(commit_record['date'], date_pattern),
            pr_record['additions'] + pr_record['deletions'],
        )
        for pr_record in pull_requests_data
        for commit_record in pr_record['commits']
    ]

# Flatten the commits of all pull requests into a list of (commit_date, changes) tuples
commit_info = extract_commit_data_from_prs(pull_requests_data)

# Filter out the first number_of_commits_to_filter_out commits
def filter_initial_commits(commit_info, n):
    """Return the commits ordered by date with the ``n`` oldest rows removed.

    Args:
        commit_info: Data accepted by ``pd.DataFrame`` that provides the
            columns ``commit_date`` and ``changes``.
        n: Number of leading (oldest) rows to skip.

    Returns:
        A DataFrame with the columns ``commit_date`` and ``changes``, sorted by
        ``commit_date``; the original row labels are kept.
    """
    commits_by_date = pd.DataFrame(
        commit_info, columns=['commit_date', 'changes']
    ).sort_values(by='commit_date')
    # Keep everything after the first n rows of the date-ordered frame
    return commits_by_date.iloc[n:]

# Result is a DataFrame (sorted by commit_date) without the earliest
# `number_of_commits_to_filter_out` commits, not a list of tuples
filtered_commit_info = filter_initial_commits(commit_info, number_of_commits_to_filter_out)

def plot_commit_metrics_with_bugs(commit_info, bug_issues, months_per_label=3, log_scale=False, export_format=EXPORT_FORMAT):
    """Export and plot monthly commit count, mean commit size and bug-issue count.

    The monthly commit table and the monthly bug-issue table are handed to
    ``export_df`` and then drawn on one figure: commits per month as bars on
    the left y-axis, the monthly mean of ``changes`` and the bug issues per
    month as lines on two additional right-hand y-axes.

    Args:
        commit_info: Data accepted by ``pd.DataFrame`` that provides the
            columns ``commit_date`` and ``changes`` (e.g. a list of
            ``(commit_date, changes)`` tuples or a DataFrame with those columns).
        bug_issues: Issue records accepted by ``pd.DataFrame``; only their
            ``created_at`` value is used, to count issues per month.
        months_per_label: Number of months between two x-axis tick labels.
            A falsy value (``0`` or ``None``) keeps Matplotlib's automatic ticks.
        log_scale: If true, the average-commit-size axis uses a log scale and
            the PNG (when ``EXPORT_PNGS`` is set) gets the ``_log`` suffix.
        export_format: Forwarded unchanged to ``export_df``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Convert commit info to DataFrame
    df_commits = pd.DataFrame(commit_info, columns=['commit_date', 'changes'])
    df_commits['commit_date'] = pd.to_datetime(df_commits['commit_date'])

    # Resample data by month ('M' = month-end bins): mean size and number of commits
    df_commits.set_index('commit_date', inplace=True)
    df_commits_monthly = df_commits.resample('M').agg({'changes': ['mean', 'count']})
    df_commits_monthly.columns = ['avg_size', 'monthly_commits']
    # Months without any commit have a NaN mean; export and show them as 0
    df_commits_monthly.fillna(0, inplace=True)

    # Group bug issues by month; only the row count per month is used
    df_bugs = pd.DataFrame(bug_issues, columns=['created_at', 'label'])
    df_bugs['created_at'] = pd.to_datetime(df_bugs['created_at'])
    df_bugs.set_index('created_at', inplace=True)
    df_bugs_monthly = df_bugs.resample('M').size().to_frame(name='bug_issues')

    # Export the data for dashboard use (the exported tables do not depend on log_scale)
    export_df(df_commits_monthly, 'commit_metrics', export_format)
    export_df(df_bugs_monthly, 'bug_metrics', export_format)

    # Plotting the data (for visualization, if needed)
    fig, ax1 = plt.subplots(figsize=(18, 8))

    # Bar plot for monthly commits (bar width is given in days on a date axis)
    ax1.bar(df_commits_monthly.index, df_commits_monthly['monthly_commits'], width=20, alpha=0.6, label='Monthly Commits', color='green')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Number of Commits', color='green')
    ax1.tick_params(axis='y', labelcolor='green')
    ax1.grid(True)

    # Secondary axis for average commit size
    ax2 = ax1.twinx()
    ax2.plot(df_commits_monthly.index, df_commits_monthly['avg_size'], marker='o', linestyle='-', color='blue', label='Average Commit Size')
    ax2.set_ylabel('Average Commit Size (lines)', color='blue')
    if log_scale:
        ax2.set_yscale('log')
    ax2.tick_params(axis='y', labelcolor='blue')

    # Third axis for bug issues, shifted outward so it does not overlap ax2
    ax3 = ax1.twinx()
    ax3.spines['right'].set_position(('outward', 60))
    ax3.plot(df_bugs_monthly.index, df_bugs_monthly['bug_issues'], 'r-', marker='x', label='Bug Issues')
    ax3.set_ylabel('Bug Issues', color='red')
    ax3.tick_params(axis='y', labelcolor='red')
    # Show integer ticks only on the bug-issue axis
    ax3.yaxis.get_major_locator().set_params(integer=True)

    # Apply the requested label spacing. Done after all plotting so that a later
    # date-unit update on one of the twin axes cannot reset the shared locator.
    if months_per_label:
        ax1.xaxis.set_major_locator(mdates.MonthLocator(interval=months_per_label))
        ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        ax1.tick_params(axis='x', labelrotation=45)

    # Add title and legend
    fig.suptitle('Commit Frequency, Average Commit Size, and Bug Issues Over Time')
    fig.tight_layout()
    fig.legend(loc='upper left', bbox_to_anchor=(0.1, 1))

    if EXPORT_PNGS:
        # Separate file names so the linear and the log variant do not overwrite each other
        if log_scale:
            fig.savefig("commit_freq__avg_commit_size__bug_issues_log.png")
        else:
            fig.savefig("commit_freq__avg_commit_size__bug_issues.png")
    plt.show()

# Define bug issues: keep every issue whose 'labels' value contains 'bug'
# (presumably 'labels' is a list of label-name strings; for a plain string this
# would be a substring test -- TODO confirm against the input JSON)
bug_issues = [issue for issue in issues_data if 'bug' in issue['labels']]

# Plot and export commit metrics with bug issues: first with a linear, then with a
# logarithmic average-commit-size axis. Each call runs the export_df calls inside
# the function with the same tables and names, so the export happens twice.
plot_commit_metrics_with_bugs(filtered_commit_info, bug_issues, months_per_label=3, export_format=EXPORT_FORMAT)
plot_commit_metrics_with_bugs(filtered_commit_info, bug_issues, months_per_label=3, log_scale=True, export_format=EXPORT_FORMAT)