# Comprehensive aggregation of GitHub activities

-----------------------------------------------------------------

This example shows how to aggregate issue, pull-request and workflow activities as well as the complete version history.

In [5]:
import os
import threading
import time
from pathlib import Path

from github2pandas.aggregation.issues import AggIssues
from github2pandas.aggregation.pull_requests import AggPullRequest
from github2pandas.aggregation.utility import Utility
from github2pandas.aggregation.version import AggVersion
from github2pandas.aggregation.workflows import AggWorkflow

## Basic Usage

The most important input parameter is a `Repository` object from the PyGithub package.

In [2]:
# Repository to analyse, identified by its owner and name.
git_repo_name = "Extract_Git_Activities"
git_repo_owner = "TUBAF-IFI-DiPiT"

# Folder handed to every aggregation call below as its data root directory.
default_data_folder = Path("data", git_repo_name)

# The GitHub token is read from the TOKEN environment variable so that it does
# not have to be written into the notebook. Fail early with an actionable
# message instead of a bare KeyError('TOKEN') when the variable is missing.
if "TOKEN" not in os.environ:
    raise KeyError(
        "TOKEN: set the environment variable TOKEN to your GitHub access token "
        "before running this notebook"
    )
github_token = os.environ["TOKEN"]
# If the token is not available in your environment, it is necessary to provide
# it here instead (never commit a real token):
# github_token = "yourToken"

repo = Utility.get_repo(git_repo_owner, git_repo_name, github_token)

## Aggregation

In [7]:
def run_timed(label, *steps):
    """Print ``label``, call every step in order and print the elapsed seconds.

    Parameters
    ----------
    label : str
        Heading printed before the steps run.
    *steps : callable
        Zero-argument callables executed one after another.

    An exception raised by a step propagates unchanged; no time is printed
    in that case.
    """
    print(label)
    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments while a long aggregation is running.
    started = time.perf_counter()
    for step in steps:
        step()
    print("Time: %.3fs" % (time.perf_counter() - started))


run_timed(
    "Issues",
    lambda: AggIssues.generate_issue_pandas_tables(repo, default_data_folder),
)

run_timed(
    "Pull Requests",
    lambda: AggPullRequest.generate_pull_request_pandas_tables(repo, default_data_folder),
)

# The version history needs two calls; both are timed together.
run_timed(
    "Version",
    lambda: AggVersion.clone_repository(repo=repo, data_root_dir=default_data_folder, github_token=github_token),
    lambda: AggVersion.generate_version_pandas_tables(data_root_dir=default_data_folder),
)

run_timed(
    "Workflows",
    lambda: AggWorkflow.generate_workflow_pandas_tables(repo=repo, data_root_dir=default_data_folder),
)

Issues
Time: 37.821s
Pull Requests
Time: 71.261s
Version
Serial:   1%|          | 1/117 [00:00<00:11,  9.94it/s]   bug-fix/tests 
   documentation 
   feat/commit 
   notebook 
   workflows 
--------------------------------------
1
--------------------------------------
Found no database on provided path. Starting from scratch.
Serial: 100%|██████████| 117/117 [00:37<00:00,  3.11it/s]
Time: 38.353s
Workflows
Time: 20.407s


## Aggregation with multiple threads

In [8]:
# Take the start timestamp before the first worker thread is launched; the
# total duration is reported once all threads have been joined.
start_time = time.time()

# Issues are aggregated in their own thread.
print("Issues")
issues = threading.Thread(
    target=AggIssues.generate_issue_pandas_tables,
    args=(repo, default_data_folder),
)
issues.start()

# Pull requests are aggregated in a second thread.
print("Pull Requests")
pull_request = threading.Thread(
    target=AggPullRequest.generate_pull_request_pandas_tables,
    args=(repo, default_data_folder),
)
pull_request.start()

print("Version")
def agg_version(repo, data_root_dir, github_token):
    """Clone the repository and generate the version tables in one call.

    Bundles the two ``AggVersion`` steps so they can serve as a single
    thread target.

    Parameters
    ----------
    repo
        Repository object forwarded to ``AggVersion.clone_repository``.
    data_root_dir
        Data root directory forwarded to both ``AggVersion`` calls.
    github_token
        GitHub token forwarded to ``AggVersion.clone_repository``.
    """
    # Use the data_root_dir argument, not the notebook-global
    # default_data_folder, so the caller's directory is actually honoured.
    AggVersion.clone_repository(repo=repo, data_root_dir=data_root_dir, github_token=github_token)
    AggVersion.generate_version_pandas_tables(data_root_dir=data_root_dir)
# The version history (clone + table generation) runs in a third thread.
version = threading.Thread(
    target=agg_version,
    args=(repo, default_data_folder, github_token),
)
version.start()

# Workflows are aggregated in a fourth thread.
print("Workflows")
workflow = threading.Thread(
    target=AggWorkflow.generate_workflow_pandas_tables,
    args=(repo, default_data_folder),
)
workflow.start()

# Wait for every worker, in the order the threads were started.
for worker in (issues, pull_request, version, workflow):
    worker.join()

# Total elapsed time across all four aggregations.
elapsed = time.time() - start_time
print("\nTime: %.3fs" % elapsed)

Issues
Pull Requests
Version
Workflows
Serial:   0%|          | 0/117 [00:00<?, ?it/s]   bug-fix/tests 
   documentation 
   feat/commit 
   notebook 
   workflows 
--------------------------------------
1
--------------------------------------
Found no database on provided path. Starting from scratch.
Serial: 100%|██████████| 117/117 [00:40<00:00,  2.92it/s]

Time: 70.730s
