In [24]:
from github2pandas.github2pandas import GitHub2Pandas
from github2pandas.workflows import Workflows
from github2pandas.version import Version
from pathlib import Path
import os
import logging
import pandas as pd

# Evaluate Workflow Data

-----------------------------------------------------------------
This example illustrates the aggregation of workflow (actions) data using the `github2pandas` repository. Here, all workflows are read out, filtered and displayed in terms of success or failure. 

In [25]:
git_repo_name = "github2pandas"
git_repo_owner = "TUBAF-IFI-DiPiT"

data_root_dir = Path("data")
repo_data_dir = Path(data_root_dir, git_repo_owner, git_repo_name)

In [26]:
workflows_path = Path.joinpath(repo_data_dir, Workflows.Files.DATA_DIR)
df_workflows = GitHub2Pandas.get_pandas_data_frame(workflows_path, Workflows.Files.WORKFLOWS)
df_runs = GitHub2Pandas.get_pandas_data_frame(workflows_path, Workflows.Files.RUNS)

In [27]:
df_workflows

Unnamed: 0,id,name,created_at,updated_at,state
0,6245620,RunTests,2021-02-28 17:31:08,2021-03-08 08:26:16,active


In [28]:
df_runs.head(5)

Unnamed: 0,workflow_id,id,commit_sha,pull_requests,state,event,conclusion,created_at,updated_at
0,6245620,1915206052,d414e0863345d65678b880272b158b9b9c6910d2,[],completed,push,success,2022-03-01 09:07:37,2022-03-01 09:15:29
1,6245620,1914808609,9dfdae56c778940adb39f8da5db7a3b2c88022d9,[],completed,push,success,2022-03-01 07:33:02,2022-03-01 07:39:42
2,6245620,1914733676,bdcc237df4230002948ae53d60e5722b0820cc7e,[],completed,push,failure,2022-03-01 07:13:30,2022-03-01 07:16:33
3,6245620,1914639513,92cb2fc7083fe23b6985ae8bd085ca370dc42334,[],completed,push,success,2022-03-01 06:48:16,2022-03-01 06:54:58
4,6245620,1910368483,e4f86f73e077bb27200a549b598ab29b227d9d16,[],completed,push,failure,2022-02-28 13:03:01,2022-02-28 13:11:27


## Examples
### Question 1: Do we generate more failures while working on weekend?

In [29]:
df_runs['day_of_week'] = df_runs['created_at'].dt.day_name()

results = df_runs.groupby(['day_of_week', 'conclusion']).commit_sha.count().unstack()
results['ratio'] = results['failure'] / (results['failure'] + results['success'])

results.sort_values(by=['ratio'])

conclusion,failure,success,ratio
day_of_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Friday,7.0,17.0,0.291667
Thursday,11.0,19.0,0.366667
Monday,18.0,26.0,0.409091
Tuesday,14.0,12.0,0.538462
Saturday,3.0,2.0,0.6
Wednesday,14.0,8.0,0.636364
Sunday,,10.0,


_Result: Best results on Fridays and no failures on Sundays! Impressive!_

### Question 2: What happens during the workflow run?

Let's take a closer view on log files of a specific failed Action run.

In [55]:
workflow_run_id = df_runs.iloc[2].id
df_runs.iloc[2]

workflow_id                                       6245620
id                                             1914733676
commit_sha       bdcc237df4230002948ae53d60e5722b0820cc7e
pull_requests                                          []
state                                           completed
event                                                push
conclusion                                        failure
created_at                            2022-03-01 07:13:30
updated_at                            2022-03-01 07:16:33
day_of_week                                       Tuesday
Name: 2, dtype: object

In [56]:
github_token = os.environ['GITHUB_API_TOKEN']
log_level = logging.DEBUG
github2pandas = GitHub2Pandas(github_token, data_root_dir, log_level=log_level)
repo = github2pandas.get_repo(git_repo_owner, git_repo_name)

In [57]:
Workflows.download_workflow_log_files(repo=repo,
                                  github_token=github_token,
                                  workflow_run_id=workflow_run_id,
                                  data_root_dir=repo_data_dir)

11

The workflow logs are stored in the data folder of the project now.

In [58]:
!ls ./data/TUBAF-IFI-DiPiT/github2pandas/Workflows/1914733676/extractData

'12_Post Run actionscheckout@v2.txt'  '3_Set up Python 3.8.txt'
'13_Complete job.txt'		      '4_Prepare python environment.txt'
'1_Set up job.txt'		      '5_Install dependencies.txt'
'2_Run actionscheckout@v2.txt'	      '6_Run tests.txt'


In [59]:
!tail -n 10 ./data/TUBAF-IFI-DiPiT/github2pandas/Workflows/1914733676/extractData/"6_Run tests.txt"

2022-03-01T07:16:28.4184099Z     for commit in tqdm(commits, desc='Serial'):
2022-03-01T07:16:28.4184476Z   File "/home/runner/work/github2pandas/github2pandas/src/github2pandas/version.py", line 306, in __init__
2022-03-01T07:16:28.4184819Z     return core.progress_bar(iterable, f"Version {desc}:")
2022-03-01T07:16:28.4185137Z NameError: name 'core' is not defined
2022-03-01T07:16:28.4185277Z 
2022-03-01T07:16:28.4185497Z ----------------------------------------------------------------------
2022-03-01T07:16:28.4185750Z Ran 19 tests in 82.935s
2022-03-01T07:16:28.4185875Z 
2022-03-01T07:16:28.4185967Z FAILED (errors=2, skipped=1)
2022-03-01T07:16:28.6086981Z ##[error]Process completed with exit code 1.


### Questions 3: Check who prepared the workflows

For this request we have to merge Version data with Workflow information. 

1. Prepare commit, edits and workflow dataframes
2. Extract commits adressing workflow-folder `.github/workflow/` from edits
3. Identify authors integrating workflows

In [11]:
versions_path = Path.joinpath(repo_data_dir, Version.Files.DATA_DIR)
df_edits = GitHub2Pandas.get_pandas_data_frame(versions_path, Version.Files.EDITS)
df_commits = GitHub2Pandas.get_pandas_data_frame(versions_path, Version.Files.COMMITS)

In [12]:
relevant_commits = df_edits[df_edits["new_path"].str.contains(".github/workflows/", na=False)][['commit_sha', 'filename']]
relevant_commits.drop_duplicates(inplace = True)
relevant_commits

Unnamed: 0,commit_sha,filename
0,540e6691a42db08b1dc94948b418fa1ea1b7380d,python_package.yml
160,cf2d4124846de842f24ae1deae56a07ab7ab007b,python_package.yml
359,daa70ee1d1cedc8d51e06bdf88ed655bbedc75bf,python_package.yml
371,1bda3b56508bc47073def8d58ed41163fd0b9dcd,python_package.yml
1054,3c43e5af6be062bada6a5e66b4f9503a7aa8a369,python_package.yml
1318,d69b074d24e4419534867ce75578952f1a8ab5d3,python_package.yml
1321,cc01c8a7ec6d3bb3c647acfb627d7a576cac9e77,python_package.yml
3709,b4fdab74ec4504b6caa7c702b86f1d790a1de17a,python_package.yml
6301,0d1e5ad84f4737bb716aa0b718f9b8a8fd1fabb3,python_package.yml
33504,2143785e1b7e2e1162f6406acb4706b00ef787b4,python_package.yml


In [13]:
pd.merge(relevant_commits, 
         df_commits[['author', 'commit_message', 'commit_sha', 'commited_at']],
         how="left", left_on = "commit_sha", right_on = "commit_sha")\
         [['author', 'commit_message', 'commited_at']]

Unnamed: 0,author,commit_message,commited_at
0,take-certain-word-end,Create python_package.yml,2021-02-28 17:31:08
1,take-certain-word-end,Add automated testing,2021-02-28 17:37:26
2,take-certain-word-end,Add github token variable to tests,2021-02-28 17:51:27
3,take-certain-word-end,Exclude private repositories from tests,2021-02-28 18:12:38
4,take-certain-word-end,Integrate commit processing,2021-02-28 20:00:33
5,take-certain-word-end,Update python_package.yml,2021-03-02 08:29:34
6,take-certain-word-end,Update python_package.yml,2021-03-02 08:31:39
7,take-certain-word-end,Update python_package.yml\n\nReplace old test ...,2021-03-08 08:26:15
8,love-go-past-name,edit tests\n\nadd bug-fix/tests to auto test run,2021-03-15 10:31:00
9,love-go-past-name,add v2.0.0 to workflow,2022-01-25 07:42:49
