In [25]:
#from pathlib import Path
import pandas as pd

import panel as pn
pn.extension('tabulator')
import hvplot.pandas


In [26]:
## Local data storage
# Folder and file name of the CSV inside a local checkout.
folder_name = "data_1.0/"
file_name = "repository_data_final.csv"
# Local loading is disabled; enabling it also requires the pathlib import.
# data_file = Path.cwd() / ".." / folder_name / file_name
# df = pd.read_csv(data_file)

In [27]:
# Web-based data storage: the CSV is read directly from the repository's
# main branch. Other, currently disabled, locations are kept for reference:
#   https://sebastianzug.github.io/TUBAF-IFI-DiPiT/Company_data_set/repository_data_final.csv?raw=true
#   https://github.com/TUBAF-IFI-DiPiT/Company_data_set/blob/main/repository_data_final.csv?raw=true
#   https://raw.githubusercontent.com/TUBAF-IFI-DiPiT/Company_data_set/gh_pages/repository_data_final.csv
DATA_URL = "https://raw.githubusercontent.com/TUBAF-IFI-DiPiT/Company_data_set/main/repository_data_final.csv"

df = pd.read_csv(DATA_URL)


  df = pd.read_csv("https://raw.githubusercontent.com/TUBAF-IFI-DiPiT/Company_data_set/main/repository_data_final.csv")


In [28]:
# Inclusive upper bounds for a repository to count as "relevant".
max_contributers = 10
max_duration = 10

# Work on a copy so the loaded frame `df` stays untouched.
df_summary = df.copy()

# Values above a bound are collapsed into a single overflow value (bound + 1).
df_summary['weekly_cc_max'] = df_summary['weekly_cc_max'].mask(
    df_summary['weekly_cc_max'] > max_contributers, max_contributers + 1
)
df_summary['effective_weeks'] = df_summary['effective_weeks'].mask(
    df_summary['effective_weeks'] > max_duration, max_duration + 1
)

# A repository is relevant when both values are within their bounds and
# `commit_greater1_exist` is True.
within_contributor_bound = df_summary['weekly_cc_max'] <= max_contributers
within_duration_bound = df_summary['effective_weeks'] <= max_duration
has_commit_flag = df_summary['commit_greater1_exist'] == True
df_summary['relevant'] = within_contributor_bound & within_duration_bound & has_commit_flag

# Keep only the relevant repositories.
df_summary = df_summary[df_summary['relevant']]

## Preparation

In [29]:
# Keep the filtered frame in Panel's state cache under the key 'data'.
if 'data' in pn.state.cache:
    # Already cached: continue with the stored frame.
    df_summary = pn.state.cache['data']
else:
    # First run: store a copy of the freshly filtered frame.
    pn.state.cache['data'] = df_summary.copy()

# Interactive wrapper around the frame; the pipelines below are built from it.
idf = df_summary.interactive()

## Part 1: Diagram

In [30]:
# Selectable organizations: the distinct, non-missing organization names that
# remain after filtering. Missing names are dropped because an equality
# filter on NaN can never match any row.
param = df_summary.organization_name.dropna().unique().tolist()

# Prefer "Microsoft" as the initial selection, but only if it is really one of
# the options; otherwise fall back to the first available organization (or to
# no selection at all when the filtered data set is empty).
preferred_org = "Microsoft"
if preferred_org in param:
    initial_org = preferred_org
elif param:
    initial_org = param[0]
else:
    initial_org = None

# Drop-down widget that drives every per-organization pipeline below.
select_org = pn.widgets.Select(
    name='organization_name',
    value=initial_org,
    options=param
)

In [31]:
# Rows belonging to the organization currently chosen in `select_org`.
selected_org_rows = idf[idf.organization_name == select_org]

# Grid of repository counts: one row per weekly_cc_max value, one column per
# effective_weeks value.
repo_count_grid = selected_org_rows.pivot_table(
    values='repo_name',
    index='weekly_cc_max',
    columns='effective_weeks',
    aggfunc='count',
)

# Flatten the grid into long format and name the value column "count".
data_pipeline_selectcomp_basic = (
    repo_count_grid
    .unstack()
    .reset_index()
    .rename(columns={0: "count"})
)

data_pipeline_selectcomp_basic.head()

BokehModel(combine_events=True, render_bundle={'docs_json': {'be9b40c7-4d61-4f06-9cb3-c01295bbdce2': {'version…

In [32]:
# Heat map of the repository counts per (weekly_cc_max, effective_weeks)
# combination produced by the pipeline above. Both axes run from 0 to the
# respective bound + 1.
#
# The title is built from max_contributers / max_duration instead of a
# hard-coded "10", and uses "<=" because the relevance filter is inclusive;
# the previous text ("< 10") did not match the filter.
heatmap_title = (
    'Number of relevant repositories '
    f'(effective contributors <= {max_contributers} and '
    f'effective duration <= {max_duration} weeks) in data set'
)

heatmap = data_pipeline_selectcomp_basic.hvplot.heatmap(
    x='weekly_cc_max',
    y='effective_weeks',
    C='count',
    xlim=(0, max_contributers + 1),
    ylim=(0, max_duration + 1),
    title=heatmap_title,
)

heatmap

BokehModel(combine_events=True, render_bundle={'docs_json': {'068e2f14-6caf-4dde-a40d-d73e0065c5ca': {'version…

## Part 2: Parameter diagram

In [33]:
# Names of the repository parameters (columns of the data set), one per line.
param = [
    'stars',
    'size_kB',
    'contributor_count',
    'branch_count',
    'commit_count',
    'commit_comment_count',
    'last_commit_date',
    'labels_count',
    'tag_count',
    'milestone_count',
    'pullrequest_count',
    'pullrequest_review_count',
    'release_count',
    'workflow_count',
    'readme_length',
    'issues_count',
    'issues_comment_count',
    'watchers_count',
    'project_duration_days',
    'project_duration_weeks',
    'creation_date_year',
    'subscribers_count',
    'forks_count',
    'effective_weeks',
    'weekly_cc_max',
    'weekly_cc_mean',
    'weekly_cc_std',
    'weekly_cc_mean_normalized',
]

In [34]:
# Named aggregations: result label -> (source column, aggregation function).
summary_spec = {
    'relevant_repositories': ('contributor_count', 'count'),
    'branches_exist_in': ('branch_exist', 'sum'),
    'issues_exist_in': ('issues_exist', 'sum'),
    'pr_exist_in': ('pr_exist', 'sum'),
    'issues_commment_exist_in': ('issues_commment_exist', 'sum'),
    'pr_review_exist_in': ('pr_review_exist', 'sum'),
}

# Aggregate the rows of the selected organization and transpose the result so
# that every aggregate becomes a row.
data_pipeline_selectcomp = (
    idf[idf.organization_name == select_org]
    .groupby("organization_name")
    .agg(**summary_spec)
    .transpose()
)

In [35]:
# Hand the aggregated summary to a Tabulator widget via `.pipe`, so the table
# is built from the pipeline's result.
summary_table_widget = pn.widgets.Tabulator
company_table = data_pipeline_selectcomp.pipe(summary_table_widget)

company_table

BokehModel(combine_events=True, render_bundle={'docs_json': {'a48757b6-a4c0-4e85-a732-ac5a6a1d5b35': {'version…

In [36]:
# All columns of the rows that belong to the organization chosen in `select_org`.
is_selected_org = idf.organization_name == select_org
data_pipeline_selectcomp_all = idf[is_selected_org]

In [37]:
# Tabulator widget showing the unaggregated rows of the selected organization.
detail_table_widget = pn.widgets.Tabulator
company_table_all = data_pipeline_selectcomp_all.pipe(detail_table_widget)

## Generate Dashboard

In [38]:
# Sidebar: explanatory text plus the organization selector.
# The three Markdown links previously had empty targets ("[here]()") and
# therefore pointed nowhere. The download link uses the same CSV URL the
# notebook reads the data from; the overview link points at the repository
# that hosts that CSV.
# NOTE(review): the github2pandas URL and the choice of the repository page as
# "overview of parameters" are assumptions - please confirm the intended targets.
sidebar_items = [
    pn.pane.Markdown("# Abstract"),
    pn.pane.Markdown("This page illustrates the usage of DiP-iT data set covering 17000 repositories of industrial Github repositories. The collection includes project parameters of 17 companies and was generated in 2021."),
    pn.pane.Markdown("An overview about the contained parameters is provided [here](https://github.com/TUBAF-IFI-DiPiT/Company_data_set). We used the [github2pandas Package](https://github.com/TUBAF-IFI-DiPiT/github2pandas) for generating the data set."),
    pn.pane.Markdown("# Company selection"),
    pn.pane.Markdown("The dashboard filters the repositories and depicts the distributions of contributors and duration for smaller projects."),
    select_org,
    pn.pane.Markdown("# Data set"),
    pn.pane.Markdown("The whole data set can be downloaded [here](https://raw.githubusercontent.com/TUBAF-IFI-DiPiT/Company_data_set/main/repository_data_final.csv)"),
]

# Main area, first row: heat map next to the per-organization summary table.
overview_row = pn.Row(
    pn.Column(
        heatmap.panel(width=600, height=500, margin=(0, 100, 0, 20))
    ),
    pn.Column(
        company_table.panel(width=1000)
    ),
)

# Main area, second row: table with all rows of the selected organization.
detail_row = pn.Row(
    company_table_all.panel(width=1000, height=500)
)

template = pn.template.FastListTemplate(
    title="DiP-iT Dataset",
    sidebar=sidebar_items,
    main=[overview_row, detail_row],
)

# Mark the template as servable (e.g. for `panel serve`); template.show() is
# the disabled alternative.
#template.show()
template.servable()