# Analysis of repository collection

In this notebook, the retrieved results of the repository collection are analyzed.

In [None]:
from datetime import date
import os
import statistics
from pathlib import Path
from pprint import pprint

import pandas as pd
from pandas.plotting import table 
import matplotlib.pyplot as plt
import matplotlib.table as table
import seaborn as sns

In [None]:
# Use one default canvas size (width, height in inches) for every figure below.
plt.rcParams.update({'figure.figsize': [12, 8]})

Folders and constants

In [None]:
# Which accounts to analyse: "user", "group", or "all".
subset = "all"

# Input: the filtered repository table; output: the folder for saved figures.
fp_repos = Path("results") / "repositories_filtered.csv"
fp_figs = Path("figs")

# Make sure the figure folder is there before any plot is saved.
fp_figs.mkdir(exist_ok=True)

## Load filtered repositories

Load (manually) filtered repositories into notebook.

In [None]:
# Read the filtered repository table from the path set in the constants cell.
repos = pd.read_csv(filepath_or_buffer=fp_repos)

## Filtering for research groups or private users only

In [None]:
# Enriched user table produced by the collect_users step; it is read here for
# its `user_id` and `is_research_group` columns.
fp_users = "../collect_users/results/users_enriched.xlsx"
users = pd.read_excel(fp_users)

In [None]:
def add_research_group_filter(dataset, users_df=None):
    """Flag, for every row of ``dataset``, whether its owner is a research group.

    The owner is read from the ``owner`` column when the dataset has one;
    otherwise it is taken as the second-to-last ``/``-separated part of
    ``html_url_repository``. The owner is then looked up by ``user_id`` in the
    users table; when a ``user_id`` occurs more than once, the first
    occurrence decides.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Repository table with an ``owner`` or ``html_url_repository`` column.
    users_df : pandas.DataFrame, optional
        Table with ``user_id`` and ``is_research_group`` columns. Defaults to
        the notebook-level ``users`` table.

    Returns
    -------
    list of bool
        One entry per row of ``dataset``, in row order. ``True`` only when the
        matched user has ``is_research_group == 1``; owners that are missing
        or unknown yield ``False``.

    Raises
    ------
    KeyError
        If ``dataset`` has neither an ``owner`` nor an
        ``html_url_repository`` column.
    """
    if users_df is None:
        # Fall back to the table loaded earlier in the notebook.
        users_df = users

    # Build the lookup once instead of scanning the users table for every
    # repository. setdefault keeps the first occurrence of a user_id.
    group_flag_by_user = {}
    for user_id, flag in zip(users_df['user_id'], users_df['is_research_group']):
        if pd.isna(user_id):
            # A missing id can never equal an owner name.
            continue
        is_group = (not pd.isna(flag)) and bool(flag == 1)
        group_flag_by_user.setdefault(user_id, is_group)

    if 'owner' in dataset.columns:
        owners = dataset['owner']
    else:
        # e.g. "https://github.com/<owner>/<repo>" -> "<owner>"; a missing URL
        # gives no owner and is therefore flagged False below.
        owners = [
            url.split('/')[-2] if isinstance(url, str) else None
            for url in dataset['html_url_repository']
        ]

    return [group_flag_by_user.get(owner, False) for owner in owners]
    

In [None]:
# Compute one flag per repository, then store the flags as a new column.
research_group_flags = add_research_group_filter(repos)
repos['is_research_group'] = research_group_flags

## Subset repositories

Make a subset of repositories on "user", "group", or "all".
- "user" - Account of individual researcher
- "group" - Account of research group
- "all" - Both individual researchers and research groups

In [None]:
# Restrict the analysis to the account type selected in `subset`.
# .copy() makes the filtered frame independent of the original, so columns
# added to `repos` in later cells do not trigger SettingWithCopyWarning.
if subset == "user":
    repos = repos[~repos['is_research_group']].copy()
elif subset == "group":
    repos = repos[repos['is_research_group']].copy()
elif subset != "all":
    # Fail on a typo rather than silently analysing every repository and
    # saving figures under a misleading file suffix.
    raise ValueError(f'subset must be "user", "group" or "all", got {subset!r}')

## Datasets

Using the GitHub API, metadata on the repositories was collected.

In [None]:
# List every metadata column present in the repository table.
print("The repository data from Github contains the following columns:")
pprint(list(repos.columns))

## Stargazers

Stargazers represent the number of people that have 'starred' a GitHub project. Starring a project can indicate that a user likes the project. It can also be used to bookmark a project, since starred projects are saved. The number of stargazers can be used as a metric to measure popularity.

In [None]:
# Histogram of star counts over all repositories, written to the figure folder.
repos['stargazers_count'].plot.hist(
    bins=100,
    fontsize=12,
    title='Number of stargazers per repository',
)
plt.savefig(fp_figs / f'stargazers_{subset}.png')


In [None]:
# Show the ten repositories with the most stargazers.
repos.nlargest(n=10, columns='stargazers_count')

## Watchers

Watchers receive notifications about project activity.

In [None]:
# Histogram of watcher counts over all repositories, written to the figure folder.
repos['watchers_count'].plot(
    kind='hist',
    bins=128,
    fontsize=12,
    title='Number of watchers per repository',
)
plt.savefig(fp_figs / f'watchers_{subset}.png')

In [None]:
# Show the ten repositories with the most watchers.
repos.nlargest(n=10, columns='watchers_count')

## Has issues

Boolean stating whether a repository allows users to raise issues. An issue is a way to keep track of the tasks, enhancements and bugs of the project. They can be discussed in a thread by users and developers. Each repository can enable its own issue page. An issue can be open, for example when a new bug is found, or closed, when it is solved.

In [None]:
# Count how many repositories have each `has_issues` value and draw the counts as bars.
has_issues_counts = repos['has_issues'].value_counts()
has_issues_counts.plot(kind='bar', fontsize=12, title='Repository has issues')
plt.savefig(fp_figs / f'has_issues_{subset}.png')

## Open issues

The number of open issues a repository has.

In [None]:
# Number of repositories per open-issue count, ordered by the issue count itself.
open_issue_counts = repos['open_issues'].value_counts().sort_index()
open_issue_counts.plot(kind='bar', fontsize=12, title='Repository has open issues')
plt.savefig(fp_figs / f'open_issues_{subset}.png')

## Fork count

A fork is a copy of a repository for another user.

In [None]:
# Histogram of fork counts over all repositories, written to the figure folder.
repos['forks'].plot(
    kind='hist',
    bins=25,
    fontsize=12,
    title='Number of forks',
)
plt.savefig(fp_figs / f'forks_{subset}.png')

## Default branch

The default branch is the standard branch of a repository.

In [None]:
# How often each default branch name occurs, drawn as bars.
branch_counts = repos['default_branch'].value_counts()
branch_counts.plot(kind='bar', fontsize=12, title='barplot default branch ')
plt.savefig(fp_figs / f'default_branch_{subset}.png')

## Homepage

The homepage is often the personal website of the research group or user. 

In [None]:
# Keep only the ten most frequent homepage values and draw their counts as bars.
top_homepages = repos['homepage'].value_counts().nlargest(10)
top_homepages.plot(kind='bar', fontsize=12, title='barplot 10 most occuring homepages ')
plt.savefig(fp_figs / f'homepage_{subset}.png')

## Most recent commits 

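If a repository is actively maintained, we expect it to be more FAIR. As stated, a higher FAIR score implies a repository that is easier to find and work with. The number of months since the last activity is derived from the `updated_at` field of each repository.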

In [None]:
# Whole calendar months between each repository's `updated_at` and today:
# 12 * (difference in years) + (difference in month numbers). Days are ignored.
# NOTE(review): on GitHub `updated_at` may also change on activity other than
# commits; `pushed_at` may be a closer proxy for the last commit. Confirm
# which field is intended.
reference_date = date.today()

# Parse once, vectorised. Missing timestamps become NaT and give NaN months
# instead of crashing; malformed timestamps still raise.
updated_at = pd.to_datetime(repos['updated_at'])

months_ago_most_recent_commit = (
    12 * (reference_date.year - updated_at.dt.year)
    + (reference_date.month - updated_at.dt.month)
)
repos['months_ago_most_recent_commit'] = months_ago_most_recent_commit

In [None]:
# Histogram of the number of months since each repository was last updated.
ax = repos['months_ago_most_recent_commit'].plot(
    kind='hist',
    fontsize=12,
    title='histogram for amount of months since the last commit',
)
# Label the x-axis so the figure can be read on its own.
ax.set_xlabel('months ago')
plt.savefig(Path(fp_figs, f'most_recent_commit_months_{subset}.png'))

In [None]:
# Summary statistics of the months-since-update column. The labels name the
# quantity actually reported (they previously said "contributors").
months_since_update = repos['months_ago_most_recent_commit']
print('mean months since most recent commit: {:.1f}'.format(statistics.mean(months_since_update)))
print('median months since most recent commit: {}'.format(statistics.median(months_since_update)))
