Constraints
- Age < 1000 days
- Language: JavaScript, TypeScript, or CoffeeScript

In [83]:
import os
import requests, json, re, pickle
import pandas as pd
import networkx as nx
from datetime import datetime as dt
import matplotlib.pyplot as plt
import itertools
import functools
from collections import Counter

In [91]:
# Base URL of the GitHub REST API.
API_ROOT = 'https://api.github.com'
# SECURITY: credentials must never be committed to source. The token is read
# from the GITHUB_API_TOKEN environment variable (empty string when unset);
# export it before making API calls. Any token that was previously committed
# to this file should be revoked.
API_TOKEN = os.environ.get('GITHUB_API_TOKEN', '')
# GitHub account name sent together with the token for basic authentication.
USER = 'Takamichi-tsutsumi'


class NoRepositoryError(Exception):
    """Signals that a repository (or its contributor list) could not be fetched."""


def str2dt(tstr):
    """Parse a 'YYYY-MM-DDTHH:MM:SSZ' timestamp string into a naive datetime."""
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    parsed = dt.strptime(tstr, timestamp_format)
    return parsed


def api_get(url):
    """Send a GET request to `url` using the module-level credentials.

    Args:
        url: Full URL to request.

    Returns:
        A ``(body, status)`` tuple: the response body decoded from JSON
        (``None`` when the response has no body) and the integer HTTP
        status code.
    """
    # USER / API_TOKEN are only read here, so no `global` statement is needed.
    # Without a configured token, fall back to an unauthenticated request
    # instead of sending an empty password.
    auth = (USER, API_TOKEN) if API_TOKEN else None
    # requests never times out by default; bound the wait so a stalled
    # connection cannot hang the whole crawl.
    res = requests.get(url, auth=auth, timeout=30)
    # Body-less responses (e.g. HTTP 204) cannot be decoded as JSON.
    body = res.json() if res.content else None
    return body, res.status_code


def rate_limit():
    """Return the remaining request quota of the "core" API resource.

    Queries the ``/rate_limit`` endpoint and reads
    ``resources -> core -> remaining`` from the decoded response.
    """
    body, _status = api_get(API_ROOT + '/rate_limit')
    core_quota = body["resources"]["core"]
    return core_quota["remaining"]


def get_repo(repo):
    """Fetch the API description of a repository.

    Args:
        repo: Repository identifier in the form 'owner/name'.

    Returns:
        The decoded JSON body returned by the ``/repos/<owner>/<name>``
        endpoint.

    Raises:
        ValueError: If `repo` is not of the form 'owner/name'.
        NoRepositoryError: If the API answers with HTTP 404.
    """
    # Repository names may contain '.' and '_' (e.g. 'zenorocha/clipboard.js'),
    # so the name part accepts them. fullmatch() anchors both ends, and an
    # explicit raise (unlike `assert`) still validates under `python -O`.
    if not re.fullmatch(r'[A-Za-z0-9-]+/[A-Za-z0-9._-]+', repo):
        raise ValueError("Repository must look like 'owner/name': " + repr(repo))
    res, status = api_get(API_ROOT + '/repos/' + repo)
    if status == 404:
        raise NoRepositoryError("Repository not found: " + repo)
    return res


def basic_data_of(repo):
    """Collect the basic per-repository fields used by the analysis.

    Args:
        repo: Repository identifier in the form 'owner/name'.

    Returns:
        A dict with the repository's full name, owner name/type, a 0/1
        "firm_involvement" flag (0 when the owner type is "User"), fork flag,
        size, language, fork and watcher counts, the creation and last-push
        timestamps, and "age (days)": the number of whole days between
        creation and last push.
    """
    info = get_repo(repo)

    summary = {"repository": info["full_name"]}
    owner = info["owner"]
    summary["owner_name"] = owner["login"]
    summary["owner_type"] = owner["type"]
    # Any owner type other than a plain user account counts as firm involvement.
    summary["firm_involvement"] = int(summary["owner_type"] != "User")
    summary["is_fork"] = info["fork"]

    # These fields are copied over under the same key name.
    for key in ("size", "language", "forks", "watchers", "created_at", "pushed_at"):
        summary[key] = info[key]

    lifetime = str2dt(summary["pushed_at"]) - str2dt(summary["created_at"])
    summary["age (days)"] = lifetime.days
    return summary


def get_contributors_of(repo, per_page=100, page=1):
    """Fetch one page of the contributor list of `repo`.

    Args:
        repo: Repository identifier in the form 'owner/name'.
        per_page: Number of contributors requested per page.
        page: 1-based page number to request.

    Returns:
        The decoded response body of the contributors endpoint.

    Raises:
        NoRepositoryError: If the API answers with any status other than 200.
    """
    endpoint = ''.join([
        API_ROOT, '/repos/', repo,
        '/contributors?page=', str(page),
        '&per_page=', str(per_page),
    ])
    print("Start fetching contributors of " + repo)
    payload, code = api_get(endpoint)
    if code != 200:
        raise NoRepositoryError
    return payload


def repo_url_to_repo(url):
    """Extract the 'owner/name' identifier from a repository URL.

    Args:
        url: A URL (or path) whose last two '/'-separated segments are the
            owner and the repository name. A trailing '/' is tolerated.

    Returns:
        The string 'owner/name'.

    Raises:
        ValueError: If `url` does not end in two non-empty segments.
    """
    # Drop trailing slashes so '.../owner/name/' does not yield 'name/'.
    parts = url.rstrip("/").split("/")
    if len(parts) < 2 or not parts[-2] or not parts[-1]:
        raise ValueError("Cannot extract 'owner/name' from URL: " + repr(url))
    return parts[-2] + "/" + parts[-1]


def is_contributor_of(repo, user):
    """Tell whether `user` appears in the contributor list of `repo`.

    Pages through the contributor list 100 entries at a time and stops at the
    first page containing `user`, or at the first page that is not full.

    Args:
        repo: Repository identifier in the form 'owner/name'.
        user: Login name to look for.

    Returns:
        True if a contributor entry has this login, False otherwise.
    """
    page_size = 100
    page = 1
    while True:
        batch = get_contributors_of(repo, per_page=page_size, page=page)
        logins = [member["login"] for member in batch]
        if user in logins:
            return True
        if len(batch) != page_size:
            # A short page means there is nothing further to fetch.
            return False
        page += 1


def repos_user_contributed_to(user, repo):
    """List other repositories that `user` has contributed to.

    Searches for pull requests authored by `user`, maps each one to its
    repository and keeps the repositories whose contributor list contains
    `user`.

    Args:
        user: Login name of the contributor.
        repo: 'owner/name' of the repository under analysis; it is excluded
            from the result.

    Returns:
        A list of 'owner/name' strings without duplicates, in order of first
        appearance in the search results. If a search request fails, the
        repositories collected so far are returned (an empty list when the
        very first request fails).
    """
    per_page = 100
    page = 1
    # Repositories already examined; seeding it with `repo` excludes the
    # repository under analysis and avoids re-checking a repository that
    # shows up in several pull requests or on several pages.
    seen = {repo}
    contributed = []
    while True:
        # Query parameters are joined with '&'.
        issues_url = (API_ROOT + '/search/issues?q=type:pr+author:' + user
                      + '&per_page=' + str(per_page) + '&page=' + str(page))
        res, status = api_get(issues_url)
        if status != 200:
            # Keep what was gathered from earlier pages instead of discarding it.
            break

        items = res["items"]
        for pr in items:
            candidate = repo_url_to_repo(pr['repository_url'])
            if candidate in seen:
                continue
            seen.add(candidate)
            if is_contributor_of(candidate, user):
                contributed.append(candidate)

        # Count the returned items (not the keys of the response dict) to
        # decide whether another page may exist.
        if len(items) < per_page:
            break
        page += 1
    return contributed


# Functions to implement graph
def has_connection(cx, cy):
    """Count the matching project pairs between two contributors.

    Every entry of ``cx["projects"]`` is compared with every entry of
    ``cy["projects"]``; the return value is the number of equal pairs
    (0 when the contributors share no project).
    """
    pairs = itertools.product(cx["projects"], cy["projects"])
    return sum(1 for left, right in pairs if left == right)


def link_of(contributors):
    """Build the edge list of the contributor network.

    Returns a list of ``(name, name)`` tuples, one for every unordered pair
    of contributors for which has_connection() reports a non-zero count.
    """
    return [
        (first["name"], second["name"])
        for first, second in itertools.combinations(contributors, 2)
        if has_connection(first, second) != 0
    ]


def graph_of_contributors(cs):
    """Create an undirected graph of contributors.

    Each contributor's "name" becomes a node; the edges are the pairs
    returned by link_of().
    """
    node_names = [member["name"] for member in cs]
    edges = link_of(cs)

    network = nx.Graph()
    network.add_nodes_from(node_names)
    network.add_edges_from(edges)
    return network


# Functions to calculate network characteristics
def cohesion_and_degree_centrality(users):
    """Compute the internal-cohesion ratio of a group of contributors.

    For every unordered pair of users, counts the entries of the first
    user's "projects" that also occur in the second user's "projects", and
    divides the total by ``n * (n - 1)`` where ``n`` is the number of users.
    Progress (pair names, shared projects, total count) is printed.

    Args:
        users: Sequence of dicts with "name" and "projects" keys.

    Returns:
        The ratio as a float; 0.0 when there are fewer than two users, since
        no pair exists and the ratio would otherwise divide by zero.
    """
    count = 0
    for x, y in itertools.combinations(users, 2):
        print(x["name"], y["name"])
        for p in x["projects"]:
            if p in y["projects"]:
                print(p)
                count += 1
    print(count)
    possible = len(users) * (len(users) - 1)
    if possible == 0:
        return 0.0
    return count / possible
    

def degree_centralities(graph):
    """Return the result of ``graph.degree()`` for the given graph.

    NOTE(review): despite the name, this hands back raw node degrees as
    reported by the graph object, not a normalised centrality - confirm
    that this is what callers expect.
    """
    degrees = graph.degree()
    return degrees

def contributors(repo):
    """Gather every contributor of `repo` together with their other projects.

    Pages through the contributor list 100 entries at a time. For each
    contributor a dict is built with the keys "id", "name" (the login),
    "contributions" and "projects" (the result of
    repos_user_contributed_to() for that login).

    Args:
        repo: Repository identifier in the form 'owner/name'.

    Returns:
        A list of contributor dicts in the order the API returned them.
    """
    page_size = 100
    records = []
    page = 1
    while True:
        batch = get_contributors_of(repo, per_page=page_size, page=page)
        page += 1
        for entry in batch:
            records.append({
                "id": entry["id"],
                "name": entry["login"],
                "contributions": entry["contributions"],
                "projects": repos_user_contributed_to(entry["login"], repo),
            })
        if len(batch) != page_size:
            # A short page is the last one.
            break
    return records


def commit_count(users):
    """Return the sum of the "contributions" values over all `users`."""
    return sum(member["contributions"] for member in users)

    
def save_to_csv(data, filename):
    """Write `data` to `filename` as CSV by way of a pandas DataFrame."""
    pd.DataFrame(data).to_csv(filename)


def save_contributors(contributors, repo_name):
    """Write a repository's contributor records to a CSV file.

    The file is stored as ``contributors/<owner>_<name>.csv`` relative to the
    current working directory; '/' in `repo_name` is replaced by '_'.

    Args:
        contributors: Records accepted by ``pandas.DataFrame`` (e.g. a list
            of dicts).
        repo_name: Repository identifier in the form 'owner/name'.
    """
    out_dir = 'contributors'
    # Create the output directory on first use; writing into a missing
    # directory would otherwise fail.
    os.makedirs(out_dir, exist_ok=True)
    df = pd.DataFrame(contributors)
    df.to_csv(os.path.join(out_dir, repo_name.replace('/', '_') + '.csv'))


def get_data(repo):
    """Collect all metrics of one repository.

    Fetches the basic repository data and the contributor list, saves the
    contributors to CSV, and adds the total commit count, the "manager",
    the internal cohesion and the manager's degree in the contributor graph.

    Args:
        repo: Repository identifier in the form 'owner/name'.

    Returns:
        The dict produced by basic_data_of() extended with the keys
        "commits", "manager", "internal cohesion" and "degree centrality".
    """
    print("Start fetching repository data...")
    data = basic_data_of(repo)
    print("Successfully fetched repository data!")
    print("Start fetching contributors data...")
    c = contributors(repo)
    save_contributors(c, repo)
    print("Successfully fetched contributors data!")
    data["commits"] = commit_count(c)
    print("Calculate internal cohesion")
    # The first contributor returned by the API is treated as the manager.
    # NOTE(review): presumably the API lists the top committer first - confirm.
    data["manager"] = c[0]["name"]
    # The cohesion helper in this file is named cohesion_and_degree_centrality;
    # there is no function called `cohesion`.
    data["internal cohesion"] = cohesion_and_degree_centrality(c)
    graph = graph_of_contributors(c)
    data["degree centrality"] = degree_centralities(graph)[data["manager"]]
    return data

In [14]:
# Trial run: collect the metrics of two sample repositories. Results are
# appended one by one, so repo_data keeps whatever finished if a later
# repository fails.
repos = ['Takamichi-tsutsumi/onocolo-client', 'mattn/goveralls']
repo_data = []
for r in repos:
    repo_data.append(get_data(r))

Start fetching repository data...
Successfully fetched repository data!
Start fetching contributors data...
Start fetching contributors of Takamichi-tsutsumi/onocolo-client
Start fetching contributors of Takamichi-tsutsumi/onocolo-client
Start fetching contributors of akotani/letel-web
Start fetching contributors of akotani/letel-rails
Start fetching contributors of Takamichi-tsutsumi/Meeeal
Start fetching contributors of akotani/letel-font
Start fetching contributors of HiroNonoyama/tsurusenishiguchi_dentalClinic
Start fetching contributors of Takamichi-tsutsumi/ChatStormingServerAPI
Start fetching contributors of Takamichi-tsutsumi/onocolo-client
Start fetching contributors of HiroNonoyama/SecretaryBot
Start fetching contributors of HiroNonoyama/tsurusenishiguchi_dentalClinic
Start fetching contributors of Takamichi-tsutsumi/Idobata.poli
Start fetching contributors of Takamichi-tsutsumi/hacker-wars
Start fetching contributors of Takamichi-tsutsumi/onocolo-client
Start fetching contri

In [16]:
# Persist the metrics collected in repo_data to repos.csv.
save_to_csv(repo_data, 'repos.csv')

In [86]:
# Check how many "core" API requests are still available.
rate_limit()

4983

In [87]:
# NOTE: despite the variable name, get_data() returns the repository's
# metrics dict, not its contributor list.
contributors_onocolo = get_data('Takamichi-tsutsumi/onocolo-client')

Start fetching repository data...
Successfully fetched repository data!
Start fetching contributors data...
Start fetching contributors of Takamichi-tsutsumi/onocolo-client
Start fetching contributors of akotani/letel-rails
Start fetching contributors of HiroNonoyama/tsurusenishiguchi_dentalClinic
Start fetching contributors of akotani/letel-font
Start fetching contributors of akotani/letel-web
Start fetching contributors of Takamichi-tsutsumi/Meeeal
Start fetching contributors of HiroNonoyama/tsurusenishiguchi_dentalClinic
Start fetching contributors of Takamichi-tsutsumi/ChatStormingServerAPI
Start fetching contributors of HiroNonoyama/SecretaryBot
Start fetching contributors of Takamichi-tsutsumi/Idobata.poli
Start fetching contributors of Takamichi-tsutsumi/hacker-wars
Start fetching contributors of HiroNonoyama/training
Start fetching contributors of Takamichi-tsutsumi/ChatStorming
Start fetching contributors of HiroNonoyama/tsurusenishiguchi_dentalClinic
Start fetching contributo

In [77]:
def search_repos():
    """Search for JavaScript repositories created after 2014-01-01.

    Requests up to 20 pages of 100 results each, sorted by stars in
    descending order, and stops early at the first response whose status is
    not 200.
    NOTE(review): the search API presumably caps the number of reachable
    results, in which case later pages end the loop - confirm against the
    API documentation.

    Returns:
        A list of 'owner/name' strings in the order returned by the API.
    """
    max_pages = 20
    page = 1
    status = 200
    repos = []
    while page <= max_pages and status == 200:
        # The page number must be passed as 'page=<n>'; without the '=' the
        # parameter is not recognised and page 1 is returned every time.
        url = (API_ROOT
               + '/search/repositories?q=language:javascript+created:>2014-01-01'
               '&sort=stars&order=desc&per_page=100&page=' + str(page))
        res, status = api_get(url)
        if status == 200:
            repos.extend(r["full_name"] for r in res["items"])
        page += 1
    return repos

In [78]:
# Fetch the candidate repository names from the search API.
repos = search_repos()

In [89]:
# Display the fetched repository names.
repos

['FreeCodeCamp/FreeCodeCamp',
 'facebook/react-native',
 'nodejs/node',
 'reactjs/redux',
 'NARKOZ/hacker-scripts',
 'Dogfalo/materialize',
 'callemall/material-ui',
 'nylas/N1',
 'yarnpkg/yarn',
 'babel/babel',
 'ReactTraining/react-router',
 'facebookincubator/create-react-app',
 'kenwheeler/slick',
 'facebook/immutable-js',
 'zenorocha/clipboard.js',
 'angular/material',
 't4t5/sweetalert',
 'bevacqua/dragula',
 'GitbookIO/gitbook',
 'serverless/serverless',
 'Kickball/awesome-selfhosted',
 'zeit/hyper',
 'facebook/flux',
 'ParsePlatform/parse-server',
 'kriasoft/react-starter-kit',
 'julianshapiro/velocity',
 'github/fetch',
 'jlmakes/scrollreveal',
 'mxstbr/react-boilerplate',
 'naptha/tesseract.js',
 'mzabriskie/axios',
 'RocketChat/Rocket.Chat',
 'juliangarnier/anime',
 'MostlyAdequate/mostly-adequate-guide',
 'segmentio/nightmare',
 'mattermost/platform',
 'git-tips/tips',
 'jwagner/smartcrop.js',
 'Flipboard/react-canvas',
 'ampproject/amphtml',
 'verekia/js-stack-from-scratch

In [82]:
def repos_data(repos):
    """Return the get_data() result for every repository in `repos`, in order."""
    return [get_data(name) for name in repos]

In [None]:
# Compute the metrics of the first 15 search results and write them to
# oss_data.csv.
oss_data = repos_data(repos[:15])
save_to_csv(oss_data, 'oss_data.csv')

Start fetching repository data...
Successfully fetched repository data!
Start fetching contributors data...
Start fetching contributors of FreeCodeCamp/FreeCodeCamp
Start fetching contributors of FreeCodeCamp/hour-of-code
Start fetching contributors of FreeCodeCamp/design-style-guide
Start fetching contributors of QuincyLarson/hyperdev-test
Start fetching contributors of neveragaindottech/neveragaindottech.github.io
Start fetching contributors of neveragaindottech/neveragaindottech.github.io
Start fetching contributors of neveragaindottech/neveragaindottech.github.io
Start fetching contributors of neveragaindottech/neveragaindottech.github.io
Start fetching contributors of neveragaindottech/neveragaindottech.github.io
Start fetching contributors of realworldreact/realworldreact.com
Start fetching contributors of jescalan/rupture
Start fetching contributors of iansinnott/react-static-webpack-plugin
Start fetching contributors of BerkeleyTrue/berkeleys-spectacle-boilerplate
Start fetchin