In [4]:
import requests
from datetime import datetime
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from pypi_simple import PyPISimple

### Core Idea

A good Python project is most likely distributed via PyPI, so we track how often each package is released there.

In [2]:
# Sample repositories, one string per entry, formatted as
# '<GitHub repository URL> | <integer score>'.
# The last path segment of the URL is what gets looked up on PyPI, and the
# integer after the '|' is collected into `my_eval` further down.
# NOTE(review): the score is presumably a hand-assigned quality rating — confirm.
test_data = [
    'https://github.com/PyCQA/pylint | 10',
    'https://github.com/PyCQA/flake8 | 10',
    'https://github.com/deepfakes/faceswap | 9',
    'https://github.com/cookiecutter/cookiecutter | 8',
    'https://github.com/zappa/zappa | 7',
    'https://github.com/box/box-python-sdk | 7',
    'https://github.com/box/flaky | 6',
    'https://github.com/zalando/python-nsenter | 6',
    'https://github.com/jazzband/django-pipeline | 5',
    'https://github.com/tiangolo/full-stack | 4',
    'https://github.com/miracle2k/flask-assets | 3',
    'https://github.com/tiangolo/docker-auto-labels | 2',
    'https://github.com/tiangolo/bitbucket_issues_to_redmine_csv | 1',
    'https://github.com/zalando-stups/senza | 1',
    'https://github.com/aizvorski/scikit-video | 1',
]

In [50]:
def get_release_date(tag):
    """Return the timestamp of a release element as a naive ``datetime``.

    The value is read from the ``datetime`` attribute of the first ``time``
    element found under ``tag`` and must look like
    ``2021-11-25T16:06:17+0000``. The parsed UTC offset is dropped (not
    converted), so the returned object has no ``tzinfo``.

    Raises:
        IndexError: if ``tag`` contains no ``time`` element.
        ValueError: if the attribute does not match the expected format.
    """
    first_time_element = tag.findAll('time')[0]
    aware_timestamp = datetime.strptime(
        first_time_element['datetime'],
        '%Y-%m-%dT%H:%M:%S%z',
    )
    # Strip the offset without shifting the clock fields.
    return aware_timestamp.replace(tzinfo=None)

def get_data(repo_name, timeout=10.0):
    """Summarise how often ``repo_name`` publishes stable releases on PyPI.

    The project's PyPI page is downloaded once and every ``div`` with class
    ``release`` that does not carry the pre-release badge is treated as a
    stable release. The page is expected to list releases newest first.

    Args:
        repo_name: Project name as used in ``https://pypi.org/project/<name>/``.
        timeout: Seconds to wait for PyPI before ``requests`` raises a timeout
            error, so a stalled connection cannot hang the notebook.

    Returns:
        ``None`` when the page does not answer with HTTP 200 (silently), or
        when fewer than two non-zero-day intervals between consecutive stable
        releases exist (after printing a short "None" report). Otherwise a
        tuple ``(first_release_time, last_release_time, releases_amount,
        average_release_time, median_release_time)`` where the two times are
        naive ``datetime`` objects, ``releases_amount`` counts all stable
        releases, and the average/median are whole days. The median is the
        upper middle element when the number of intervals is even.
    """
    # A single request serves both as the existence check and as the page to
    # parse; the '#history' fragment is never sent to the server anyway.
    page = requests.get(f'https://pypi.org/project/{repo_name}/', timeout=timeout)
    if page.status_code != 200:
        return None

    soup = BeautifulSoup(page.content, "html.parser")
    pre_release_badge = {'class': 'badge badge--warning release__version-badge'}
    stable_releases = [
        release
        for release in soup.findAll('div', {'class': "release"})
        if not release.findAll('span', pre_release_badge)
    ]

    # Parse every release date exactly once.
    release_dates = [get_release_date(release) for release in stable_releases]

    # Whole-day gaps between each release and the next older one; releases
    # published zero whole days apart are ignored.
    interval_days = [
        (newer - older).days
        for newer, older in zip(release_dates, release_dates[1:])
    ]
    interval_days = [days for days in interval_days if days != 0]

    # Also covers a project with no stable release at all (empty list), which
    # would otherwise fail when indexing the first/last release below.
    if len(interval_days) <= 1:
        print('-' * 15)
        print('REPO_NAME:', repo_name)
        print('None')
        return None

    last_release_time = release_dates[0]
    first_release_time = release_dates[-1]
    releases_amount = len(release_dates)

    average_release_time = int(sum(interval_days) / len(interval_days))
    median_release_time = sorted(interval_days)[len(interval_days) // 2]

    print('-' * 15)
    print('REPO_NAME:', repo_name)
    print('first_release_time:', first_release_time)
    print('last_release_time:', last_release_time)
    print('releases_amount', releases_amount)
    print('average_release_time', average_release_time)
    print('median_release_time', median_release_time)

    return (first_release_time,
            last_release_time,
            releases_amount,
            average_release_time,
            median_release_time)

# Split every '<repo URL> | <score>' entry once: spaces are removed, then the
# text is cut at '|' into [url, score].
_entry_fields = [entry.replace(' ', '').split('|') for entry in test_data]
# The last URL path segment is used as the PyPI project name.
exp_results = [get_data(fields[0].split('/')[-1]) for fields in _entry_fields]
# Integer scores, in the same order as exp_results.
my_eval = [int(fields[1]) for fields in _entry_fields]

---------------
REPO_NAME: pylint
first_release_time: 2004-06-22 10:57:08
last_release_time: 2021-11-25 16:06:17
releases_amount 117
average_release_time 64
median_release_time 48
---------------
REPO_NAME: flake8
first_release_time: 2010-08-12 13:36:02
last_release_time: 2021-10-11 12:42:47
releases_amount 74
average_release_time 68
median_release_time 42
---------------
REPO_NAME: cookiecutter
first_release_time: 2013-07-14 19:23:10
last_release_time: 2021-05-14 09:48:46
releases_amount 30
average_release_time 118
median_release_time 73
---------------
REPO_NAME: zappa
first_release_time: 2016-01-20 21:59:36
last_release_time: 2021-11-11 20:31:33
releases_amount 128
average_release_time 23
median_release_time 6
---------------
REPO_NAME: box-python-sdk
None
---------------
REPO_NAME: flaky
first_release_time: 2014-04-04 03:09:29
last_release_time: 2020-07-08 21:32:31
releases_amount 36
average_release_time 71
median_release_time 28
---------------
REPO_NAME: django-pipeline
first_rel

In [45]:
# pypi_simple demo: ask the client for the index URL of a single project and
# print it.
# NOTE(review): `it` is created but never consumed in this cell — confirm
# whether a later cell relies on it.
with PyPISimple() as client:
    it = client.stream_project_names()
    print(client.get_project_url('fastapi'))

# Repository-link extraction: download a project's PyPI page, collect the
# anchors whose class attribute is the vertical-tabs string below, and print
# the href of each one whose text starts with the word 'Homepage'.
# The recorded output lists the same URL twice, so more than one anchor on the
# page matched.
link = 'https://pypi.org/project/fastapi/'
soup = BeautifulSoup(requests.get(link).content, "html.parser")
tags = soup.findAll('a', {'class' : "vertical-tabs__tab vertical-tabs__tab--with-icon vertical-tabs__tab--condensed"})
for tag in tags:
    # split(None) splits on any whitespace run; [0] is the first word of the label.
    if tag.getText().split(None)[0] == 'Homepage':
        print(tag['href'])

https://pypi.org/simple/fastapi/
https://github.com/tiangolo/fastapi
https://github.com/tiangolo/fastapi
