# Introduction

This notebook scrapes Travis build logs for selected projects in certain languages to determine whether certain tools (such as linters and coverage tools) are used in the build flow of those projects.

In [1]:
# Imports
%load_ext autoreload
%autoreload 2

from pymongo import MongoClient

import travis.scrape_build_logs as build_scraper

import analysis_utilities
import random
import re


In [2]:
# Connect to MongoDB with the client's default connection settings.
mongo_client = MongoClient()
database = mongo_client["graduation"]

# Collection handles used by the cells below.
pull_requests_collection, projects_collection, commits_collection = (
    database[collection_name]
    for collection_name in ("pull_requests", "projects", "commits")
)

## Project selection

We only select certain projects, as tool names differ per language/environment. For now we focus on Python (note: the query below currently also selects projects in several other languages).

In [14]:
# Languages included in the project selection.
selected_languages = ["C++", "Java", "Objective-C", "C#", "C", "Python", "PHP", "JavaScript", "Ruby"]

project_query = {
    'succeeded': True,
    'travis_is_oldest_ci': True,
    'language': {'$in': selected_languages},
}
projects_list = list(projects_collection.find(project_query))

print("{} known language projects".format(len(projects_list)))

# Attach to every project the second half of the split returned by
# analysis_utilities.split_prs_on_build_date.
# NOTE(review): presumably these are the PRs created after Travis was
# introduced for the project -- confirm against analysis_utilities.
for project in projects_list:
    split_slug = project["full_name"].split("/")  # "<owner>/<name>"

    pr_filter = {
        'project_name': split_slug[1],
        'project_owner': split_slug[0],
    }
    prs = list(pull_requests_collection.find(pr_filter))

    pr_before, pr_after = analysis_utilities.split_prs_on_build_date(
        project, prs, True, "status_travis_date"
    )

    project["travis_prs"] = pr_after

571 known language projects


In [19]:
def process_project(project, pr_sample_size=20, commit_sample_size=15, max_builds=7):
    """Sample Travis build logs of a project and scan them for CI tooling.

    The function randomly samples pull requests from ``project["travis_prs"]``,
    randomly samples commits from those pull requests, collects the Travis
    build URLs found in the commit statuses stored in ``commits_collection``,
    downloads (a random subset of) the corresponding build logs through
    ``build_scraper`` and hands them to ``process_logs``.

    Parameters
    ----------
    project : dict
        Project document; must provide ``"full_name"`` and ``"travis_prs"``
        (a list of pull requests, each with a ``"commits"`` list of SHAs).
    pr_sample_size : int, optional
        Number of pull requests to sample. Projects with fewer pull requests
        are skipped. Defaults to 20.
    commit_sample_size : int, optional
        Maximum number of commits to sample from the sampled pull requests.
        Defaults to 15.
    max_builds : int, optional
        Maximum number of distinct builds whose logs are fetched. Defaults to 7.

    Returns
    -------
    The value returned by ``process_logs`` for the fetched logs, or ``None``
    when the project has fewer than ``pr_sample_size`` pull requests.
    """
    travis_prs = project["travis_prs"]

    if len(travis_prs) < pr_sample_size:
        return None

    print(project["full_name"])

    sample_prs = random.sample(travis_prs, pr_sample_size)

    candidate_shas = [sha for pr in sample_prs for sha in pr["commits"]]

    # The sampled PRs may hold fewer commits than requested; random.sample
    # raises ValueError when asked for more items than the population has,
    # so cap the sample size instead of aborting the whole run.
    sample_commits = random.sample(
        candidate_shas, min(commit_sample_size, len(candidate_shas))
    )

    target_urls = []

    for sha in sample_commits:
        commit = commits_collection.find_one({'sha': sha, 'statuses': {'$exists': True}})
        if commit is None:
            continue

        # '$exists' also matches documents where the field is null, so fall
        # back to an empty list rather than iterating over None.
        for status in commit.get("statuses") or []:
            url = status.get("target_url")
            # Keep only status links that point at a Travis build page.
            if url is not None and "builds" in url and "travis-ci.org" in url:
                target_urls.append(url)

    # Several statuses can refer to the same build; de-duplicate identifiers.
    build_ids = list({
        build_scraper.retrieve_build_identifier_from_travis_url(url)
        for url in target_urls
    })

    if len(build_ids) > max_builds:
        build_ids = random.sample(build_ids, max_builds)

    logs = build_scraper.build_logs_for_identifiers(build_ids)

    return process_logs(logs)

In [20]:
def process_logs(logs, linter_keywords=("lint",),
                 coverage_keywords=("codecov", "coverage", "pytest-cov", "coveralls")):
    """Detect linter and coverage tooling mentions in build logs.

    Matching is a case-insensitive substring search, so e.g. ``ESLint``,
    ``pylint``, ``Coveralls`` and ``pytest-cov`` are all recognised.

    Parameters
    ----------
    logs : iterable of str
        Raw build log texts. Empty or ``None`` entries are ignored.
    linter_keywords : iterable of str, optional
        Substrings whose presence marks a log as using a linter.
    coverage_keywords : iterable of str, optional
        Substrings whose presence marks a log as using a coverage tool.

    Returns
    -------
    dict
        ``{'has_linter': bool, 'has_coverage': bool}``; each flag is ``True``
        when at least one log contains one of the corresponding keywords.
    """
    linter_keywords = [keyword.lower() for keyword in linter_keywords]
    coverage_keywords = [keyword.lower() for keyword in coverage_keywords]

    has_linter = False
    has_coverage = False

    for log in logs:
        if not log:
            continue

        # No keyword spans a line break, so searching the whole log is
        # equivalent to searching it line by line.
        text = log.lower()

        has_linter = has_linter or any(keyword in text for keyword in linter_keywords)
        has_coverage = has_coverage or any(keyword in text for keyword in coverage_keywords)

        # Both flags can only stay True from here on; no need to read more logs.
        if has_linter and has_coverage:
            break

    return {'has_linter': has_linter,
            'has_coverage': has_coverage}

In [None]:
# Analyse every selected project and persist the outcome on its MongoDB
# document. process_project returns None for projects with too few pull
# requests; that None is stored as well.
for project in projects_list:
    res = process_project(project)
    project["ci_features"] = res

    project_filter = {'full_name': project["full_name"]}
    project_update = {'$set': {'ci_features': res}}
    projects_collection.update_one(project_filter, project_update)

Leaflet/Leaflet
MRtrix3/mrtrix3
MarkUsProject/Markus
NancyFx/Nancy
OP2/PyOP2
OPM/opm-core
OPM/opm-parser
RestKit/RestKit
OSU-Net/cyder
Smoothieware/Smoothieware
AFNetworking/AFNetworking
Theano/Theano
TracksApp/tracks
PCSX2/pcsx2
PX4/Firmware
ansible/ansible-modules-core
ansible/ansible-modules-extras
antlr/antlr4
Pylons/pyramid
GoldenCheetah/GoldenCheetah
AnalyticalGraphicsInc/cesium
Azure/azure-sdk-for-node
IQSS/dataverse
ImageEngine/cortex
ReactiveX/RxJava
TrinityCore/TrinityCore
apache/storm
appcelerator/alloy
Azure/azure-xplat-cli
Unidata/netcdf-c
Unidata/thredds
UniversalMediaServer/UniversalMediaServer
Vauxoo/addons-vauxoo
appcelerator/titanium_mobile
appleseedhq/appleseed
KSP-KOS/KOS
aquynh/capstone
MariaDB/server
bokeh/bokeh
boto/boto
astropy/astropy
Could not retrieve build log from Travis, status code is 404
Could not retrieve build log from Travis, status code is 404
Could not retrieve build log from Travis, status code is 404
Could not retrieve build log from Travis, statu