# Sample: reading collected result data

This notebook demonstrates reading build logs for failed builds
from the "pipfile success" runs,
which is one of the less expected results.

In [8]:
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

First, list the csv files for our runs:

In [9]:
ls data_collection

build_repo.csv                    repo2docker-checker_test1.csv
from_event_archive.csv            results.csv
import_error.csv                  success_default_requirements.csv
install_failed.csv                success_pipfile.csv
julia.csv                         success_with_dependencies.csv
r.csv                             success_without_dependencies.csv


Load the results into a data frame:

In [116]:
# The summary file is ';'-separated and uses its "No" column as the row index.
df = pd.read_csv(
    "data_collection/success_pipfile.csv",
    sep=";",
    index_col="No",
)
df.head()

Unnamed: 0_level_0,Repo,repo2docker-checker results
No,Unnamed: 1_level_1,Unnamed: 2_level_1
1,haishenming/uda-u5,1 ok
2,driscollis/NebraskaCode_Python_Decorators,"1 failure, TypeError"
3,kazup0n/lifegame,1 ok
4,onyxfish/nicar-2017-agate,Build failed
5,datadesk/python-road-clipping-example,1 failure. Cell execution timed out


Result data is stored in csv files based on the repo URL and timestamp of the run:

In [65]:
cat runs/github.com/a/agconti/dog-walks/results/results-master-2020-07-23T11.46.csv

repo,ref,resolved_ref,last_modified,kind,test_id,success,path,timestamp,run_id,repo2docker_version
https://github.com/agconti/dog-walks,master,8b6c692,2018-03-22T22:50:42-04:00,build,build,False,github.com/a/agconti/dog-walks/logs/build-master-2020-07-23T11.46.txt,2020-07-23T11:46:58.837968,2020-07-23T11.46,0.11.0


In [125]:
from textwrap import indent

def tail(path, max_lines=100, max_line_length=100, skip=0):
    """Print the end of a text file, indented, with over-long lines truncated.

    Inefficient: the whole file is read into memory before it is sliced.

    Parameters
    ----------
    path : str
        File to read.
    max_lines : int
        Maximum number of lines to print.
    max_line_length : int
        Lines longer than this (not counting trailing whitespace) are shown as
        two lines: their first and their last ``max_line_length - 3``
        characters, each marked with ``...``.
    skip : int
        Number of lines at the very end of the file to leave out.
    """
    with open(path) as f:
        all_lines = f.readlines()
    # Explicit bounds instead of negative slice indices, so that max_lines=0
    # prints nothing (a start index of -0 would select the whole file).
    end = max(len(all_lines) - skip, 0)
    start = max(end - max_lines, 0)
    keep = max_line_length - 3
    for line in all_lines[start:end]:
        # Strip the newline first: it must not count towards the length limit,
        # nor be echoed as an extra blank line after a truncated line.
        line = line.rstrip()
        if len(line) > max_line_length:
            print(indent(line[:keep] + "...", "  "))
            print(indent(line[-keep:], "  ..."))
        else:
            print(indent(line, "  "))

In [126]:
def results_for_repo(repo_slug):
    """Collect the recorded results of every run for one GitHub repo.

    Reads all ``*.csv`` files under
    ``runs/github.com/<first letter of slug>/<slug>/results`` (relative to the
    current working directory) and skips files that contain no data rows.

    Parameters
    ----------
    repo_slug : str
        ``owner/name`` of the repo; lowercased before the path is built.

    Returns
    -------
    pandas.DataFrame or None
        All rows sorted by the ``timestamp`` column with a fresh ``0..n-1``
        index, or ``None`` when no non-empty result file exists.
    """
    repo_slug = repo_slug.lower()
    results_dir = os.path.join("runs", "github.com", repo_slug[0], repo_slug, "results")

    # collect data from csv files for runs, parsing each file only once
    frames = []
    for csv_path in sorted(glob.glob(os.path.join(results_dir, "*.csv"))):
        # parse_dates=True only affects the index, so "timestamp" stays a string
        # column, exactly as before.
        frame = pd.read_csv(csv_path, parse_dates=True)
        if len(frame):
            frames.append(frame)
    if not frames:
        return None

    combined = pd.concat(frames)
    # Sort first, then renumber, so the index follows the sorted order.
    # mergesort is stable: rows sharing a timestamp keep their file order.
    return combined.sort_values("timestamp", kind="mergesort").reset_index(drop=True)


In [127]:
# Example: all recorded results for the repo in the row labelled 9.
results_for_repo(df.Repo.loc[9])

Unnamed: 0,repo,ref,resolved_ref,last_modified,kind,test_id,success,path,timestamp,run_id,repo2docker_version
0,https://github.com/samzhuwj/use-pyfpdf,master,049ad9b,2018-01-20T00:11:26+08:00,build,build,True,github.com/s/samzhuwj/use-pyfpdf/logs/build-ma...,2020-07-23T11:55:56.416858,2020-07-23T11.55,0.11.0
1,https://github.com/samzhuwj/use-pyfpdf,master,049ad9b,2018-01-20T00:11:26+08:00,notebook,use-pyfpdf.ipynb,False,github.com/s/samzhuwj/use-pyfpdf/logs/test-not...,2020-07-23T11:55:56.416858,2020-07-23T11.55,0.11.0


In [129]:
import glob
import os

# Print the end of the build log for every repo that has a failed build.
for repo_slug in df.Repo:
    result_data = results_for_repo(repo_slug)
    if result_data is None:
        # nothing recorded for this repo
        continue

    is_failed_build = (result_data.kind == "build") & (~result_data.success)
    build_failures = result_data[is_failed_build]
    if build_failures.empty:
        continue

    # Recorded paths are relative to the runs directory; show the last failure.
    log_file = os.path.join("runs", build_failures.path.tail(1).item())
    print(f"\n\n\n{repo_slug} build log failure: {log_file}")
    tail(log_file, 20, max_line_length=128, skip=8)





onyxfish/nicar-2017-agate build log failure: runs/github.com/o/onyxfish/nicar-2017-agate/logs/build-master-2020-07-23T11.35.txt
    File "/srv/conda/envs/notebook/lib/python3.7/site-packages/pipenv/core.py", line 795, in do_install_dependencies
  [0m[91m    lockfile = project.get_or_create_lockfile(from_pipfile=True)
    File "/srv/conda/envs/notebook/lib/python3.7/site-packages/pipenv/project.py", line 756, in get_or_create_lockfile
  [0m[91m    path=self.lockfile_location, data=lockfile_dict, meta_from_project=False
    File "/srv/conda/envs/notebook/lib/python3.7/site-packages/pipenv/vendor/requirementslib/models/lockfile.py", line 209, in ...
  ...rv/conda/envs/notebook/lib/python3.7/site-packages/pipenv/vendor/requirementslib/models/lockfile.py", line 209, in from_data

  [0m[91m    lockfile = plette.lockfiles.Lockfile(data)
    File "/srv/conda/envs/notebook/lib/python3.7/site-packages/pipenv/vendor/plette/models/base.py", line 37, in __init__
      [0m[91mself.validat

In many cases, Pipfile and/or Pipfile.lock are pinning packages such as numpy
to versions that do not work with current Python 3.8, but Python itself is not pinned,
despite *support* for specifying the Python version in Pipfiles.