We just need the <tt>log</tt> files of the first level:

In [None]:
import glob
# Root folder that holds the copied build logs.
ROOT_DIR = r"C:/Temp/build_logs"
# Files named "log" exactly two directory levels below the root.
GLOB_PATTERN = "*/*/log"
search_pattern = "/".join([ROOT_DIR, GLOB_PATTERN])
log_file_paths = glob.glob(search_pattern)
# Peek at the first few matches only.
log_file_paths[:5]

We clean up these ugly, OS-specific file separators by replacing them with the common forward slash. (Note: We could have also used <tt>os.sep</tt>, but as soon as you extract parts of a path, e.g. with a regex, the pattern becomes unreadable.)

In [None]:
# Turn every backslash into a forward slash so all paths share one separator.
log_file_paths = [file_path.replace("\\", "/") for file_path in log_file_paths]
log_file_paths[:5]

In [None]:
import pandas as pd
# One row per log file; the path is the only column at this point.
logs = pd.DataFrame(data=log_file_paths, columns=['path'])
logs.head()

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
logs.info()

From the path, we can extract the name of the Jenkins job as well as the build number of the executed job.

In [None]:
# The last two directory names before the trailing "/log" are captured as
# named groups; the group names become the names of the new columns.
job_and_build = logs['path'].str.extract(
    r"^.*/(?P<jobname>.*)/(?P<buildnumber>.*)/log$",
    expand=True,
)
logs = logs.join(job_and_build)
logs.head()

Read the files from the folders into a list

In [None]:
from collections import deque

def load_content(file_path, max_lines=200):
    """Return the last lines of a UTF-8 text file as one string.

    Parameters
    ----------
    file_path : str
        Path of the file to read.
    max_lines : int, optional
        Maximum number of trailing lines to keep (default 200).

    Returns
    -------
    str
        The kept lines concatenated in file order. Line endings are retained
        as read, so no extra separator is inserted.
    """
    with open(file_path, mode='r', encoding="utf-8") as f:
        # A deque with a maximum length holds at most ``max_lines`` items and
        # drops the oldest one whenever a new line is appended, so only the
        # tail of the file is left once the iteration is done.
        return "".join(deque(f, max_lines))

In [None]:
def load_content_via_csv(file_path):
    """Return the tail of a log file, read through pandas' CSV parser.

    The file is parsed without a header row and with the control character
    U+0012 as column separator (presumably chosen because it does not occur
    in the logs, so each line ends up in the first column -- TODO confirm).
    The last 200 entries of that first column are joined with newlines.
    """
    parsed = pd.read_csv(file_path, sep="\u0012", header=None, encoding="utf-8")
    first_column = parsed[0]
    last_entries = first_column[-200:]
    return "\n".join(last_entries.values)

This could take some time to execute.

In [None]:
# Read the tail of every log file into a new 'content' column.
log_paths = logs['path']
logs['content'] = log_paths.apply(load_content)
logs.head()

In [None]:
# Capture whatever follows the "Finished: " marker up to the end of the line.
finished_status = logs['content'].str.extract(r"Finished: (.*)\n", expand=False)
logs['finished'] = finished_status
# Report how many logs carried the marker at all.
with_marker = len(logs[logs['finished'].notnull()])
print("{}/{}".format(with_marker, len(logs)) + ' identified by "Finished:" marker.')
logs.head()

List of the builds that could not be classified (this can be caused, e.g., by the copy process being aborted because the file was being written at the same time).

In [None]:
# Logs without a "Finished:" marker get an explicit placeholder status
# instead of a missing value.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
logs.loc[logs['finished'].isnull(), 'finished'] = "UNKNOWN"
# Sanity check: every build must carry a status from here on.
assert not logs['finished'].isnull().any(), "Non treated pattern for failures."

# Identification of errors

In [None]:
# Drop a classification left over from an earlier run so the cells below can
# be re-executed. The column does not exist on a fresh kernel, where an
# unconditional delete would raise a KeyError and stop "Run All".
if 'error' in logs.columns:
    del logs['error']

## Mark successful executions

In [None]:
# Boolean mask of all builds whose final status is SUCCESS.
successfull_executions = logs['finished'] == "SUCCESS"
number_of_successfull_executions = int(successfull_executions.sum())
# Successful builds have no error; the literal "none" keeps them apart from
# rows that are still unclassified (missing value) in the cells that follow.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
logs.loc[successfull_executions, 'error'] = "none"
print(str(number_of_successfull_executions) + "/" + str(len(logs)) + ' builds identified as successfull.')

## General
First, try to extract the message behind the first <tt>ERROR</tt> marker.

In [None]:
# Only rows that have no error classification yet are examined.
non_successful = logs['error'].isnull()
non_successful_logs = logs[non_successful]
# Capture the text after the first space that follows an "ERROR" / "[ERROR]"
# marker at the start of a line; logs without such a line yield a missing value.
error_state = non_successful_logs['content'].str.extract(r"\n\[?ERROR\]?.*? (.*)\n", expand=False)
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
logs.loc[non_successful, 'error'] = error_state
# Count only the builds where a marker was actually found; the number of all
# examined builds would overstate the result.
print(str(error_state.count()) + "/" + str(len(logs)) + ' builds identified by "ERROR" markers.')

## SonarQube errors

In [None]:
SONAR_ERROR_MARKER = "SonarQube analysis completed: FAILURE"
# Still-unclassified builds whose log contains the SonarQube failure line.
sonar_errors = (logs['error'].isnull()) & (logs['content'].str.contains(SONAR_ERROR_MARKER))
number_of_sonar_errors = int(sonar_errors.sum())
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
logs.loc[sonar_errors, 'error'] = SONAR_ERROR_MARKER
print(str(number_of_sonar_errors) + "/" + str(len(logs)) + ' errors identified by "SonarQube" markers.')
logs.head()

## Failed execution of Maven

Overrides already existing errors.

In [None]:
# Both markers are used as regular expressions, so the brackets are escaped.
# Raw strings keep the backslashes without relying on Python passing unknown
# escape sequences such as "\[" through unchanged.
MAVEN_BUILD_FAILURE = r"\[INFO\] BUILD FAILURE"
FAILED_GOAL_MARKER = r"\[ERROR\] Failed to execute goal"
# A build counts as a failed Maven execution only if both markers are present.
fail_maven_execution = (logs['content'].str.contains(MAVEN_BUILD_FAILURE)) & (logs['content'].str.contains(FAILED_GOAL_MARKER))
# Number of rows that already carry a classification and are overwritten below.
# The mask was previously referenced under the undefined name `fail_maven_goals`.
print("Overriding existing, wrongly marked entries: " + str(len(logs[fail_maven_execution & (~logs['error'].isnull())])))
fail_maven_goals_logs = logs[fail_maven_execution]
# Capture the marker plus the token that directly follows it.
fail_goals = fail_maven_goals_logs['content'].str.extract(".*(" + FAILED_GOAL_MARKER + " .*?) ", expand=False)
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
logs.loc[fail_maven_execution, 'finished'] = "BUILD_FAILURE"
logs.loc[fail_maven_execution, 'error'] = fail_goals
print(str(len(fail_maven_goals_logs)) + "/" + str(len(logs)) + ' builds identified as fails goals.')
# Note: this frame was taken before the two assignments above, so it still
# shows the previous 'finished' and 'error' values.
fail_maven_goals_logs.head()

In [None]:
# Whatever is still unclassified after all rules gets an explicit label.
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
logs.loc[logs['error'].isnull(), 'error'] = "UNTREATED"
print("Untreated cases: " + str(len(logs[logs['error'] == "UNTREATED"])))

In [None]:
# Number of builds for each (final status, error classification) pair.
builds_per_group = logs.groupby(['finished', 'error'])[['path']].count()
build_breaker = builds_per_group.rename(columns={'path': 'count'})
# Keep a copy of the summary as a spreadsheet next to the notebook.
build_breaker.to_excel("build_breaker.xlsx")
build_breaker

# Determine the reasons behind the errors

## tbd

# Additional statistics about test executions

Parsing <tt>Test Data: run, failures, errors, skipped</tt>

In [None]:
# Parse the first "Tests run: ..., Failures: ..., Errors: ..., Skipped: ..."
# line that fills a whole line of the log tail. Every counter accepts one or
# more digits; the skipped counter previously accepted exactly one digit, so
# lines reporting ten or more skipped tests did not match at all.
test_statistics = logs['content'].str.extract(
    r"\nTests run: (?P<test_run>[0-9]+), Failures: (?P<test_failures>[0-9]+), "
    r"Errors: (?P<test_errors>[0-9]+), Skipped: (?P<test_skipped>[0-9]+)\n",
    expand=True,
)
# The named groups become the new columns (values stay strings or missing).
logs = logs.join(test_statistics)
logs.head()

Total time

In [None]:
# Capture the decimal number after "[INFO] Total time: ". The pattern requires
# a "." in the value, so time values written without one stay missing.
# NOTE(review): the unit after the number is discarded -- confirm all matched
# values use the same unit before comparing them.
total_time_text = logs['content'].str.extract(r"\n\[INFO\] Total time: ([0-9]*\.[0-9]*).*\n", expand=False)
# Convert the whole column at once instead of calling pd.to_numeric per element.
logs['totaltime'] = pd.to_numeric(total_time_text)
# Timestamp text after "[INFO] Finished at: ", kept as a plain string.
logs['finish'] = logs['content'].str.extract(r"\n\[INFO\] Finished at: (.*)\n", expand=False)
logs.head()

In [None]:
%matplotlib inline
logs[logs['finished'] == "SUCCESS"].groupby('jobname').max()['totaltime'].dropna().plot(kind='bar')