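This notebook uses PyDriller 1.9.2 (https://pypi.org/project/PyDriller/) to mine file history from GitHub repositories.  It also shells out to the `git when-merged` subcommand (not part of core git; it comes from the separately installed git-when-merged extension), which must be available on the PATH.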

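This notebook also calls a script in a local copy of the oceanobservatories/preload-database repo (https://github.com/oceanobservatories/preload-database).  The script is referred to here as 'affected.py', but the `logChange` function below invokes `preload-database/affectedQuery.py` under `localDir`, so make sure the script name in your local copy matches the call.  Note that this script also has dependencies on the ooi-data repo (https://github.com/oceanobservatories/ooi-data).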



In [None]:
from pydriller import RepositoryMining
import re
from subprocess import Popen, PIPE
import os
from os import path

import pandas as pd
import numpy as np
import datetime
import math
import csv
import glob

The following parameters will need to be configured for MIO-specific definitions.  (Also note, some regEx strings in the code depend on a directory structure similar to '/Users/rsn/asset-management/' and a filename structure similar to "ATAPL-12345-00001__20190101.csv" and may need to be modified to match other local directory structures.)

localRepo: local directory for asset management gitHub repo

localDir: local directory for other repos

sensorList: list of MIO-specific sensors with cal files on gitHub

filePrefix: prefix of cal files

In [None]:
### Set up file pathways

# local clone of the asset-management gitHub repo (trailing slash is required)
localRepo = '/Users/rsn/asset-management/'
# parent directory that holds the other local repos (trailing slash is required)
localDir = '/Users/rsn/'

# MIO-specific sensor classes that have cal files on gitHub
sensorList = [
    'CTDPFA', 'CTDBPO', 'CTDBPN', 'CTDPFB', 'DOFSTA', 'FLCDRA', 'FLNTUA', 'SPKIRA',
    'NUTNRA', 'OPTAAC', 'OPTAAD', 'PARADA', 'DOSTAD', 'FLORDD', 'PCO2WA', 'PCO2WB',
    'PHSENA', 'PHSEND', 'THSPHA', 'TMPSFA', 'TRHPHA', 'VEL3DA', 'ZPLSCB',
]
# only cal files whose name starts with this prefix are examined
filePrefix = 'AT'

In [None]:
def commitMergeDate(commitHash):
    """Return the merge date of a single commit given the commit hash.

    Runs ``git when-merged -c <commitHash>`` inside the asset-management repo
    (``localRepo``) and, when that prints a 40-character hash, looks that commit
    up with PyDriller.

    Returns:
        - the author date of the reported merge commit as a ``%Y%m%dT%H%M%S`` string,
        - the string ``'directCommit'`` when git reports that the commit is
          directly on the branch,
        - the integer ``-99999`` when no merge commit could be determined.
    """
    mergeDate = -99999

    git_command = ['git', 'when-merged', '-c', commitHash]
    repository = path.dirname(localRepo)

    git_query = Popen(git_command, cwd=repository, stdout=PIPE, stderr=PIPE)
    (git_status, error) = git_query.communicate()
    if git_status:
        # Guard the match: output without a 40-character hash used to raise
        # AttributeError on ``None.group``; now it falls through to -99999.
        hashMatch = re.search(r"([a-zA-Z0-9]{40})", git_status.decode('utf-8', errors='ignore'))
        if hashMatch:
            for commit in RepositoryMining(localRepo, single=hashMatch.group(1)).traverse_commits():
                mergeDate = commit.author_date.strftime("%Y%m%dT%H%M%S")
    elif 'Commit is directly on this branch' in error.decode('utf-8', errors='ignore'):
        mergeDate = 'directCommit'

    return mergeDate

In [None]:
def _recordModification(gitHubHistory, path_to_repo, commit, mod):
    """Store one file modification in ``gitHubHistory`` under its merge date."""
    authorDate = commit.author_date.strftime("%Y%m%dT%H%M%S")

    # commitMergeDate returns either a string or the integer -99999, so it is
    # normalised to text before any substring test.
    mergeDate = str(commitMergeDate(commit.hash))
    if 'directCommit' in mergeDate:
        mergeDate = authorDate
    elif '-99999' in mergeDate:
        mergeDate = 'unknown'

    entry = {'commit': commit.hash, 'modType': str(mod.change_type), 'fileName': mod.new_path, 'oldFileName': mod.old_path}
    if 'MODIFY' in str(mod.change_type):
        # ``or []`` tolerates a classifier that returns None instead of a set
        modCategory = list(categorizeCalModification(path_to_repo, commit.hash, mod.new_path) or [])
        if not modCategory:
            modCategory = ['notClassified']
        entry['modCategory'] = modCategory

    gitHubHistory.setdefault(mergeDate, {})[authorDate] = entry


def gitHubMine(path_to_repo, path_to_fileName, fileName):
    """Collect the gitHub modification history of one file with PyDriller.

    Args:
        path_to_repo: local path of the git repository.
        path_to_fileName: directory of the file relative to the repository
            (concatenated directly with ``fileName``, so it must end in '/').
        fileName: base name of the file.

    Returns:
        dict keyed by merge date (``%Y%m%dT%H%M%S`` string, or ``'unknown'``);
        each value is a dict keyed by the commit's author date holding
        ``{'commit', 'modType', 'fileName', 'oldFileName'}`` plus
        ``'modCategory'`` for MODIFY changes.
    """
    mods = ['ADD', 'MODIFY', 'DELETE', 'COPY', 'RENAME']
    filepath = path_to_fileName + fileName

    rename = []
    gitHubHistory = {}
    commitHash = set()

    def isTrackedChange(mod):
        return any(modType in str(mod.change_type) for modType in mods)

    # First pass: modifications recorded under the current file name.
    for commit in RepositoryMining(path_to_repo, filepath=filepath).traverse_commits():
        for mod in commit.modifications:
            if fileName in mod.filename and isTrackedChange(mod):
                commitHash.add(commit.hash)
                _recordModification(gitHubHistory, path_to_repo, commit, mod)
                if 'RENAME' in str(mod.change_type):
                    # remember the previous base name so its history is picked up below
                    oldNameMatch = re.search(r"calibration\/.*\/(.*)", mod.old_path)
                    if oldNameMatch and oldNameMatch.group(1):
                        rename.append(oldNameMatch.group(1))

    # Second pass: modifications recorded under any earlier name of the file,
    # skipping commits that were already handled.
    for renamedFile in rename:
        for commit in RepositoryMining(path_to_repo, filepath=filepath).traverse_commits():
            for mod in commit.modifications:
                if renamedFile in mod.filename and isTrackedChange(mod) and commit.hash not in commitHash:
                    commitHash.add(commit.hash)
                    _recordModification(gitHubHistory, path_to_repo, commit, mod)

    return gitHubHistory

In [None]:
def round_sigfigs(num, sig_figs):
    """Round to specified number of sigfigs.
    from: http://code.activestate.com/recipes/578114-round-number-to-specified-number-of-significant-di/

    Args:
        num: number to round.
        sig_figs: number of significant figures to keep (integer).

    Returns:
        ``num`` rounded to ``sig_figs`` significant figures; 0 when ``num`` is 0.

    >>> round_sigfigs(0, sig_figs=4)
    0
    >>> int(round_sigfigs(12345, sig_figs=2))
    12000
    >>> int(round_sigfigs(-12345, sig_figs=2))
    -12000
    >>> int(round_sigfigs(1, sig_figs=2))
    1
    >>> '{0:.3}'.format(round_sigfigs(3.1415, sig_figs=2))
    '3.1'
    >>> '{0:.3}'.format(round_sigfigs(-3.1415, sig_figs=2))
    '-3.1'
    >>> '{0:.5}'.format(round_sigfigs(0.00098765, sig_figs=2))
    '0.00099'
    >>> '{0:.6}'.format(round_sigfigs(0.00098765, sig_figs=3))
    '0.000988'
    """
    # The docstring has to be the first statement of the function; with an
    # import in front of it the text was not attached as __doc__.
    if num != 0:
        return round(num, -int(math.floor(math.log10(abs(num))) - (sig_figs - 1)))
    else:
        return 0  # Can't take the log of 0

In [None]:
def _gitStdout(git_command, repository, **textOptions):
    """Run ``git_command`` inside ``repository`` and return everything it wrote to stdout as text.

    ``textOptions`` are passed straight to ``Popen`` (e.g. ``encoding`` or
    ``universal_newlines``).  ``communicate()`` drains both pipes and waits for
    the child, so a full stderr pipe cannot block git and no process is left
    unreaped.
    """
    git_query = Popen(git_command, cwd=repository, stdout=PIPE, stderr=PIPE, **textOptions)
    (output, _) = git_query.communicate()
    return output


def _parseCalSheet(sheetText):
    """Parse the text of one sheet with pandas; return the DataFrame, or None when it cannot be parsed."""
    from io import StringIO

    try:
        # preferred: force the 'value' column to float
        return pd.read_csv(StringIO(sheetText), sep=",", converters={'value': np.float64}, float_precision='round_trip')
    except ValueError:
        try:
            # fallback: leave 'value' as read (it holds at least one non-float entry)
            return pd.read_csv(StringIO(sheetText), sep=",", float_precision='round_trip')
        except ValueError:
            return None


def _coerceCalValue(value):
    """Turn a string cell into a float, or into a list of strings when it is comma separated."""
    if not isinstance(value, str):
        return value
    try:
        return float(value)
    except ValueError:
        if ',' in value:
            return value.strip('[]').split(',')
        return value


def _classifyNumericChange(previous, current):
    """Label the difference between two unequal numbers."""
    if round_sigfigs(current, 7) == previous or round_sigfigs(previous, 7) == current:
        return 'valueResolutionChanged_7sigfigs'
    if round_sigfigs(current, 3) == previous or round_sigfigs(previous, 3) == current:
        return 'valueResolutionChanged_3sigfigs'
    return 'valuesModified'


def categorizeCalModification(repository,commit,fileName):
    """Classify how ``fileName`` changed between ``commit`` and its first parent.

    ### types of content modifications:
    ### data-affecting: parameters added, parameters deleted, value changes, value truncations type 2 (3 sig figs)
    ### non-data-affecting: serial number changes, value truncations type 1 (7 sig figs), notes changed, line endings changed

    Args:
        repository: path of the local git repository (used as working directory for git).
        commit: hash of the commit that modified the file.
        fileName: path of the file relative to ``repository``.

    Returns:
        set of label strings; empty when the parent commit cannot be determined
        or nothing was classified.
    """
    modification = []

    # first line of ``git show <commit>^1`` is the quoted hash of the parent
    parentOutput = _gitStdout(['git', 'show','--pretty=format:"%H"', commit + '^1'], repository, encoding='utf-8', errors='ignore')
    commitEntry = re.search(r'"([a-zA-Z0-9]{0,40})"', parentOutput.split('\n', 1)[0])
    if not commitEntry:
        # always hand back a set: callers wrap the result in list()
        return set()
    previousCommit = commitEntry.group(1)

    file1_read = _gitStdout(['git','cat-file','-p',previousCommit + ':./' + fileName], repository, universal_newlines=True)
    file2_read = _gitStdout(['git','cat-file','-p',commit + ':./' + fileName], repository, universal_newlines=True)

    df_file1 = _parseCalSheet(file1_read)
    if df_file1 is None:
        modification.append('parserError:PreviousValueFormatInvalid')
    df_file2 = _parseCalSheet(file2_read)
    if df_file2 is None:
        modification.append('parserError:CurrentValueFormatInvalid')
    if df_file1 is None or df_file2 is None:
        return set(modification)

    if fileName.endswith('.csv'):

        df_file1 = df_file1.set_index('name')
        df_file2 = df_file2.set_index('name')

        paramsDeleted = list(df_file1.index[~df_file1.index.isin(df_file2.index)])
        paramsAdded = list(df_file2.index[~df_file2.index.isin(df_file1.index)])
        if paramsDeleted:
            modification.append('parametersDeleted')
        if paramsAdded:
            modification.append('parametersAdded')

        for calCoeff, row in df_file1.iterrows():
            # only parameters present in both versions can be compared
            if calCoeff in paramsDeleted:
                continue

            cal1 = _coerceCalValue(row['value'])
            cal2 = _coerceCalValue(df_file2.loc[calCoeff,'value'])

            if isinstance(cal1, float) and isinstance(cal2, float):
                if cal2 != cal1:
                    modification.append(_classifyNumericChange(cal1, cal2))
            elif isinstance(cal1, str) and isinstance(cal2, str):
                if cal2 != cal1:
                    modification.append('stringValuesModified')
            elif isinstance(cal1, list) and isinstance(cal2, list):
                if cal2 != cal1:
                    if len(cal1) != len(cal2):
                        # zip() below stops at the shorter list, so a change in
                        # the number of elements has to be flagged explicitly
                        modification.append('valuesModified')
                    for x,y in zip(cal1,cal2):
                        if x != y:
                            try:
                                modification.append(_classifyNumericChange(float(x), float(y)))
                            except ValueError:
                                modification.append('comparisonError:ValueFormatInvalid')
            elif type(cal1) != (type(cal2)):
                modification.append('comparisonError:ValueFormatIncompatible')

            sn1 = row['serial']
            sn2 = df_file2.loc[calCoeff,'serial']
            if isinstance(sn1, str) and isinstance(sn2, str):
                if sn1 != sn2:
                    modification.append('serialNumberChanged')
            else:
                modification.append('comparisonError:incompatibleSerialNumberFormats')

            notes1 = row['notes']
            notes2 = df_file2.loc[calCoeff,'notes']
            if isinstance(notes1, str) and isinstance(notes2, str):
                if notes1 != notes2:
                    modification.append('notesLineEndingsChanged')
            else:
                modification.append('comparisonError:incompatibleNotesFormats')

    elif fileName.endswith('.ext'):
        # identical parsed content means only formatting / line endings differ
        if df_file1.equals(df_file2):
            modification.append('formattingLineEndingsChanged')
        else:
            modification.append('valuesModified')

    return set(modification)

In [None]:
def deploymentLookup(RefDes, lookupDate):
    """Look up deployment number and asset ID for a reference designator and date.

    Scans every deployment stored for ``RefDes`` in ``RefDes_dict_byRefDes``.
    A deployment matches when it started before ``lookupDate`` and either has
    no end date ('nan') or ended after ``lookupDate``.  When several
    deployments match, the last one in the list wins.

    Returns:
        tuple ``(assetID, deploymentNumber)``; both are the string 'NaN' when
        no deployment matches.

    TODO: configure to handle over-lapping deployments and return all relevant
    assetIDs and deploymentNumbers
    """
    assetID = 'NaN'
    deploymentNumber = 'NaN'

    for deploy in RefDes_dict_byRefDes[RefDes]:
        openEnded = 'nan' in str(deploy['deployEnd'])
        if openEnded:
            matches = deploy['deployDate'] < lookupDate
        else:
            deployEnd = datetime.datetime.strptime(str(deploy['deployEnd']), '%Y-%m-%dT%H:%M:%S')
            matches = deploy['deployDate'] < lookupDate and deployEnd > lookupDate

        if matches:
            assetID = deploy['AssetID']
            deploymentNumber = deploy['deployment']

    return (assetID, deploymentNumber)

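This is the start of the main code block.  You may need to alter the regEx that is passed to `re.search` when `sensorCals` is built (it splits each file path into its directory parts, asset ID and calibration date) to match your local directory structure.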

In [None]:
### read directory listing of github directory LOCALLY

githubFileList = []

for sensor in sensorList:
    sensorDir = localRepo + 'calibration/' + sensor
    fileList = os.listdir(sensorDir)
    # keep only the files that carry the configured cal-file prefix
    githubFileList.extend(sensorDir + '/' + csvFile for csvFile in fileList if csvFile.startswith(filePrefix))

### group the cal files by asset ID: [calibration date, file name, path below the repo root]
sensorCals = {}
for githubFile in githubFileList:
    fileBits = re.search(r"/.*/.*/.*(/.*/.*/((.*)__(.*).csv))",githubFile)
    if not fileBits:
        continue
    calEntry = sensorCals.setdefault(fileBits.group(3), {'calFile': []})
    calDate = datetime.datetime.strptime(fileBits.group(4), '%Y%m%d')
    calEntry['calFile'].append([calDate, fileBits.group(2), fileBits.group(1)])


In [None]:
### load in github vocab list and convert to Reference Designator dictionary
vocabList = pd.read_csv(localRepo + '/vocab/vocab.csv')
vocab_dict = vocabList.set_index('Reference_Designator').T.to_dict('series')

### load in github bulk asset records and create Asset ID dictionary for manufacturer's serial number
### (kept in its own name: reusing 'sensorList' here replaced the configured list of
### sensor classes with a DataFrame and broke any re-run of the directory-listing cell)
assetRecords = pd.read_csv(localRepo + '/bulk/sensor_bulk_load-AssetRecord.csv')
assetID_dict = pd.Series(
    assetRecords["Manufacturer's Serial No./Other Identifier"].values,
    index=assetRecords['ASSET_UID'],
).to_dict()

### load in github deployment sheets, sort by Asset ID, load deployment info in Reference designator dictionary
allFiles = glob.glob(localRepo + "deployment/*.csv")
df_deploy = pd.concat([pd.read_csv(f, skip_blank_lines = True, comment='#') for f in allFiles], ignore_index = True, sort=True)
df_deploy_sort = df_deploy.sort_values(by=["sensor.uid","startDateTime"],ascending=False)

### Create dictionary indexed by Asset ID
RefDes_dict = {}
for i in df_deploy_sort['sensor.uid'].unique():
    RefDes_dict[i] = [
        {
            'deployDate': datetime.datetime.strptime(df_deploy_sort['startDateTime'][j], '%Y-%m-%dT%H:%M:%S'),
            'deployEnd': df_deploy_sort['stopDateTime'][j],
            'RefDes': df_deploy_sort['Reference Designator'][j],
            'deployment': df_deploy_sort['deploymentNumber'][j],
            'firstRawFile': 'none',
            'rawSN': '-99999',
        }
        for j in df_deploy_sort[df_deploy_sort['sensor.uid']==i].index
    ]

### Create dictionary indexed by Reference Designator
RefDes_dict_byRefDes = {}
for i in df_deploy_sort['Reference Designator'].unique():
    RefDes_dict_byRefDes[i] = [
        {
            'deployDate': datetime.datetime.strptime(df_deploy_sort['startDateTime'][j], '%Y-%m-%dT%H:%M:%S'),
            'deployEnd': df_deploy_sort['stopDateTime'][j],
            'AssetID': df_deploy_sort['sensor.uid'][j],
            'deployment': df_deploy_sort['deploymentNumber'][j],
        }
        for j in df_deploy_sort[df_deploy_sort['Reference Designator']==i].index
    ]


In [None]:
### Determine which cal files are associated with active deployments

gitHubRootURL = 'https://github.com/ooi-integration/asset-management/blob/master'

for key, values in RefDes_dict.items():
    if key not in sensorCals:
        continue
    calHistory = sensorCals[key]
    for deployment in values:
        deployStart = deployment['deployDate']
        # candidate cal files are those dated before the deployment started
        calDateList = [cal for cal in calHistory['calFile'] if cal[0] < deployStart]
        if not calDateList:
            ### Print out sensor deployments with no valid cal file
            print('no valid cal file?')
            print(calHistory)
            print(key + ' ' + str(deployment['deployDate']) + ' ' + str(deployment['deployEnd']) + ' ' + deployment['RefDes'])
            continue
        # of the candidates, use the one closest in time to the deployment start
        deploymentCalFile = min(calDateList, key=lambda cal: abs(cal[0] - deployStart))
        deployment['calFile'] = deploymentCalFile[1]
        deployment['calFileURL'] = gitHubRootURL + deploymentCalFile[2]

You may need to modify the regEx expressions `regEx_calFile` (built from `localDir`) and `regEx_instrument` (built from `filePrefix`) in the next cell to match your local directory and filename structures.

In [None]:
### Retrieve modification history for each cal file in the githubFileList

fileModHistory = {}

# both patterns depend only on the configuration, so they are built once
regEx_calFile = r"(" + localDir + r".*/)(.*/.*/)(.*\..*)"
regEx_instrument = r"(" + filePrefix + r"\S+-\d{5}-\d{5})"

for calFile in githubFileList:
    print(calFile)
    fileBits = re.search(regEx_calFile, calFile)
    if not fileBits:
        continue
    # group 1: repo path, group 2: directories below the repo, group 3: file name
    instrument = re.search(regEx_instrument, fileBits.group(3))
    if not instrument:
        continue
    instrumentHistory = fileModHistory.setdefault(instrument.group(1), {})
    instrumentHistory[fileBits.group(3)] = gitHubMine(fileBits.group(1), fileBits.group(2), fileBits.group(3))
    

In [None]:
### Classify modifications and create annotation text

# Each template is read from a text file in the working directory and collapsed
# onto one line; the text is later filled in with str.format().
with open('annotationMissing.txt', 'r') as templateFile:
    annotationMissing = templateFile.read().replace('\n','')

with open('annotationModify.txt', 'r') as templateFile:
    annotationModify = templateFile.read().replace('\n','')

with open('annotationTruncated.txt', 'r') as templateFile:
    annotationTruncated = templateFile.read().replace('\n','')

with open('annotationDownstream.txt', 'r') as templateFile:
    annotationDownstream = templateFile.read().replace('\n','')

# one dictionary per logged data-affecting change (filled by logChange)
changeHistory=[]

def annotate(refDes, deployment, changeDate, startDate, endDate, URL, changeType):
    """Build the annotation text for one change, or return 'NaN' when no template applies.

    Template selection by ``changeType``:
        - contains 'coefficient' and 'resolution'                  -> ``annotationTruncated``
        - contains 'coefficient' and 'added'/'deleted'/'modified'  -> ``annotationModify``
        - contains 'file' or 'File' (and not 'coefficient')        -> ``annotationMissing``
        - anything else                                            -> 'NaN'

    The chosen template is filled positionally with
    ``(refDes, deployment, changeDate, deployment, startDate, endDate, URL)``.
    """
    annotationString = None

    if 'coefficient' in changeType:
        if 'resolution' in changeType:
            annotationString = annotationTruncated
        # elif, not if: 'calibration coefficient resolution was added' also
        # contains 'added' and must keep the truncation template
        elif any(word in changeType for word in ('added', 'deleted', 'modified')):
            annotationString = annotationModify
    elif 'file' in changeType or 'File' in changeType:
        annotationString = annotationMissing

    if annotationString is None:
        return 'NaN'

    return annotationString.format(refDes, deployment, changeDate, deployment, startDate, endDate, URL)

def annotateDownstream(downstreamSensor, upstreamSensor, changeDate, startDate, endDate, URL):
    """Fill the ``annotationDownstream`` template for a sensor affected by an upstream change."""
    # positional fields, in the order the template consumes them
    templateFields = (
        downstreamSensor,
        upstreamSensor,
        downstreamSensor,
        changeDate,
        downstreamSensor,
        startDate,
        endDate,
        upstreamSensor,
        URL,
    )
    return annotationDownstream.format(*templateFields)

def logChange(changeType):
    """Append a record for the current change to ``changeHistory``.

    Reads the module-level loop variables ``deployment``, ``changeDate`` and
    ``dateRangeEnd`` set by the classification loop, plus ``vocab_dict`` and
    ``assetID_dict``.  For reference designators containing 'CTD' it also runs
    ``preload-database/affectedQuery.py`` and appends one record for every
    returned sensor that has a deployment covering ``changeDate``.
    """
    import subprocess

    upstreamRefDes = deployment['RefDes']
    vocabEntry = vocab_dict[upstreamRefDes]
    Array = vocabEntry['TOC_L1']
    Platform = vocabEntry['TOC_L2']
    Node = vocabEntry['TOC_L3']
    Instrument = vocabEntry['Instrument']
    # the asset ID is the part of the cal file name in front of '__'
    assetID = deployment['calFile'].split('__')[0]
    sensorSN = assetID_dict[assetID]
    annotation = annotate(upstreamRefDes, deployment['deployment'], changeDate, deployment['deployDate'], dateRangeEnd, deployment['calFileURL'], changeType)
    changeHistory.append({
        'Array': Array,
        'Platform': Platform,
        'Node': Node,
        'Instrument': Instrument,
        'RefDes': upstreamRefDes,
        'Asset ID': assetID,
        'Serial Number': sensorSN,
        'deployment': deployment['deployment'],
        'gitHub changeDate': changeDate,
        'OOI changeDate': '?',
        'file': deployment['calFile'],
        'URL': deployment['calFileURL'],
        'changeType': changeType,
        'dateRangeStart': deployment['deployDate'],
        'dateRangeEnd': dateRangeEnd,
        'annotation': annotation,
    })

    # If CTD, check for downstream sensors to log
    if 'CTD' not in upstreamRefDes:
        return

    refDesParts = upstreamRefDes.split('-')
    platform = refDesParts[0]
    node = refDesParts[1]

    affectedList = []
    # NOTE(review): 193/194/195 are passed as the first script argument; they
    # look like preload-database parameter numbers - confirm.
    for pdNumber in [193, 194, 195]:
        programCall = localDir + 'preload-database/affectedQuery.py ' + str(pdNumber) + ' ' + platform + ' ' + node
        output = subprocess.check_output(programCall, shell=True)
        # first whitespace-delimited token of every output line
        affectedList.extend(line.split(' ')[0] for line in output.decode('utf-8').split('\n'))

    affectedList_filtered = [x for x in set(affectedList) if len(x) > 5 and 'CTD' not in x]
    print(affectedList_filtered)

    for downstreamRefDes in affectedList_filtered:
        downstreamVocab = vocab_dict[downstreamRefDes]
        Array = downstreamVocab['TOC_L1']
        Platform = downstreamVocab['TOC_L2']
        Node = downstreamVocab['TOC_L3']
        Instrument = downstreamVocab['Instrument']
        (assetID, deploymentNumber) = deploymentLookup(downstreamRefDes, changeDate)
        if 'NaN' in str(assetID) or 'NaN' in str(deploymentNumber):
            continue
        sensorSN = assetID_dict[assetID]
        annotation = annotateDownstream(downstreamRefDes, upstreamRefDes, changeDate, deployment['deployDate'], dateRangeEnd, deployment['calFileURL'])
        changeHistory.append({
            'Array': Array,
            'Platform': Platform,
            'Node': Node,
            'Instrument': Instrument,
            'RefDes': downstreamRefDes,
            'Asset ID': assetID,
            'Serial Number': sensorSN,
            'deployment': deploymentNumber,
            'gitHub changeDate': changeDate,
            'OOI changeDate': '?',
            'file': deployment['calFile'],
            'URL': deployment['calFileURL'],
            'changeType': changeType,
            'dateRangeStart': deployment['deployDate'],
            'dateRangeEnd': dateRangeEnd,
            'annotation': annotation,
        })
    
# Renamed cal files are split into asset ID and calibration date; the pattern
# follows the configured prefix instead of a hard-coded 'AT'.
renamePattern = r"calibration/.*/(" + re.escape(filePrefix) + r".*)__(\d{8})\.csv"

# NOTE: logChange() reads the loop variables 'deployment', 'changeDate' and
# 'dateRangeEnd' from module scope, so these names must not be changed.
for key, values in RefDes_dict.items():
    print(key)
    for deployment in values:
        if 'calFile' not in deployment:
            continue
        print(deployment['calFile'])

        # Skip (instead of raising KeyError) when no history was mined for this file.
        calFileHistory = fileModHistory.get(key, {}).get(deployment['calFile'])
        if calFileHistory is None:
            print('no modification history mined for ' + str(deployment['calFile']))
            continue
        print(calFileHistory)

        for merge in calFileHistory:
            if 'unknown' in str(merge):
                continue
            changeDate = datetime.datetime.strptime(merge, '%Y%m%dT%H%M%S')
            if 'nan' in str(deployment['deployEnd']):
                # deployment has no end date yet: treat it as running until now
                deployEnd = datetime.datetime.now()
            else:
                deployEnd = datetime.datetime.strptime(deployment['deployEnd'], '%Y-%m-%dT%H:%M:%S')

            # only changes that occur after a deployment has taken place have the potential to affect data
            if changeDate <= deployment['deployDate']:
                continue

            # min() also covers changeDate == deployEnd, which previously left
            # dateRangeEnd unset or carrying the value of an earlier iteration
            dateRangeEnd = min(changeDate, deployEnd)

            addList = []
            deleteList = []
            renameList = []
            modifyList = []

            for modification, details in calFileHistory[merge].items():
                modType = details['modType']
                if 'ADD' in modType:
                    addList.append(modification)
                if 'DELETE' in modType:
                    deleteList.append(modification)
                if 'RENAME' in modType:
                    # [author date, new path, old path]
                    renameList.append([modification, details['fileName'], details['oldFileName']])
                if 'MODIFY' in modType:
                    modifyList.append([modification, details['modCategory']])

            if len(addList) > 0:
                print('file was added that was missing:')
                print(addList)
                logChange('Missing file added')
                # file was "missing" and affects data = YES
            if len(deleteList) > 0:
                print('file deleted:')
                print(deleteList)
                logChange('File deleted')
                # file was "missing" for a period of time and affects data = YES
            if len(renameList) > 0:
                print('file renamed:')
                # the rename with the latest author date is the final one
                finalRename = max(renameList, key=lambda x: x[0])
                # index 1 holds the NEW path and index 2 the OLD path
                newFile = finalRename[1]
                oldFile = finalRename[2]
                print(oldFile + ' renamed to: ' + newFile)
                print(deployment['deployDate'])
                # break fileName into asset ID and date
                oldFileBits = re.search(renamePattern, oldFile)
                newFileBits = re.search(renamePattern, newFile)
                if oldFileBits and newFileBits:
                    # if AT# changed file was missing for new AT# and affects data = YES
                    if oldFileBits.group(1) != newFileBits.group(1):
                        print('file name changed to new asset ID')
                        logChange('File renamed with new Asset ID')
                    # if filedate changed and new date > deploymentStart date affects data = YES
                    if oldFileBits.group(2) != newFileBits.group(2):
                        print('file date name changed')
                        if datetime.datetime.strptime(oldFileBits.group(2),'%Y%m%d') > deployment['deployDate'] or \
                        datetime.datetime.strptime(newFileBits.group(2),'%Y%m%d') > deployment['deployDate']:
                            print('file date changed')
                            logChange('File renamed with new calibration date')
                else:
                    logChange('File renamed, does not match format')
            if len(modifyList) > 0:
                print('file modified')
                for entry in modifyList:
                    # NOTE(review): entry[1] is a list of labels such as
                    # 'comparisonError:ValueFormatInvalid', so the membership tests for
                    # the bare strings 'comparisonError' and 'parserError' never match.
                    # Left unchanged because prefix matching would also flag the
                    # serial/notes comparison errors - confirm the intended behaviour.
                    if 'valuesModified' in entry[1] or 'stringValuesModified' in entry[1] or 'comparisonError' in entry[1] or 'parserError' in entry[1]:
                        logChange('calibration coefficients were modified')
                    elif 'parametersDeleted' in entry[1]:
                        logChange('calibration coefficients were deleted')
                    elif 'parametersAdded' in entry[1]:
                        logChange('calibration coefficients were added')
                    elif 'valueResolutionChanged_3sigfigs' in entry[1]:
                        logChange('calibration coefficient resolution was added')
                

In [None]:
### Output all data-affecting changes to csv file

# Fallback column order, matching the keys of the records built in logChange.
# It is only used when nothing was logged, so that a header-only file is
# written instead of failing with IndexError on changeHistory[0].
changeColumns = ['Array', 'Platform', 'Node', 'Instrument', 'RefDes', 'Asset ID', 'Serial Number',
                 'deployment', 'gitHub changeDate', 'OOI changeDate', 'file', 'URL', 'changeType',
                 'dateRangeStart', 'dateRangeEnd', 'annotation']
fieldnames = list(changeHistory[0].keys()) if changeHistory else changeColumns

with open('gitHubCalibrationSheets_dataAffectingChanges_v3.csv', 'w', encoding='utf8', newline='') as output_file:
    fc = csv.DictWriter(output_file, fieldnames=fieldnames)
    fc.writeheader()
    fc.writerows(changeHistory)

In [None]:
### Output all modifications to csv

# one line per recorded modification: asset ID, merge date, modification details
with open('gitHubCalibrationSheets_allChanges_v3.csv','w') as f:
    for key, calFiles in fileModHistory.items():
        for calFile, merges in calFiles.items():
            for merge, modifications in merges.items():
                for details in modifications.values():
                    f.write("%s, %s, %s\n" % (key, merge, details))
        