# Extract, transform and load 

Start: 
* MongoDB database full with all 2 million + issues as defined by Montgomery et al.
  * Each repository is their own collection 
* Have Jira endpoints and credentials

End:
* MongoDB database with 2 collections that hold all from different repositories:
    * issues
    * sprints
* this means that each document needs a new field with which repository it belongs to

Steps:
* Iterate over all the issues per repository
* If has sprint field:
  * Add repo name as field
  * Get sprint info from issue
  * Check if sprint already exists, if not, add it to sprint collection
  * Add the issue to the issue collection

In [1]:
# Auxiliary
import pymongo
import re
import json
import logging

logname = './logs/01-1.log'
logging.basicConfig(filename=logname, level=logging.DEBUG, force=True, format='%(asctime)s %(message)s: ', datefmt='%Y-%m-%d %H:%M:%S')

client = pymongo.MongoClient("mongodb://localhost:27017")

In [2]:
def get_customfield_from_collection(collection):
    col = db[collection]
    issues = col.find()
    for issue in issues:
        fields = issue['fields']
        for field in fields:
            field_value = fields[field]
            if type(field_value) is list and len(field_value)>0 and "com.atlassian.greenhopper.service.sprint.Sprint" in field_value[0] and "custom":
                return(field)

In [3]:
def get_and_store_sprint_from_issue(sprintstring, issueid, repo):
    d = {}
    id = re.search(r'id=(.*?),', sprintstring).group(1)
    state = re.search(r'state=(.*?),', sprintstring).group(1)
    result = client.JiraRepos['02_Sprints'].find_one({"id": repo + str(id)})
    # only create and insert if it does not already exist.
    if result is None:
        d = {}
        name = re.search(r'name=(.*?),', sprintstring).group(1)
        startDate = re.search(r'startDate=(.*?),', sprintstring).group(1)
        endDate = re.search(r'endDate=(.*?),', sprintstring).group(1)
        completeDate = re.search(r'completeDate=(.*?),', sprintstring).group(1)
        d['id'] = str(repo) + str(id)
        d['state'] = state
        d['name'] = name
        d['startDate'] = startDate
        d['endDate'] = endDate
        d['completeDate'] = completeDate
        # put in try block due to a few edge cases having no activatedDate
        try:
            activatedDate = re.search(r'activatedDate=(.*?),', sprintstring).group(1)
            d['activatedDate'] = activatedDate
        except:
            d['activatedDate'] = "not present"
        d['issues'] = [issueid]
        client.JiraRepos['02_Sprints'].insert_one(d)
    else:
        client.JiraRepos['02_Sprints'].update_one({'id': repo + str(id)}, {'$push': {'issues': issueid}})
    return repo + str(id)

In [4]:
# dictionary to store results
d = {}

# get all issues to iterate over
db = client['JiraRepos']
collections = db.list_collection_names()

# iterate over collections
for collection in collections:
    if "02_Sprints" == collection:
        continue
    logging.info("Starting " + collection)
    field_name = get_customfield_from_collection(collection)
    if field_name is not None:
        d[collection] = field_name

d

{'Jira': 'customfield_11930',
 'RedHat': 'customfield_12310940',
 'Qt': 'customfield_10302',
 'Spring': 'customfield_10480',
 'MongoDB': 'customfield_10557',
 'MariaDB': 'customfield_10400',
 'Hyperledger': 'customfield_10004',
 'Sonatype': 'customfield_11001',
 'Apache': 'customfield_12310921'}

In [7]:
db_src = client['JiraRepos']
db_dest = client['JiraRepos']

logging.info("all keys: " + str(d.keys))

col_dest = db_dest['01_Issues']
col_dest.drop() 
for collection in d.keys():
# for collection in ['MariaDB']:
    logging.info("Starting " + collection)
    col_src = db_src[collection]
    issues = col_src.find()
    for issue in issues:
        insert = False
        fields = issue['fields']
        fields_to_drop = []
        for field in fields:
            # gather all 'customfield_xxxxxx" fields
            if 'customfield' in field:
                fields_to_drop.append(field)
                
            # check if issue has field that holds sprint info
            if d[collection] in field and fields[field] is not None:
                insert = True
                # add repo field and prepend repo in id field
                issue['repository'] = collection
                # add field on whether data is private, indirect public or public
                issue['availability'] = 'indirect'
                issue['id'] = collection+str(issue['id'])
                # check for sprint data and store
                try:
                    origin_sprint_id = get_and_store_sprint_from_issue(issue['fields'][field][0], issue['id'], collection)
                    issue['originSprintId'] = origin_sprint_id
                except:
                    logging.debug(collection, issue['id'], issue['fields'][field])
            
        sprint_changes = 0
        total_changes = issue['changelog']['total']
        histories = issue['changelog']['histories']
        for history in histories:
            for item in history['items']:
                if 'sprint' in item['field'].lower():
                    sprint_changes += 1 
        issue["totalChanges"] = total_changes
        issue["sprintChanges"] = sprint_changes

        if insert:
            # delete all "customfield_xxxx"
            for f in fields_to_drop:
                del issue['fields'][f]
            del issue['changelog']
            col_dest.insert_one(issue)  


--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\Sotse\AppData\Local\Temp\ipykernel_24024\2905546004.py", line 32, in <cell line: 8>
    origin_sprint_id = get_and_store_sprint_from_issue(issue['fields'][field][0], issue['id'], collection)
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Python39\lib\logging\__init__.py", line 1079, in emit
    msg = self.format(record)
  File "C:\Python39\lib\logging\__init__.py", line 923, in format
    return fmt.format(record)
  File "C:\Python39\lib\logging\__init__.py", line 659, in format
    record.message = record.getMessage()
  File "C:\Python39\lib\logging\__init__.py", line 363, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "C:\Python39\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C

In [None]:
# Get all directly available sprint data