# Extract, transform and load 

Start: 
* MongoDB database full with all 2 million + issues as defined by Montgomery et al.
  * Each repository is their own collection 
* Have Jira endpoints and credentials

End:
* MongoDB database with 2 collections that hold all from different repositories:
    * issues
    * sprints
* this means that each document needs a new field with which repository it belongs to

Steps:
* Iterate over all the issues per repository
* If has sprint field:
  * Add repo name as field
  * Get sprint info from issue
  * Check if sprint already exists, if not, add it to sprint collection
  * Add the issue to the issue collection

In [1]:
# Auxiliary
import pymongo
import re
import json
import logging
import numpy as np
import pickle
from os.path import exists

logname = './logs/01-1.log'
logging.basicConfig(filename=logname, level=logging.DEBUG, force=True, format='%(asctime)s %(message)s: ', datefmt='%Y-%m-%d %H:%M:%S')

client = pymongo.MongoClient("mongodb://localhost:27017")

In [2]:
def get_customfield_from_collection(collection):
    col = db[collection]
    issues = col.find()
    for issue in issues:
        fields = issue['fields']
        for field in fields:
            field_value = fields[field]
            if type(field_value) is list and len(field_value)>0 and "com.atlassian.greenhopper.service.sprint.Sprint" in field_value[0] and "custom":
                return(field)

In [3]:
def get_and_store_sprint_from_issue(sprintstring, issueid, repo):
    d = {}
    id = re.search(r'id=(.*?),', sprintstring).group(1)
    state = re.search(r'state=(.*?),', sprintstring).group(1)
    result = client.JiraRepos['02_Sprints'].find_one({"id": repo + str(id)})
    # only create and insert if it does not already exist.
    if result is None:
        d = {}
        name = re.search(r'name=(.*?),', sprintstring).group(1)
        startDate = re.search(r'startDate=(.*?),', sprintstring).group(1)
        endDate = re.search(r'endDate=(.*?),', sprintstring).group(1)
        completeDate = re.search(r'completeDate=(.*?),', sprintstring).group(1)
        d['id'] = str(repo) + str(id)
        d['state'] = state
        d['name'] = name
        d['startDate'] = startDate
        d['endDate'] = endDate
        d['completeDate'] = completeDate
        # put in try block due to a few edge cases having no activatedDate
        try:
            activatedDate = re.search(r'activatedDate=(.*?),', sprintstring).group(1)
            d['activatedDate'] = activatedDate
        except:
            d['activatedDate'] = "not present"
        d['issues'] = [issueid]
        client.JiraRepos['02_Sprints'].insert_one(d)
    else:
        client.JiraRepos['02_Sprints'].update_one({'id': repo + str(id)}, {'$push': {'issues': issueid}})
    return repo + str(id)

In [4]:
if exists('repos.pickle'):
    with open('repos.pickle', 'rb') as handle:
        d = pickle.load(handle)
else:
# dictionary to store results
    d = {}

    # get all issues to iterate over
    db = client['JiraRepos']
    collections = db.list_collection_names()

    # iterate over collections
    for collection in collections:
        if "02_Sprints" == collection or '01_Issues' == collection:
            continue
        logging.info("Starting " + collection)
        field_name = get_customfield_from_collection(collection)
        if field_name is not None:
            d[collection] = field_name

    with open('repos.pickle', 'wb') as handle:
        pickle.dump(d, handle, protocol=pickle.HIGHEST_PROTOCOL)



In [5]:
d

{'Jira': 'customfield_11930',
 'RedHat': 'customfield_12310940',
 'Qt': 'customfield_10302',
 'Spring': 'customfield_10480',
 'MongoDB': 'customfield_10557',
 'MariaDB': 'customfield_10400',
 'Hyperledger': 'customfield_10004',
 'Sonatype': 'customfield_11001',
 'Apache': 'customfield_12310921'}

In [6]:
def transfer_fields(old_issue, repo, sourcetype):
    new_issue = {
        'id': repo + str(old_issue['id']),
        'repo': repo,
        'sourceType': sourcetype 
    }
    fields = ['created', 'updated', 'description', 'duedate', 'labels', 'issuelinks', 'subtasks', 'comments']
    nested_fields = {
        'priority': ['id', 'name'],
        'assignee': ['key'],
        'status': ['name', 'description'],
        'votes': ['votes'],
        'issuetype': ['name', 'subtask'],
        'project': ['id'],
        'watches': ['watchCount']
    }
    
    for f in fields:
        try:
            new_issue[f] = old_issue['fields'][f]
        except:
            new_issue[f] = np.nan
    
    for key in nested_fields.keys():
        for v in nested_fields[key]:
            try:
                new_issue[str(key) + '.' + str(v)] = issue['fields'][key][v]
            except:
                new_issue[str(key) + '.' + str(v)] = np.nan

    #misc multi-level nested
    try:
        new_issue['status.statusCategory.name'] = old_issue['fields']['status']['statusCategory']['name']
    except:
        new_issue['status.statusCategory.name'] = np.nan
    try:
        new_issue['project.projectCategory.id'] = old_issue['fields']['project']['projectCategory']['id']
    except:
        new_issue['project.projectCategory.id'] = np.nan

    return new_issue

In [7]:
db_src = client['JiraRepos']
db_dest = client['JiraRepos']

logging.info("all keys: " + str(d.keys))

col_dest = db_dest['01_Issues']
col_dest.drop() 
for collection in d.keys():
# for collection in ['MariaDB']:
    logging.info("Starting " + collection)
    col_src = db_src[collection]
    issues = col_src.find()
    for issue in issues:
        insert = False    #control whether issue is used or not
        
        fields = issue['fields']
        for field in fields:
            # iterate over all fields to find one that holds sprint data
            if d[collection] in field and fields[field] is not None:
                # check if issue has field that holds sprint info
                new_issue = transfer_fields(issue, collection, "indirect")
                try:
                    # check for sprint data and store if present
                    origin_sprint_id = get_and_store_sprint_from_issue(issue['fields'][field][0], new_issue['id'], collection)
                    new_issue['originSprintId'] = origin_sprint_id
                    insert = True    #control whether issue is used or not
                except:
                    logging.debug(collection, issue['id'], issue['fields'][field])

                #count sprint changes
                sprint_changes = 0
                histories = issue['changelog']['histories']
                for history in histories:
                    for item in history['items']:
                        if 'sprint' in item['field'].lower():
                            sprint_changes += 1 
                new_issue["issue.sprintHistories"] = sprint_changes
                new_issue["issue.totalHistories"] = issue['changelog']['total']

                #add all remaining already existing fields
                

                # # delete all "customfield_xxxx"
                # for f in fields_to_drop:
                #     del issue['fields'][f]
                # del issue['changelog']

                #insert the new issue in the new collection
                col_dest.insert_one(new_issue)  


--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\Sotse\AppData\Local\Temp\ipykernel_19324\2402954016.py", line 24, in <cell line: 8>
    origin_sprint_id = get_and_store_sprint_from_issue(issue['fields'][field][0], new_issue['id'], collection)
IndexError: list index out of range

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Python39\lib\logging\__init__.py", line 1079, in emit
    msg = self.format(record)
  File "C:\Python39\lib\logging\__init__.py", line 923, in format
    return fmt.format(record)
  File "C:\Python39\lib\logging\__init__.py", line 659, in format
    record.message = record.getMessage()
  File "C:\Python39\lib\logging\__init__.py", line 363, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "C:\Python39\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  Fil