# Extract, transform and load 

Start: 
* MongoDB database full with all 2 million + issues as defined by Montgomery et al.
* Each repository is their own collection 

End:
* MongoDB database with 2 collections that hold all from different repositories:
    * issues
    * sprints
* this means that each document needs a new field with which repository it belongs to

Steps:
* Iterate over all the issues per repository
* If has sprint field:
  * Add repo name as field
  * Get sprint info from issue
  * Check if sprint already exists, if not, add it to sprint collection
  * Add the issue to the issue collection

In [41]:
# Auxiliary
import pymongo
import re
import json

client = pymongo.MongoClient("mongodb://localhost:27017")

In [3]:
def get_customfield_from_collection(collection):
    col = db[collection]
    issues = col.find()
    for issue in issues:
        fields = issue['fields']
        for field in fields:
            field_value = fields[field]
            if type(field_value) is list and len(field_value)>0 and "com.atlassian.greenhopper.service.sprint.Sprint" in field_value[0] and "custom":
                return(field)

In [52]:
def get_and_store_sprint_from_issue(sprintstring, issueid):
    d = {}
    id = re.search(r'id=(.*?),', sprintstring).group(1)
    state = re.search(r'state=(.*?),', sprintstring).group(1)
    result = client.JiraRepoIssues.Issues.find_one({"id": id})
    if result is not None:
        print("does not exists")
    else:
        print("does exists")
        d = {}
        name = re.search(r'name=(.*?),', sprintstring).group(1)
        startDate = re.search(r'startDate=(.*?),', sprintstring).group(1)
        endDate = re.search(r'endDate=(.*?),', sprintstring).group(1)
        completeDate = re.search(r'completeDate=(.*?),', sprintstring).group(1)
        activatedDate = re.search(r'activatedDate=(.*?),', sprintstring).group(1)
        d['id'] = id
        d['state'] = state
        d['name'] = name
        d['startDate'] = startDate
        d['endDate'] = endDate
        d['completeDate'] = completeDate
        d['activatedDate'] = activatedDate
        d['issues'] = [issueid]
        client.JiraRepoIssues.Sprints.insert_one(d)
        print(d)


In [53]:
sprint1 = 'com.atlassian.greenhopper.service.sprint.Sprint@352cab98[id=1255,rapidViewId=471,state=CLOSED,name=NXRM Neo Sprint 25 Holding,startDate=<null>,endDate=<null>,completeDate=<null>,activatedDate=<null>,sequence=1510,goal=<null>,autoStartStop=false]'
sprint2 = 'com.atlassian.greenhopper.service.sprint.Sprint@352cab98[id=1255,rapidViewId=471,state=CLOSED,name=NXRM Neo Sprint 25 Holding,startDate=<null>,endDate=<null>,completeDate=<null>,activatedDate=<null>,sequence=1510,goal=<null>,autoStartStop=false]'
get_and_store_sprint_from_issue(sprint1, 1)
get_and_store_sprint_from_issue(sprint2, 1)


does exists
{'id': '1255', 'state': 'CLOSED', 'name': 'NXRM Neo Sprint 25 Holding', 'startDate': '<null>', 'endDate': '<null>', 'completeDate': '<null>', 'activatedDate': '<null>', 'issues': ['test', 'list'], '_id': ObjectId('627d21bf52f76f0bc389f41e')}
does exists
{'id': '1255', 'state': 'CLOSED', 'name': 'NXRM Neo Sprint 25 Holding', 'startDate': '<null>', 'endDate': '<null>', 'completeDate': '<null>', 'activatedDate': '<null>', 'issues': ['test', 'list'], '_id': ObjectId('627d21bf52f76f0bc389f41f')}


In [5]:
# dictionary to store results. manually added SecondLife due to differing name
d = {}

# get all issues to iterate over
db = client['JiraRepos']
collections = db.list_collection_names()

# iterate over collections
for collection in collections:
    print("Starting " + collection)
    field_name = get_customfield_from_collection(collection)
    if field_name is not None:
        d[collection] = field_name

d

Starting Mindville
Starting Jira
Starting IntelDAOS
Starting Mojang
Starting Sakai
Starting RedHat
Starting SecondLife
Starting Qt
Starting JiraEcosystem
Starting Spring
Starting MongoDB
Starting MariaDB
Starting Hyperledger
Starting Sonatype
Starting Apache
Starting JFrog


{'Jira': 'customfield_11930',
 'RedHat': 'customfield_12310940',
 'Qt': 'customfield_10302',
 'Spring': 'customfield_10480',
 'MongoDB': 'customfield_10557',
 'MariaDB': 'customfield_10400',
 'Hyperledger': 'customfield_10004',
 'Sonatype': 'customfield_11001',
 'Apache': 'customfield_12310921'}

In [17]:
db_src = client['JiraRepos']
db_dest = client['JiraReposIssues']

for collection in ['Sonatype']:
# for collection in d.keys():
    print("Starting " + collection)
    col_src = db_src[collection]
    col_dest = db_dest['Issues']
    issues = col_src.find()
    for issue in issues:
        fields = issue['fields']
        for field in fields:
            # check if issue has field that holds sprint info
            if d[collection] in field and fields[field] is not None:
                # add repo field and prepend repo in id field
                # issue['repository'] = collection
                # issue['id'] = collection+str(issue['id'])
                # col_dest.insert_one(issue)
                # check for sprint data and store
                print(issue['fields'][field], issue['id'])
                get_and_store_sprint_from_issue(issue['fields'][field][0], issue['id'])


Starting Sonatype
['com.atlassian.greenhopper.service.sprint.Sprint@352cab98[id=1255,rapidViewId=471,state=FUTURE,name=NXRM Neo Sprint 25 Holding,startDate=<null>,endDate=<null>,completeDate=<null>,activatedDate=<null>,sequence=1510,goal=<null>,autoStartStop=false]'] 835086
1255
['com.atlassian.greenhopper.service.sprint.Sprint@352cab98[id=1255,rapidViewId=471,state=FUTURE,name=NXRM Neo Sprint 25 Holding,startDate=<null>,endDate=<null>,completeDate=<null>,activatedDate=<null>,sequence=1510,goal=<null>,autoStartStop=false]'] 833245
1255
['com.atlassian.greenhopper.service.sprint.Sprint@938d708[id=1452,rapidViewId=540,state=CLOSED,name=NXRM Sentinels Sprint 23,startDate=2021-12-08T14:58:00.000Z,endDate=2021-12-22T14:58:00.000Z,completeDate=2021-12-22T14:43:13.569Z,activatedDate=2021-12-08T15:04:05.744Z,sequence=1460,goal=38 Projected\n10 In-Flight \nDevs 4.4\nGoal - HA - DES specific events,autoStartStop=false]', 'com.atlassian.greenhopper.service.sprint.Sprint@1b9fe03e[id=1460,rapidView