# feature engineering
Start:
* One database with two collections that hold all the sprints and all the issues as raw data
* The only addition to this raw data is a field recording which repository each document belongs to

End:

* Sprints:
  * Make aggregation of each sprint's metadata:
    * sprint planned duration in days
    * sprint overdue boolean (completedDate > endDate)
    * Number of issues inside sprint
    * Median number of issues with changed sprint
* Issues:
  * predictor: has sprint field in change log (boolean)
  * Number of sprint fields in change log (int)
  * Add originSprintId field (uuid)
  * Add above sprint metadata fields to every issue
  * All unused (custom) fields should be removed

In [5]:
# Auxiliary
import pymongo
import re
import json
import numpy as np
from datetime import datetime
from dateutil import parser
import logging as log

# Notebook-wide setup: file logging plus handles to the two MongoDB
# collections that the cells below read and update.
import os

name = './logs/02.log'
# basicConfig(filename=...) opens the log file right away and raises
# FileNotFoundError when the target directory is missing, so create it first.
os.makedirs(os.path.dirname(name), exist_ok=True)
# force=True drops handlers left on the root logger by an earlier run of this
# cell, so re-running the notebook does not duplicate log output.
# NOTE(review): the format puts ': ' after the message rather than after the
# timestamp - confirm that this is intended.
log.basicConfig(filename=name, level=log.DEBUG, force=True, format='%(asctime)s %(message)s: ', datefmt='%Y-%m-%d %H:%M:%S')

# MongoDB instance on localhost and the database used by this notebook.
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client['JiraRepos']

# Issue and sprint collections used by the following cells.
coll_issues = db['01_Issues']
coll_sprints = db['02_Sprints']

In [11]:
# Sprints
# For every sprint, compute the planned duration, the number of issues and the
# summed change-log counters of its issues, then store them on the sprint.


def _planned_duration_days(sprint):
    """Return the planned sprint length in whole days, or NaN when unknown.

    NaN is returned when 'startDate' / 'endDate' is missing, is not a
    parseable date string, or the two dates cannot be subtracted (for example
    one is timezone-aware and the other is not).
    """
    try:
        startDate = parser.parse(sprint['startDate'])
        endDate = parser.parse(sprint['endDate'])
        return (endDate - startDate).days
    except (KeyError, TypeError, ValueError, OverflowError):
        return np.nan


def _read_field(doc, dotted_key, default=0):
    """Return doc[dotted_key], resolving it as a nested path if necessary.

    Documents come back from PyMongo as nested dicts, so a field written with
    dot notation ('issue.totalHistories') lives at doc['issue']['totalHistories'].
    The literal key is tried first so documents that really carry a dotted key
    keep working. `default` is returned when neither form is present.
    """
    if dotted_key in doc:
        return doc[dotted_key]
    value = doc
    for part in dotted_key.split('.'):
        if not isinstance(value, dict) or part not in value:
            return default
        value = value[part]
    return value


sprints = coll_sprints.find()

for sprint in sprints:
    # sprint planned duration in days
    plannedDuration = _planned_duration_days(sprint)

    # Number of issues inside sprint (a missing or null list counts as empty)
    issues = sprint.get('issues') or []
    noOfIssues = len(issues)
    changes_in_issues = 0
    sprint_changes_in_issues = 0

    # Get total number of changes and sprint changes
    for issue in issues:
        issue_key = str(issue)
        # Only ids that start with a letter are looked up
        # (presumably keys of the form 'PROJ-123' - TODO confirm).
        if not issue_key or not issue_key[0].isalpha():
            continue
        doc = coll_issues.find_one({"id": issue})
        if doc is None:
            # find_one returns None when nothing matches; skip instead of crashing.
            log.debug("Sprint %s references unknown issue %s", sprint.get('id'), issue)
            continue
        changes_in_issues += _read_field(doc, 'issue.totalHistories')
        sprint_changes_in_issues += _read_field(doc, 'issue.sprintHistories')

    # Build the update once per sprint, after the inner loop, so a sprint
    # without any matching issue still gets its own values rather than the
    # values left over from the previous sprint.
    update = {
        'plannedDuration': plannedDuration,
        'noOfIssues': noOfIssues,
        "totalHistories": changes_in_issues,
        "sprintHistories": sprint_changes_in_issues,
    }

    coll_sprints.update_one({'id': sprint['id']}, {'$set': update})


In [6]:
# Add above sprint metadata fields to every issue
# Each issue that has an originSprintId gets the aggregated fields of that
# sprint copied under its 'sprint' sub-document.
issues = coll_issues.find()
d = dict()  # not used in this cell; kept in case a later cell relies on it

# Sprint-level fields copied onto the issue as 'sprint.<field>'.
sprint_fields = ('plannedDuration', 'noOfIssues', 'totalHistories', 'sprintHistories')

for issue in issues:
    try:
        originSprintId = issue['originSprintId']
    except KeyError:
        # %s formatting also copes with ids that are not strings.
        log.debug("Found issue without originSprintId field: %s", issue.get('id'))
        continue

    sprint = coll_sprints.find_one({"id": originSprintId})
    if sprint is None:
        # find_one returns None when the referenced sprint is not in the collection.
        log.debug("Found issue %s with unknown originSprintId: %s", issue.get('id'), originSprintId)
        continue

    missing = [field for field in sprint_fields if field not in sprint]
    if missing:
        # The sprint has not been aggregated yet; do not write partial data.
        log.debug("Sprint %s lacks metadata fields %s", originSprintId, missing)
        continue

    # Dot notation in $set stores the values inside the issue's 'sprint' sub-document.
    update = {'sprint.' + field: sprint[field] for field in sprint_fields}

    coll_issues.update_one({"id": issue['id']}, {'$set': update})

In [None]:
# Sprints that need the new Issue data
# Median number of issues with changed sprint