In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

import json
import os
from datetime import datetime
from pprint import pprint

import database
import helper
import utils

In [2]:
# Notebook-wide logger (info level, without log-level or timestamp prefixes)
# and the pickle helper that reads/writes under ./data using that logger.
LOG = utils.CustomLogger(
    "CustomLogger",
    log_level="info",
    display_loglevel=False,
    display_datetime=False,
)
PICKLE_LIB = utils.PickleLib(data_path="./data", logger=LOG)

# Data Search

In [468]:
# Open the connection through the project-local `database` helper and pick
# the `JiraRepos` database from it.
client = database.connect()
db = client.JiraRepos

# List the available collections; the recorded output shows one per Jira repository.
print(db.list_collection_names())
# Module-level handle that the query cells below read from.
collection = db['MongoDB']

['Spring', 'Mojang', 'SecondLife', 'JiraEcosystem', 'Mindville', 'Sonatype', 'Sakai', 'Jira', 'Apache', 'MariaDB', 'Hyperledger', 'Qt', 'IntelDAOS', 'JFrog', 'MongoDB', 'RedHat']


#### Summary

In [469]:
def _summaryLengthCondition(operator, bound):
    """Build one `$expr` comparison of the summary's code-point length with `bound`.

    Args:
        operator: Aggregation comparison operator name, e.g. "$lt" or "$gt".
        bound: Integer length the summary is compared against.

    Returns:
        A dict of the form {operator: [{"$strLenCP": "$fields.summary"}, bound]}.
    """
    return {operator: [{"$strLenCP": "$fields.summary"}, bound]}


def _sampleSummariesByLength(length_conditions, sample_size, source=None):
    """Run the shared match-then-sample aggregation used by the three getters.

    Args:
        length_conditions: List of `$expr` comparisons that must all hold.
        sample_size: Number of documents requested from the `$sample` stage.
        source: Object exposing `aggregate(pipeline)`. When None, the
            module-level `collection` is used (looked up at call time).

    Returns:
        Whatever `aggregate` returns for the pipeline.
    """
    target = collection if source is None else source
    pipeline = [
        {
            "$match": {
                # Only documents whose summary exists and is a string are
                # considered; $strLenCP needs a string operand.
                "fields.summary": {
                    "$exists": True,
                    "$type": "string"
                },
                "$expr": {
                    "$and": length_conditions
                }
            }
        },
        {
            "$sample": {
                "size": sample_size
            }
        }
    ]
    return target.aggregate(pipeline)


def getInRange(min_length=39, max_length=70, sample_size=5, source=None):
    """Sample documents whose summary length lies strictly between two bounds.

    Both bounds are exclusive: with the defaults, summaries of 40 to 69 code
    points match.

    NOTE(review): getShorter() uses `< 39` and getLonger() uses `> 70` by
    default, so summaries of exactly 39 or 70 code points are returned by none
    of the three helpers. Confirm which side should be inclusive.

    Args:
        min_length: Exclusive lower bound on the summary length.
        max_length: Exclusive upper bound on the summary length.
        sample_size: Number of random documents to request.
        source: Optional object with an `aggregate` method; defaults to the
            module-level `collection`.

    Returns:
        The result of `aggregate` (iterable of matching documents).
    """
    conditions = [
        _summaryLengthCondition("$lt", max_length),
        _summaryLengthCondition("$gt", min_length),
    ]
    return _sampleSummariesByLength(conditions, sample_size, source)


def getShorter(max_length=39, sample_size=5, source=None):
    """Sample documents whose summary is strictly shorter than `max_length`.

    Args:
        max_length: Exclusive upper bound on the summary length (default 39).
        sample_size: Number of random documents to request.
        source: Optional object with an `aggregate` method; defaults to the
            module-level `collection`.

    Returns:
        The result of `aggregate` (iterable of matching documents).
    """
    conditions = [_summaryLengthCondition("$lt", max_length)]
    return _sampleSummariesByLength(conditions, sample_size, source)


def getLonger(min_length=70, sample_size=5, source=None):
    """Sample documents whose summary is strictly longer than `min_length`.

    Args:
        min_length: Exclusive lower bound on the summary length (default 70).
        sample_size: Number of random documents to request.
        source: Optional object with an `aggregate` method; defaults to the
            module-level `collection`.

    Returns:
        The result of `aggregate` (iterable of matching documents).
    """
    conditions = [_summaryLengthCondition("$gt", min_length)]
    return _sampleSummariesByLength(conditions, sample_size, source)

In [517]:
# Choose the length bucket to inspect; swap in getShorter() or getLonger()
# here to sample the other buckets instead.
results = getInRange()

# Keep the sampled documents around, then report id, summary length and text.
summary = list(results)

for document in summary:
    print(f"Id: {document['id']} Length: {len(document['fields']['summary'])}: {document['fields']['summary']}")

Id: 653054 Length: 47: Docs for SERVER-38168: Vendor Zstandard library
Id: 387444 Length: 40: Implement Index Management Specification
Id: 1468306 Length: 51: Rate limit new connection creations (maxConnecting)
Id: 1956117 Length: 60: Fix Health Checking feature flag backport incompatibilities 
Id: 1417364 Length: 41: Implement `ChangeStreamGetMore` operation


#### Arbitrary Structure

In [None]:
def findKeysByCode(data, target_code):
    """Return the keys of `data` whose value carries the given 'code'.

    Args:
        data: Mapping from key to a container that may hold a 'code' entry.
        target_code: Code value to look for.

    Returns:
        List of keys, in the mapping's iteration order, whose value has a
        'code' entry equal to `target_code`.
    """
    return [
        name
        for name, entry in data.items()
        if 'code' in entry and entry['code'] == target_code
    ]

In [None]:
# Load the issue-type mapping (indexed by Jira name further down, e.g. 'RedHat').
# The encoding is stated explicitly so the read does not depend on the
# platform's default text encoding; JSON files are expected to be UTF-8.
with open('issueTypeMapping.json', encoding='utf-8') as f:
    mappedIssueTypes = json.load(f)

In [None]:
# Keys of the 'RedHat' mapping whose entry has the code 'Story'; used below as
# the allowed `fields.issuetype.name` values.
# NOTE(review): the visible connection cell selects db['MongoDB'], while this lookup
# uses the 'RedHat' mapping — confirm `collection` points at the intended Jira before running.
mappedTypes = findKeysByCode(mappedIssueTypes['RedHat'], 'Story')
mappedTypes

In [None]:
# Sample up to ten tickets of a mapped issue type whose description is a
# string matching the regex "As a" (presumably targeting user-story phrasing).
results = collection.aggregate([
    {"$match": {
        "fields.description": {"$exists": True, "$type": "string", "$regex": "As a"},
        "fields.issuetype.name": {"$in": mappedTypes},
    }},
    {"$sample": {"size": 10}},
])

# Keep the sampled documents, then print each id with its full description.
tickets = list(results)
for document in tickets:
    print(f"Id: {document['id']} / Description: {document['fields']['description']}")

User Story structure:
RedHat: 13279134, 13254138, 13282377, 13405745, 14248927 (has only the story)

#### Update: see use_case_3.ipynb

# Ticket Creation

(Apache, 13066997) -> Status is Open, could be interesting
(Jira, 284325) -> could be a good example for the Description completeness prompt!
(Qt, 187366) -> interesting for Bug Report Structure!

In [471]:
### Globals ###
# Reset the shared logger before the ticket-creation steps (see utils.CustomLogger for what reset() clears).
LOG.reset()
# Jira repository name: used below to build the evolution pickle path and passed to helper.saveTicket.
JIRA = "MongoDB"
# Sub-folder under data/ and file-name prefix of the "<FOLDERNAME>Dataset.csv" file used further down.
FOLDERNAME = "summary"

In [472]:
# Load the cached evolution dataframe for the selected Jira, passing 'gzip' as
# the second argument of the project pickle helper.
# NOTE(review): presumably unpickles under the hood — only load files from a trusted source.
evo_df = PICKLE_LIB.pickle_load(
    f"./jiraEvolutions/load_evolution_dataframe(jiras=[_{JIRA}_])",
    'gzip',
)

[Start] 🥒 Loading data from Pickle: "./jiraEvolutions/load_evolution_dataframe(jiras=[_MongoDB_]).pgzip"


	 Data: 100%|#####################################################| 423M/423M [00:04<00:00, 101MB/s]

[ End ] Duration: 00:00:04.9762





In [520]:
# Issue to turn into a dataset ticket; kept as a string because it is compared
# against the `issue_id` column as-is.
sample_id = "1417364"
# Rows of the evolution dataframe that belong to that issue.
sample = evo_df.loc[evo_df["issue_id"] == sample_id]

In [521]:
# Evolution step to materialise; 0 is used here (presumably the issue's initial state — confirm in helper).
evolutionStep = 0
# Build the ticket frame for that step via the project-local helper.
# The SettingWithCopyWarning in the recorded output points at a
# `row.loc[:, 'Comments'] = ...` assignment that is not in this notebook —
# presumably inside helper.createTicket.
ticket = helper.createTicket(sample, evolutionStep)
ticket

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row.loc[:, 'Comments'] = [comment_history]


Unnamed: 0,Jira,IssueId,EvoId,Summary,Description,VersionsAffected,IssueType,Project,Components,CreatedDate,...,Priority,Creator,Reporter,Resolution,IssueLinks,Labels,VersionsFixed,Assignee,TimeSpent,Comments
0,MongoDB,1417364,0,Implement {{ChangeStreamGetMore}} operation,"As the next part of implementing change streams, we need to define a {{ChangeStreamGetMore}} struct that implements {{Operation}}. This operation should wrap the {{GetMore}} operation, calling {{Aggregate::build}} and {{Aggregate::handle_response}} in its own respective {{build}} and {{handle_response}} methods.\r\n\r\nAdditionally, the {{poll_next}} methods on the {{ChangeStream}}'s {{Stream}} implementation should be updated to create and execute a {{ChangeStreamGetMore}} operation, following the pattern used in the {{Cursor}}'s {{poll_next}} implementation.",,New Feature,Rust Driver,,2020-07-22T19:15:15.000+0000,...,Major - P3,Samuel Rossi,Samuel Rossi,,,,,,,"Empty DataFrame Columns: [Author, Created, Comment] Index: []"


# Ticket PreProcessing

In [522]:
def preprocessTickets(ticket):
    """Normalise the date and id columns of a single-row ticket frame in place.

    `CreatedDate` and `ResolvedDate` are read from the first row, parsed from
    the Jira timestamp layout (e.g. '2020-07-22T19:15:15.000+0000') and written
    back as 'YYYY-MM-DD HH:MM:SS'. `IssueId` is converted to `int`.

    Compared with a plain parse, two situations are tolerated:
      * a missing date (None/NaN, e.g. an unresolved ticket) is left untouched;
      * a date already in the output layout is left untouched, so re-running
        the cell on the same frame does not fail.

    Args:
        ticket: pandas DataFrame with at least one row and the columns
            'CreatedDate', 'ResolvedDate' and 'IssueId'. Modified in place.

    Returns:
        None.

    Raises:
        ValueError: If a present date string matches neither layout, or the
            issue id cannot be converted to int.
    """
    source_format = '%Y-%m-%dT%H:%M:%S.%f%z'
    target_format = '%Y-%m-%d %H:%M:%S'

    ### Convert CreatedDate and ResolvedDate to datetime
    for column in ('CreatedDate', 'ResolvedDate'):
        raw_value = ticket[column].values[0]
        if pd.isna(raw_value):
            # Nothing to convert (e.g. the ticket has not been resolved yet).
            continue
        try:
            parsed = datetime.strptime(raw_value, source_format)
        except ValueError:
            # Accept a value that was already normalised by an earlier run;
            # anything else re-raises ValueError from this second parse.
            datetime.strptime(raw_value, target_format)
            continue
        ticket[column] = parsed.strftime(target_format)

    ### Convert IssueId to Int
    ticket['IssueId'] = int(ticket['IssueId'].values[0])

# Normalise the ticket built above; the function works on `ticket` in place and returns nothing.
preprocessTickets(ticket)

In [523]:
# Persist the preprocessed ticket through the project-local helper. Per the
# recorded output it reports saving a JSON file, creating an annotation and
# inserting the ticket into the dataset.
helper.saveTicket(FOLDERNAME, ticket, evolutionStep, JIRA, sample_id)

The JSON was successfully saved!
The annotation was created successfully!
The ticket was inserted into the dataset successfully!


# Annotate Ticket

In [None]:
def annotateTicket(ticket, annotation, reason):
    """Attach a manual violation annotation to a ticket, in place.

    Sets `ticket['ViolationActual']` to `annotation` and
    `ticket['ViolationReason']` to `reason`, then prints whether that worked.

    Ordinary errors raised by the assignments are reported (with their type
    and message) instead of being raised, keeping the best-effort behaviour.
    KeyboardInterrupt and SystemExit are no longer swallowed.

    Args:
        ticket: Object supporting item assignment (e.g. a DataFrame or dict).
        annotation: Value stored under 'ViolationActual'.
        reason: Value stored under 'ViolationReason'.

    Returns:
        None.
    """
    try:
        ticket['ViolationActual'] = annotation
        ticket['ViolationReason'] = reason
    except Exception as exc:
        # Surface the cause so a failed annotation can actually be diagnosed.
        print(f"Annotation failed. ({type(exc).__name__}: {exc})")
    else:
        print("Annotation successful.")

# Record the manual annotation for the current ticket. The reason is a
# triple-quoted string, so the line break and the leading spaces of its
# second line are stored as part of 'ViolationReason'.
annotateTicket(ticket, "TRUE", """1. Resolution has to be set from 'None' to 'Low Priority'.
                           2. Status has to be set from 'Open' to 'Resolved'.""")

In [None]:
# Location of the dataset CSV for this folder. Built once so the existence
# check, the read and the write all refer to the same file.
dataset_path = os.path.join("data", FOLDERNAME, FOLDERNAME + "Dataset.csv")

if os.path.isfile(dataset_path):
    dataset = pd.read_csv(dataset_path)
    print("The dataset already exists.")
else:
    # Start from an empty frame with the expected column layout.
    dataset = pd.DataFrame(columns=['Jira', 'IssueId', 'EvoId', 'Summary', 'Description', 'VersionsAffected', 'IssueType', 'Project', 'Components', 'CreatedDate', 'ResolvedDate', 'Status', 'Priority', 'Creator', 'Reporter', 'Resolution', 'IssueLinks', 'Labels','VersionsFixed', 'Assignee', 'TimeSpent', 'Comments', 'ViolationActual', 'ViolationReason'])
    print("The dataset was created successfully.")

# NOTE(review): the recorded output of helper.saveTicket already reports
# inserting the ticket into the dataset — confirm this append does not
# duplicate the row.
dataset = pd.concat([dataset, ticket], ignore_index=True)

# Make sure the target folder exists; to_csv does not create directories.
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
dataset.to_csv(dataset_path, index=False)
print("The ticket was inserted into the dataset successfully!")


In [None]:
# Read the dataset CSV back from disk and display it to check the stored rows.
dataset = pd.read_csv(f"data/{FOLDERNAME}/{FOLDERNAME}Dataset.csv")
dataset