# 🎛️Program Setup

### Import Libraries

In [None]:
# Standard Python Imports
from collections import defaultdict
from pprint import pprint
import random

# Third-Party Imports (requires pip install)
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

# Local Libraries
import utils

In [None]:
pd.set_option('display.max_columns', None)  # We want to visualise all columns
pd.set_option('display.max_colwidth', None)  # Don't limit the width of the columns

### Define Globals

In [None]:
# Use Case 2 Globals
# Directory prefix handed to the pickle helper below and reused when building the evolution dataframe path
DATA_PATH = './jiraEvolutions/'

# Shared logger from the local utils module: 'info' level, with the log-level and datetime display flags turned off
LOG = utils.CustomLogger('CustomLogger', log_level='info', display_loglevel=False, display_datetime=False)
# Pickle helper from the local utils module, configured with DATA_PATH and the shared logger
PICKLE_LIB = utils.PickleLib(data_path=DATA_PATH, logger=LOG)

### Load Data Sources

In [None]:
# Load in the evolution dataframe from the GenerateEvolutionDataframe script
# NOTE(review): presumably pickle_load unpickles a gzip-compressed file — confirm in utils, and only load files
# that come from a trusted source, since unpickling can execute arbitrary code.
# NOTE(review): DATA_PATH is passed to PickleLib (data_path=) and is also part of this path — verify that the
# prefix is not applied twice inside pickle_load.
evo_df = PICKLE_LIB.pickle_load(f"{DATA_PATH}load_evolution_dataframe(jiras=[_Hyperledger_])", 'gzip')

In [None]:
evo_df.head(10)

# Cleaning the Data

In [None]:
evo_df.shape

In [None]:
# Add a combined "Jira Issue ID" to each row, which is a combination of the Jira name and the Issue ID.
# This field creates a truly unique ID across Jiras and issues.
# NOTE(review): assumes both the jira and issue_id columns hold strings — TODO confirm
evo_df['jira_issue_id'] = evo_df.jira + ' ' + evo_df.issue_id

In [None]:
# Clean the data to just be description changes
# evo_df = evo_df[evo_df.field == 'Description']

In [None]:
# Set a minimum required number of evolutions for our analysis, where the creation itself counts as an evolution.
# minimum_evolutions = 2
# evo_df = evo_df.groupby('jira_issue_id').filter(lambda x: len(x.index) >= minimum_evolutions)

In [None]:
evo_df.shape

In [None]:
evo_df.head(10)

# NLP Techniques for Identifying Entities

# Collect Target Entities: Fields and Field States

We are interested in identifying entities that match issue fields. We want both the field names themselves, as well as
the possible states of those fields. To begin, we will create a complete list of all field names, and every value that
has ever been set to those fields, organised per field. We will segment this analysis on a per-Jira level, but you could
choose any segmentation (or not) of the data based on your level of analysis. Since fields and their available options
are set on a per-Jira level, this is a good starting point.

In [None]:
def collect_fields_and_states():
    """Collect, per field and per Jira, every state the field has ever held.

    Reads the module-level ``evo_df`` and returns a plain dict shaped as
    ``{field: {jira: set_of_state_strings, ...,
               'all_jiras_intersection': set, 'all_jiras_union': set}}``.
    """

    # Fields whose states are not extracted (the Summary, Description, Comments and the two date fields)
    skipped_fields = ['Summary', 'Description', 'Comments', 'CreatedDate', 'ResolvedDate']
    state_fields = [name for name in evo_df.field.unique() if name not in skipped_fields]

    # The analysis is segmented per Jira, so gather every Jira present in the data
    jira_names = list(evo_df.jira.unique())

    field_states = defaultdict(dict)
    for field in state_fields:

        # Rows describing changes to this field only
        field_rows = evo_df[evo_df.field == field]

        for jira in jira_names:
            jira_rows = field_rows[field_rows.jira == jira]

            # Every unique state the field has been in is recorded in either data_from or data_to
            raw_states = set(jira_rows.data_from) | set(jira_rows.data_to)

            # States are stringified (later they are compared against text); blank strings are dropped.
            # NOTE(review): missing values are stringified too (e.g. NaN becomes 'nan') — confirm this is wanted.
            field_states[field][jira] = {str(state) for state in raw_states if str(state).strip()}

        # Snapshot the per-Jira sets before the two aggregate keys are added, so both aggregates are built
        # from the per-Jira sets alone.
        per_jira_sets = list(field_states[field].values())
        field_states[field]['all_jiras_intersection'] = set.intersection(*per_jira_sets)
        field_states[field]['all_jiras_union'] = set.union(*per_jira_sets)

    return utils.defaultdict_to_dict(field_states)

field_states = collect_fields_and_states()

In [None]:
# Display the fields and field state counts. There are too many field states to reasonably visualise them.
def display_field_states_counts():
    """Display a dataframe holding the number of states stored for each field (columns) and segment (rows).

    Reads the module-level ``field_states``; every key stored under a field is counted, which includes the
    per-Jira entries as well as any aggregate entries stored next to them.
    """

    # One inner dict per field, mapping each key stored under that field to the size of its state set
    field_states_counts = {
        field: {segment: len(states) for segment, states in per_segment.items()}
        for field, per_segment in field_states.items()
    }

    # A dict of dicts becomes a dataframe with one column per field
    display(pd.DataFrame(field_states_counts))

display_field_states_counts()

In [None]:
pprint(field_states['IssueType']['all_jiras_intersection'])

In [None]:
pprint(field_states['Priority']['all_jiras_intersection'])

In [None]:
pprint(field_states['Status']['all_jiras_intersection'])

In [None]:
pprint(field_states['Resolution']['all_jiras_intersection'])

# Search for Target Entities

Search Method: Text must contain 1) a field name and 2) any field value we found earlier (for that field and Jira)

In [None]:
def get_discussion_analysis_items(evo_df, field_states, fields_to_analyse=None, sample_issue_num=None):
    """Find comment texts that mention a field name together with one of that field's known states.

    Only rows whose ``field`` is 'Comments' and whose ``history_order`` is greater than 0 are searched.
    A text matches a field when it contains the field name and at least one state recorded for that field
    in the issue's Jira. Both checks are plain, case-sensitive substring tests.

    Args:
        evo_df: Evolution dataframe with the columns ``field``, ``history_order``, ``jira_issue_id``,
            ``jira`` and ``data_to``. The caller's dataframe is not modified.
        field_states: Mapping ``{field: {jira: iterable_of_state_strings}}``.
        fields_to_analyse: Field names to look for. When empty or None, every field found in ``evo_df``
            is used. Fields without an entry in ``field_states`` simply produce no matches.
        sample_issue_num: Stop once this many issues have at least one match. Issues are visited in a
            random order, so repeated runs sample different issues. None (or 0) analyses every issue.

    Returns:
        A dict ``{jira_issue_id: {field: [{'field_state': ..., 'text': ...}, ...]}}`` containing only the
        issues and fields that produced at least one match.
    """

    # If not specified, analyse all fields in evo_df
    if not fields_to_analyse:
        fields_to_analyse = list(evo_df.field.unique())

    # We are only analysing the Comments, and we are not interested in the "creational" evolutions
    evo_df = evo_df[evo_df.field.isin(['Comments'])]
    evo_df = evo_df[evo_df.history_order > 0]

    # Get the set of unique issue ids in our evolution dataframe
    jira_issue_ids = list(np.unique(evo_df.jira_issue_id))
    # Shuffle the data, so people running "sample_issue_num" get different ones each time
    random.shuffle(jira_issue_ids)

    # Group the rows once, instead of re-scanning the whole dataframe for every issue
    issue_groups = evo_df.groupby('jira_issue_id', sort=False)

    # Store all identified discussion items
    issue_discussion_items = defaultdict(lambda: defaultdict(list))

    for jira_issue_id in tqdm(jira_issue_ids, total=len(jira_issue_ids), ncols=100, ascii=True):

        # Reduce evo_df to just the rows of this issue
        evo_jira_issue_df = issue_groups.get_group(jira_issue_id)

        # Extract the jira of this issue; field states are stored per Jira
        issue_jira = evo_jira_issue_df.iloc[0].jira

        # Analyse every "data_to" text field
        for text in evo_jira_issue_df.data_to:

            if not isinstance(text, str):
                continue  # The text must be a string

            # Check all requested fields
            for field in fields_to_analyse:

                if field not in text:
                    continue  # We didn't find any mention of this field

                # Fields without collected states, or Jiras missing for a field, have nothing to match.
                # Looking them up directly used to raise a KeyError as soon as a comment mentioned them.
                known_states = field_states.get(field, {}).get(issue_jira, ())

                # We want to check every past field state
                for field_state in known_states:
                    if field_state in text:
                        issue_discussion_items[jira_issue_id][field].append({
                            'field_state': field_state,
                            'text': text,
                        })

        # Check if we have enough items based on the requested sample
        if sample_issue_num and len(issue_discussion_items) >= sample_issue_num:
            break

    return utils.defaultdict_to_dict(issue_discussion_items)

issue_discussion_items = get_discussion_analysis_items(
    evo_df, field_states, fields_to_analyse=['Resolution'], sample_issue_num=None)


# Candidate values for fields_to_analyse: IssueType, Priority, Status, Resolution

In [None]:
pprint(issue_discussion_items)

In [None]:
# Summarise the search: how many issues had at least one match, and how many matches there are overall
print(f"Number of issues with Discussion Items: {len(issue_discussion_items)}")
print(f"Number of total Discussion Items: {sum(len(field_items) for issue_items in issue_discussion_items.values() for field_items in issue_items.values())}")

In [None]:
def display_issue_discussion_items(issue_discussion_items, sample_issue_num=None):
    """Print the discussion items that were found, one issue and field at a time.

    Args:
        issue_discussion_items: Mapping ``{jira_issue_id: {field: list_of_items}}``.
        sample_issue_num: When set and smaller than the number of issues, only a random sample of that
            many issues is printed. Otherwise every issue is printed.
    """

    # For time reasons, it may be better to just display a random sample of the issues
    shown_items = issue_discussion_items
    if sample_issue_num and sample_issue_num < len(issue_discussion_items):
        sampled_ids = random.sample(list(issue_discussion_items.keys()), sample_issue_num)
        shown_items = {issue_id: issue_discussion_items[issue_id] for issue_id in sampled_ids}

    # The separator printed ahead of every (issue, field) group
    banner = f"\n{'-'*50} New Discussion Item {'-'*50}\n"

    for jira_issue_id, items_by_field in shown_items.items():
        for field, field_items in items_by_field.items():
            print(banner)
            # The identifiers of this discussion item, followed by the items themselves
            print(f"{jira_issue_id} {field}")
            pprint(field_items)


# Display the discussion items found, one issue at a time
display_issue_discussion_items(issue_discussion_items, sample_issue_num=None)

In [None]:
import json
from pathlib import Path

# NOTE(review): `item` only labels the output file. Make sure it matches the fields_to_analyse that were used to
# build issue_discussion_items (the call in this notebook passes ['Resolution']), otherwise the file name
# describes a different analysis than the one it contains.
item = "issueType"

# Create the output folder when it is missing; otherwise opening the file fails with FileNotFoundError
output_dir = Path("./issueDiscussionItems/")
output_dir.mkdir(parents=True, exist_ok=True)

# Save the issue_discussion_items to a txt file, as indented JSON
with open(output_dir / (item + "(JiraEcosystemEvoDF).txt"), "w", encoding="utf-8") as out_file:
    json.dump(issue_discussion_items, out_file, indent=4)