# Exploratory Dataset Analysis

In [64]:
import pandas as pd
import numpy as np
import nltk

import utils

### functions:

In [65]:
def addIssueTypeToField(df, field):
    # reduces the dataframe to only the field you are looking for with the addition of a new column with the issue type
    # needs the full dataframe and the field as string
    
    df_reduced = df.loc[(df["field"] == field) | (df["field"] == "IssueType")]

    issue_type_df = df_reduced[df_reduced["field"] == "IssueType"]

    issue_type_dict = dict(zip(issue_type_df['issue_id'], issue_type_df['data_to']))

    df_reduced['issue_type'] = df_reduced['issue_id'].map(issue_type_dict)

    output = df_reduced[df_reduced["field"] == field]

    return output

In [66]:
def create_ticket(df):
       headers = ['IssueId', 'EvoId', 'Summary', 'Description', 'VersionsAffected', 'IssueType',
       'Project', 'Components', 'CreatedDate', 'ResolvedDate', 'Status',
       'Priority', 'Creator', 'Reporter', 'Comments', 'Resolution',
       'IssueLinks', 'Labels', 'Environment', 'VersionsFixed', 'Assignee',
       'TimeEstimateOriginal', 'TimeEstimateRemaining', 'Rank', 'Parent',
       'Sprint', 'TimeSpent', 'Flagged']

       ticket_df = pd.DataFrame(columns=headers)
       
       data_to_map = {}
       for index, row in df.iterrows():
              data_to_map['IssueId'] = row['issue_id']
              data_to_map['EvoId'] = row['history_order']
              data_to_map['HistoryAuthor'] = row['history_author']
              data_to_map['Updated'] = row['history_created_date']
              field = row['field']
              data_to_map[field] = row['data_to']

              sample_df = pd.DataFrame(data_to_map, index=[0])
              ticket_df = pd.concat([ticket_df, sample_df], axis=0)

       return ticket_df

In [67]:
def createCommentsHistory(data):
    #data is the content of the Comments field of a ticket
    comments = {}
    output = pd.DataFrame(columns=['Author', 'Created', 'Comment'])
    
    ind = 0
    for index, row in data.iterrows():
        comments['Author'] = row['history_author']
        comments['Created'] = row['history_created_date']
        comments['Comment'] = row['data_to']
        comments_df = pd.DataFrame(comments, index=[ind])
        output = pd.concat([output, comments_df])
        ind += 1

    return output

In [68]:
def createLastRowWithCommentSection(last_row, sample_df):
    row = last_row.filter(['Summary','Description','VersionsAffected','IssueType','Project','Components','CreatedDate','ResolvedDate','Status','Priority','Creator','Reporter','Resolution','IssueLinks','Labels','VersionsFixed','Assignee','TimeSpent'], axis=1).iloc[[-1]]
    comment_history = createCommentsHistory(sample_df[sample_df['field'] == 'Comments'])
    row.at[:, 'Comments'] = [comment_history]

    return row

--------

In [48]:
df = pd.read_csv('./data/csv/output.csv')

Not relevant fields: 
Assignee, CreatedDaten, Creator, Environment, IssueLinks, Parent, Rank, Reporter, ResolvedDate, Sprint, TimeEstimateOriginal, TimeEstimateRemaining, TimeSpent, VersionsAffected, VersionsFixed

Fields with context information: 
Description, IssueType, Status, Summary, Comments, Priority, Project, Resolution, (Components, Flagged, Labels)

Relevant fields for completion: 
IssueType, Status, Summary, (Description, Labels)

### search for description structure

In [69]:
new_df = addIssueTypeToField(df, 'Description')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced['issue_type'] = df_reduced['issue_id'].map(issue_type_dict)


In [70]:
new_bug_df = new_df[new_df['issue_type'] == 'Bug']

In [71]:
subset =new_bug_df[new_bug_df["data_to"].str.len() > 450]

In [72]:
subset.head(10)

Unnamed: 0.1,Unnamed: 0,jira,issue_id,history_order,field,field_evo_order,field_evo_first,field_evo_last,data_from,data_to,...,last_reporter,last_assignee,last_commenter,last_evolver,prev_creators,prev_reporters,prev_assignees,prev_commenters,prev_evolvers,issue_type
69,18323858,Jira,1788469,0,Description,0,True,True,,"once when I was using Sourcetree v2.1.11, ever...",...,,,,,[],[],[],[],[],Bug
110,18323899,Jira,1751607,0,Description,0,True,True,,I have installed putty and pageant outside of ...,...,,,,,[],[],[],[],[],Bug
129,18323918,Jira,1750632,0,Description,0,True,True,,*{color:#172b4d}Summary:{color}*\r\n\r\n \r\n\...,...,,,,,[],[],[],[],[],Bug
162,18323951,Jira,1747580,0,Description,0,True,True,,*Issue Summary:*\r\n\r\nWhen created a new use...,...,,,,,[],[],[],[],[],Bug
179,18323968,Jira,1747292,0,Description,0,True,True,,it is not possible to push if force push flag ...,...,,,,,[],[],[],[],[],Bug
266,18324055,Jira,1740957,0,Description,0,True,True,,Every time I push a new branch for the first t...,...,,,,,[],[],[],[],[],Bug
341,18324130,Jira,1738272,0,Description,0,True,True,,Version: ??? - 3.4.3\r\n\r\nWindows 10 Pro\r\n...,...,,,,,[],[],[],[],[],Bug
359,18324148,Jira,1737590,0,Description,0,True,True,,when i try to merge the master branch into my ...,...,,,,,[],[],[],[],[],Bug
378,18324167,Jira,1737043,0,Description,0,True,True,,I have setup Sourcetree so that it does not st...,...,,,,,[],[],[],[],[],Bug
421,18324210,Jira,1726709,0,Description,0,True,True,,"Being prompted to ""Run 'git lfs pull' now?"" de...",...,,,,,[],[],[],[],[],Bug


In [74]:
bug_id = 1747292
desc_struc_sample = df[df["issue_id"] == bug_id]

### Normal ticket

In [75]:
sample = df[df["issue_id"] == 1803818]

In [76]:
sample_cut = sample.filter(['issue_id', 'history_order', 'history_author', 'history_created_date', 'field', 'data_to'], axis=1)
desc_struc_sample_cut = desc_struc_sample.filter(['issue_id', 'history_order', 'history_author', 'history_created_date', 'field', 'data_to'], axis=1)

In [77]:
sample_ticket = create_ticket(sample)
desc_struc_sample_ticket = create_ticket(desc_struc_sample)

In [78]:
sample_ticket.to_csv("data/csv/unprocessed/sample_ticket.csv")
desc_struc_sample_ticket.to_csv("data/csv/unprocessed/desc_struc_sample_ticket.csv")

In [79]:
sample_ticket_with_singel_evoId = sample_ticket.groupby(['EvoId']).last()
desc_struc_sample_ticket_with_singel_evoId = desc_struc_sample_ticket.groupby(['EvoId']).last()

In [80]:
sample_ticket_with_singel_evoId.to_csv("data/csv/ticket/sample_ticket.csv")
desc_struc_sample_ticket_with_singel_evoId.to_csv("data/csv/ticket/desc_struc_sample_ticket.csv")

### Create JSON with Comment Section

In [81]:
from json import loads, dumps

sample = pd.read_csv("data/csv/ticket/sample_ticket.csv")
desc_struc_sample = pd.read_csv("data/csv/ticket/desc_struc_sample_ticket.csv")

In [82]:
last_row = createLastRowWithCommentSection(sample, sample_cut)
desc_struc_last_row = createLastRowWithCommentSection(desc_struc_sample, desc_struc_sample_cut)

last_row.to_csv("data/csv/last_row/last_row.csv")
last_row.to_json("data/json/last_row/last_row.json", orient="records", lines=True, indent=4)

desc_struc_last_row.to_csv("data/csv/last_row/desc_struc_last_row.csv")
desc_struc_last_row.to_json("data/json/last_row/desc_struc_last_row.json", orient="records", lines=True, indent=4)

In [83]:
# save JSON for long term
desc_struc_last_row.to_json("data/json/tickets/bugs/id_"+str(bug_id)+".json", orient="records", lines=True, indent=4)

In [87]:
str = desc_struc_last_row["Description"].iloc[0]

In [90]:
len(str)

3008