In [1]:
from jira import JIRA
import requests
import json
import pandas as pd
import numpy as np
import pyarrow
import datetime

# Processamento paralelo dividido por meses

Para acelerar o código abaixo, foi necessário usar a lib threading e quebrar as consultas via JQL em períodos mensais.

In [2]:
import threading
import time
  
  
# A sample function to print squares
def extrair_dados(inicio,fim): #YYYY-mm-dd
    # Conexão ao servidor Atlassian
    jiraOptions = {'server': "https://olxbr.atlassian.net"} 
    jira = JIRA(options=jiraOptions, basic_auth=(
        "Your email", "Your key"))
    #Filtro JQL utilizado com variáveis
    jql_str =f''' 
                Status != Cancelado
                AND issuetype in (bug,story,"tech value",toil, opportunity,Hypothesis)
                AND created >= {inicio} AND created <= {fim} 
                AND category in ("BU_ZAP+",BU_OLX,CROSS)
                order by created DESC'''
    data = jira.search_issues(jql_str=jql_str,maxResults = 1, json_result=True)
    total = data['total']
    cont = 0
    list_issues = []
    while cont <= total:
        data = jira.search_issues(jql_str=jql_str,maxResults = 100, json_result=True,startAt=cont,expand='changelog')
        list_issues.append(data['issues'])
        cont +=100
    cont = 0
    issues = []
    while cont < len(list_issues):
        issues += list_issues[cont]
        cont +=1

    #Formando o Dataframe    
    id_issue = []
    link = []
    for n in range(len(issues)):
        id_issue.append(issues[n]['id'])
        if len(issues[n]['fields']['issuelinks']) > 0:    
            for c in range(len(issues[n]['fields']['issuelinks'])):
                try:
                    link.append(issues[n]['fields']['issuelinks'][c]['outwardIssue']['key'] + '| ')
                except:
                    link.append(issues[n]['fields']['issuelinks'][c]['inwardIssue']['key'] + '| ')
        else:
            link.append(None)
    links = pd.DataFrame((zip(id_issue, link)), columns = ['id', 'links'])

    df = pd.json_normalize(issues)[['id','fields.status.name','key','fields.created',
                                       'fields.resolutiondate','fields.issuetype.name','fields.project.name',
                                       'fields.project.key','fields.project.projectCategory.name',
                                       'fields.priority.name','fields.resolution.name','fields.labels',
                                       'fields.parent.id']]
    df = pd.merge(df,links, how='left', on='id')

    df.rename(columns={'id': 'issue_id','fields.status.name':'issue_status','key':'issue_key',
                                'fields.created':'created_date','fields.resolutiondate':'resolution_date',
                                'fields.issuetype.name':'issue_type','fields.project.name':'name_project',
                                'fields.project.key':'key_project','fields.project.projectCategory.name':'bu',
                                'issue.fields.priority.name':'priority','issue.fields.resolution.name':'resolution',
                                'fields.labels':'labels','fields.parent.id':'parent_id'},inplace = True)
    
    #Incluir datas de updates no dataframe
    #Fonte https://www.jitsejan.com/getting-lead-time-for-jira-tickets
    transitions = pd.json_normalize(data=issues,
                                record_path=['changelog', 'histories'],
                                meta=['fields', 'key'])[['created', 'items', 'key']]
    transitions = transitions\
    .join(transitions['items']\
            .explode()
            .apply(pd.Series)
    )\
    .query("field == 'status'")\
    .drop("items", axis=1)[['created', 'key', 'toString']]
    
    status = transitions.pivot_table(index='key', values='created', columns='toString',aggfunc='min',)
    status.reset_index(inplace=True)    
    try:
        cycle_time = status[['key','Selected for Discovery','Discovery Done','in progress','Done']]
    except Exception:
        cycle_time = status[['key','Selected for Discovery','in progress','Done']]
    try:    
        cycle_time.rename(columns={'key':'issue_key','Selected for Discovery':'selected_for_discovery',
                               'Discovery Done':'discovery_done',
                               'in progress':'in_progress',
                               'Done':'done'},inplace=True)
    except Exception:
        cycle_time.rename(columns={'key':'issue_key','Selected for Discovery':'selected_for_discovery',
                               'in progress':'in_progress',
                               'Done':'done'},inplace=True)     

    
    df = pd.merge(df,cycle_time, how='left',on='issue_key')
    
    
    
    df.to_csv(f'tabela_{inicio}_{fim}.csv',index=False)
    
          
#-------------------------------------------------------------------------------------------------------------------
def processamento_paralelo():
    thread1 = threading.Thread(target=extrair_dados, 
                               args=(['2022-01-01','2022-02-28'])) #YYYY-mm-dd

    thread2 = threading.Thread(target=extrair_dados, 
                               args=(['2022-03-01','2022-04-30'])) #YYYY-mm-dd

    thread3 = threading.Thread(target=extrair_dados, 
                               args=(['2022-05-01','-0d'])) #YYYY-mm-dd
#     thread4 = threading.Thread(target=extrair_dados, 
#                                args=(['2022-07-01','-0d'])) #YYYY-mm-dd

    thread1.start()
    thread2.start()
    thread3.start()
#     thread4.start()



    return thread1.join(),thread2.join(),thread3.join()#,thread4.join()

In [3]:
from time import sleep, time
start = time()
processamento_paralelo()
end = time()
total_time = end-start
print(f'Tempo de espera: {round(total_time/60)} minutos')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Tempo de espera: 8 minutos


In [4]:
b1 = pd.read_csv('tabela_2022-01-01_2022-02-28.csv')
b2 = pd.read_csv('tabela_2022-03-01_2022-04-30.csv')
b3 = pd.read_csv('tabela_2022-05-01_-0d.csv')
# b4 = pd.read_csv('tabela_2022-07-01_-0d.csv')

In [5]:
df = pd.concat([b1, b2,b3])

In [6]:
df['created_date'] = pd.to_datetime(df['created_date']).dt.date
df['resolution_date'] = pd.to_datetime(df['resolution_date']).dt.date
df['selected_for_discovery'] = pd.to_datetime(df['selected_for_discovery']).dt.date
df['discovery_done'] = pd.to_datetime(df['discovery_done']).dt.date
df['in_progress'] = pd.to_datetime(df['in_progress']).dt.date
df['done'] = pd.to_datetime(df['done']).dt.date
dt = datetime.date.today()
df['dt'] = dt

In [7]:
df.head()

Unnamed: 0,issue_id,issue_status,issue_key,created_date,resolution_date,issue_type,name_project,key_project,bu,fields.priority.name,fields.resolution.name,labels,parent_id,links,selected_for_discovery,discovery_done,in_progress,done,dt
0,86618,Concluído,SL-441,2022-02-25,NaT,História,Squad Listings,SL,BU_ZAP+,Medium,,['backend'],,,NaT,NaT,NaT,2022-03-03,2022-07-27
1,86608,New Item,TXP-1274,2022-02-25,NaT,Tech Value,Transaction Experience,TXP,BU_OLX,Medium,,[],,,NaT,NaT,NaT,NaT,2022-07-27
2,86606,Concluído,DISPUTA-620,2022-02-25,2022-02-25,Bug,Disputes,DISPUTA,BU_OLX,Crítico,Itens concluídos,[],,,NaT,NaT,NaT,2022-02-25,2022-07-27
3,86585,Concluído,OCSD-31,2022-02-25,2022-03-03,História,OLX - Compra Segura - Delivery,OCSD,BU_OLX,Medium,Itens concluídos,[],,,NaT,NaT,NaT,2022-03-03,2022-07-27
4,86584,New Item,TRACKEXP-1695,2022-02-25,NaT,História,Tracking & Experimentation,TRACKEXP,CROSS,Medium,,[],38389.0,,NaT,NaT,NaT,NaT,2022-07-27


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13307 entries, 0 to 6712
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   issue_id                13307 non-null  int64  
 1   issue_status            13307 non-null  object 
 2   issue_key               13307 non-null  object 
 3   created_date            13307 non-null  object 
 4   resolution_date         8562 non-null   object 
 5   issue_type              13307 non-null  object 
 6   name_project            13307 non-null  object 
 7   key_project             13307 non-null  object 
 8   bu                      13307 non-null  object 
 9   fields.priority.name    13297 non-null  object 
 10  fields.resolution.name  8562 non-null   object 
 11  labels                  13307 non-null  object 
 12  parent_id               6407 non-null   float64
 13  links                   2382 non-null   object 
 14  selected_for_discovery  70 non-null    

In [9]:
df.to_parquet('jira_issue.parquet',index=False)