In [1]:
# Original source code by Phub Namgay @pn17F2DD3, modified by David Johnson @djcomlab

In [2]:
import pandas as pd
from ast import literal_eval
import re
from datetime import datetime, time
from collections import Counter

# Load and clean data

In [3]:
# Read the raw spreadsheet into a DataFrame (relative path, resolved from the current working directory)
routines_data = pd.read_excel('../data/data.xlsx')
# Show (rows, columns) as a quick sanity check on what was loaded
routines_data.shape

(25799, 8)

In [4]:
# Parse both ISO-8601 timestamp columns into datetime values.
# NOTE: this overwrites the columns in place, so the cell expects the raw
# (string) values and is meant to be run once per fresh load.
for timestamp_column in ("Created at", "Closed at"):
    routines_data[timestamp_column] = routines_data[timestamp_column].apply(datetime.fromisoformat)

In [5]:
# Some Duration cells come back as datetime.time objects; turn those into
# their string form first so that pd.to_timedelta can parse the whole column.
duration_values = routines_data['Duration']
mask_time = duration_values.apply(lambda value: isinstance(value, time))
routines_data.loc[mask_time, 'Duration'] = duration_values[mask_time].apply(str)
# Anything that still cannot be parsed becomes NaT instead of raising
routines_data['Duration'] = pd.to_timedelta(routines_data['Duration'], errors='coerce')

In [6]:
# The Events column holds Python literals stored as text; evaluate each cell
# into the corresponding Python object (literal_eval only accepts literals).
routines_data["Events"] = routines_data["Events"].apply(literal_eval)

In [7]:
# Preview the first rows to check the parsed timestamp, duration and event columns
routines_data.head()

Unnamed: 0,Number,Issue type,User created,Title,Created at,Closed at,Duration,Events
0,28854,PullRequest,jeremiedbb,CI Update ruff invoke command,2024-04-17 11:39:46+00:00,2024-04-17 14:38:16+00:00,0 days 02:58:30,"[(labeled, jeremiedbb), (labeled, github-actio..."
1,28853,PullRequest,xuefeng-xu,DOC fix description of sample_weight in KBinsD...,2024-04-17 08:13:45+00:00,2024-04-17 10:10:25+00:00,0 days 01:56:40,"[(labeled, github-actions[bot]), (labeled, git..."
2,28850,Issue,alxhslm,Make it possible to specify `monotonic_cst` wi...,2024-04-16 16:37:14+00:00,2024-04-16 17:09:09+00:00,0 days 00:31:55,"[(labeled, alxhslm), (renamed, alxhslm), (comm..."
3,28849,PullRequest,jeremiedbb,MAINT Clean up deprecations for 1.5: in KMeans...,2024-04-16 15:23:31+00:00,2024-04-16 16:19:25+00:00,0 days 00:55:54,"[(labeled, jeremiedbb), (labeled, jeremiedbb),..."
4,28848,PullRequest,jeremiedbb,MAINT Clean up deprecations for 1.5: delayed i...,2024-04-16 14:33:09+00:00,2024-04-16 15:32:39+00:00,0 days 00:59:30,"[(labeled, jeremiedbb), (labeled, jeremiedbb),..."


# Put data into the Routine Dynamics data model

In [8]:
# Routine Dynamics model

class Action:
    """A named action, optionally tied to the actor associated with it.

    Args:
        name: Identifier of the action.
        actor: Whoever carries out the action; defaults to None.
    """

    def __init__(self, name, actor=None):
        self.name, self.actor = name, actor


class PerformedAction(Action):
    """An Action that additionally records when it was performed.

    Args:
        name: Identifier of the action.
        actor: Whoever carried out the action.
        performed_at: Moment the action took place (stored as given).
    """

    def __init__(self, name, actor, performed_at):
        super().__init__(name, actor)
        self.performed_at = performed_at

class ActionPattern:
    """Wrapper that stores the `actions` value it is constructed with.

    Presumably `actions` is an ordered collection of Action objects; nothing
    in this class enforces that.
    """
    def __init__(self, actions):
        self.actions = actions

class Performance:
    """One concrete enactment, stored as its `performed_actions`.

    Code in this notebook also attaches `duration` and `issue_type`
    attributes to instances after construction.
    """
    def __init__(self, performed_actions):
        self.performed_actions = performed_actions

class Ostensive:
    """Ostensive aspect of a routine: holds an action pattern."""

    def __init__(self, action_pattern):
        self.action_pattern = action_pattern

class Performative:
    """Performative aspect of a routine: an action pattern plus its performances."""

    def __init__(self, action_pattern, performances):
        self.action_pattern = action_pattern
        self.performances = performances

class Routine(Ostensive, Performative):
    """A routine combining an ostensive and a performative aspect.

    Both parent initialisers write to the same ``action_pattern`` attribute,
    so the ostensive pattern used to be lost as soon as the performative
    initialiser ran. Each pattern is now also kept under its own name.

    Args:
        ostensive: Object exposing ``action_pattern``.
        performative: Object exposing ``action_pattern`` and ``performances``.

    Attributes:
        action_pattern: The performative pattern (unchanged, kept for
            backward compatibility).
        performances: The performative aspect's performances.
        ostensive_action_pattern: The pattern taken from ``ostensive``.
        performative_action_pattern: The pattern taken from ``performative``.
    """

    def __init__(self, ostensive, performative):
        Ostensive.__init__(self, ostensive.action_pattern)
        # Capture the ostensive pattern before the next initialiser overwrites
        # the shared `action_pattern` attribute.
        self.ostensive_action_pattern = self.action_pattern
        Performative.__init__(self, performative.action_pattern, performative.performances)
        self.performative_action_pattern = self.action_pattern

    def is_live(self):
        """Placeholder, not implemented yet; currently returns None."""
        pass

    def is_generative(self):
        """Placeholder, not implemented yet; currently returns None."""
        pass

class Actor:
    """A participant identified by name."""

    def __init__(self, name):
        self.name = name

    def is_intermediary(self):
        """Placeholder, not implemented yet; currently returns None."""

    def is_mediator(self):
        """Placeholder, not implemented yet; currently returns None."""


class HumanActor(Actor):
    """Actor subtype for humans; adds no behaviour of its own."""


class DigitalActor(Actor):
    """Actor subtype for digital participants; adds no behaviour of its own."""

In [9]:
# Build one Performance per row of routines_data.
performances = []
for _row_index, row in routines_data.iterrows():
    # Each event is read as (action name, actor name). No per-event timestamp
    # is passed along, so performed_at is None for every action.
    performed_actions = [
        PerformedAction(event[0], Actor(name=event[1]), None)
        for event in row.Events
    ]
    performance = Performance(performed_actions)
    # Extra attributes attached after construction
    performance.duration = row["Closed at"] - row["Created at"]
    performance.issue_type = row['Issue type']
    performances.append(performance)
print(len(performances))

25799


In [10]:
# Total number of performed actions across all performances
sum(len(performance.performed_actions) for performance in performances)

617440

In [11]:
# Number of performed actions whose name is "commented"
sum(
    1
    for performance in performances
    for action in performance.performed_actions
    if action.name == "commented"
)

159952

# Construct pattern strings

In [12]:
# Event names kept when building patterns; every other event name is dropped
events_of_interest = {"labeled", "unlabeled", "assigned", "mentioned", "reviewed", "commented", "closed"}
# One list per row: a leading "opened" followed by the retained event names in their original order
clean_event_list = [
    ["opened"] + [event[0] for event in sequence if event[0] in events_of_interest]
    for sequence in routines_data.Events
]
routines_data["Clean_events"] = clean_event_list

def push_action(action_pattern, new_action_name):
    """Extend a pattern with an action name, collapsing immediate repeats.

    If the last entry already carries `new_action_name`, its cardinality is
    set to '+' instead of appending a duplicate. Otherwise a new Action with
    cardinality '1' is appended.

    Args:
        action_pattern: Non-empty list of action objects; modified in place.
        new_action_name: Name of the action to record.

    Returns:
        The same list object that was passed in.
    """
    tail = action_pattern[-1]
    if tail.name != new_action_name:
        appended = Action(name=new_action_name)
        appended.cardinality = '1'
        action_pattern.append(appended)
        return action_pattern
    # Same action again: mark the existing entry as "one or more"
    tail.cardinality = '+'
    return action_pattern

# Short codes used in pattern strings, keyed by event name.
# "closed" is handled separately because it terminates the pattern.
pattern_codes = {
    "labeled": 'L',
    "unlabeled": 'U',
    "assigned": 'A',
    "mentioned": 'M',
    "reviewed": 'RV',
    "commented": 'C',
}

action_pattern_list = []
for sequence in routines_data.Clean_events:
    # Every pattern starts with a single open action
    opening = Action(name="O")
    opening.cardinality = '1'
    action_pattern = [opening]
    for event_label in sequence:
        if event_label == "closed":
            # The first close ends the pattern; later events are ignored
            closing = Action(name="CL")
            closing.cardinality = '1'
            action_pattern.append(closing)
            break
        code = pattern_codes.get(event_label)
        if code is not None:
            # Names without a code (e.g. the leading "opened") are skipped
            action_pattern = push_action(action_pattern, code)
    action_pattern_list.append(action_pattern)

In [13]:
# Reduce each pattern to a hashable tuple of (name, cardinality) pairs and
# de-duplicate while keeping first-seen order (dict keys preserve insertion order).
first_seen_patterns = dict.fromkeys(
    tuple((action.name, action.cardinality) for action in pattern)
    for pattern in action_pattern_list
)
seen = set(first_seen_patterns)
unique_action_patterns = [list(pattern_key) for pattern_key in first_seen_patterns]
print("Unique patterns found (using objs): ", len(unique_action_patterns))

Unique patterns found (using objs):  8474


In [14]:
def action_pattern_str(action_pattern):
    """Render a pattern as a '/'-separated string.

    Each action contributes its name followed by its cardinality, except
    that a cardinality of '1' is omitted.

    Args:
        action_pattern: Iterable of objects with `name` and `cardinality`.

    Returns:
        The rendered pattern string; empty when the pattern has no actions.
    """
    rendered = []
    for action in action_pattern:
        suffix = '' if action.cardinality == '1' else action.cardinality
        rendered.append(f"{action.name}{suffix}")
    return '/'.join(rendered)
# Render every pattern to its string form, in the same order as action_pattern_list
action_pattern_str_list = list(map(action_pattern_str, action_pattern_list))

In [15]:
# Quick way to find unique patterns: de-duplicate the rendered strings with a set
unique_pattern_strs = set(action_pattern_str_list)
print("Unique patterns found (using strs): ", len(unique_pattern_strs))

Unique patterns found (using strs):  8474


# Get stats about the top 10 patterns

In [16]:
# Tally how often each pattern string occurs and keep the ten most frequent
string_counts = Counter()
string_counts.update(action_pattern_str_list)
top_10 = string_counts.most_common(10)
print(top_10)

[('O/C+/CL', 2378), ('O/CL', 1980), ('O/C/CL', 1881), ('O/L/C/CL', 459), ('O/RV+/CL', 449), ('O/RV/CL', 401), ('O/L+/RV/CL', 336), ('O/L+/RV+/CL', 321), ('O/C+/M/C+/CL', 314), ('O/L/C+/CL', 308)]


In [17]:
# Store each row's rendered pattern string alongside the source data
routines_data["Clean_patterns"] = action_pattern_str_list

In [18]:
# Preview the first rows, now including the Clean_events and Clean_patterns columns
routines_data.head()

Unnamed: 0,Number,Issue type,User created,Title,Created at,Closed at,Duration,Events,Clean_events,Clean_patterns
0,28854,PullRequest,jeremiedbb,CI Update ruff invoke command,2024-04-17 11:39:46+00:00,2024-04-17 14:38:16+00:00,0 days 02:58:30,"[(labeled, jeremiedbb), (labeled, github-actio...","[opened, labeled, labeled, commented, reviewed...",O/L+/C/RV/CL
1,28853,PullRequest,xuefeng-xu,DOC fix description of sample_weight in KBinsD...,2024-04-17 08:13:45+00:00,2024-04-17 10:10:25+00:00,0 days 01:56:40,"[(labeled, github-actions[bot]), (labeled, git...","[opened, labeled, labeled, commented, reviewed...",O/L+/C/RV+/CL
2,28850,Issue,alxhslm,Make it possible to specify `monotonic_cst` wi...,2024-04-16 16:37:14+00:00,2024-04-16 17:09:09+00:00,0 days 00:31:55,"[(labeled, alxhslm), (renamed, alxhslm), (comm...","[opened, labeled, commented, closed, mentioned...",O/L/C/CL
3,28849,PullRequest,jeremiedbb,MAINT Clean up deprecations for 1.5: in KMeans...,2024-04-16 15:23:31+00:00,2024-04-16 16:19:25+00:00,0 days 00:55:54,"[(labeled, jeremiedbb), (labeled, jeremiedbb),...","[opened, labeled, labeled, labeled, commented,...",O/L+/C/RV/CL
4,28848,PullRequest,jeremiedbb,MAINT Clean up deprecations for 1.5: delayed i...,2024-04-16 14:33:09+00:00,2024-04-16 15:32:39+00:00,0 days 00:59:30,"[(labeled, jeremiedbb), (labeled, jeremiedbb),...","[opened, labeled, labeled, labeled, commented,...",O/L+/C/RV/CL


In [19]:
# Build the per-pattern summary table (written to an Excel file in a later cell)
summary_columns = {
    'Recurrent action pattern': [],
    'No. events': [],
    'No. comments': [],
    'No. unique actors': [],
    'Avg. duration': [],
}
for pat_string, _occurrences in top_10:
    # Select the rows for this pattern once and reuse the selection below
    matching_rows = routines_data[routines_data.Clean_patterns == pat_string]
    event_lists = list(matching_rows.Events)

    summary_columns['Recurrent action pattern'].append(pat_string)
    # Counts are over the raw Events column, not the filtered Clean_events
    summary_columns['No. events'].append(sum(len(events) for events in event_lists))
    summary_columns['No. comments'].append(
        sum(1 for events in event_lists for event in events if event[0] == "commented")
    )
    # Distinct actor names (second element of each event) across all matching rows
    summary_columns['No. unique actors'].append(
        len({event[1] for events in event_lists for event in events})
    )
    # Mean routine duration for this pattern
    summary_columns['Avg. duration'].append(matching_rows.Duration.mean())

summary_statistics = pd.DataFrame(summary_columns).set_index('Recurrent action pattern')
summary_statistics

Unnamed: 0_level_0,No. events,No. comments,No. unique actors,Avg. duration
Recurrent action pattern,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
O/C+/CL,26007,11764,1735,105 days 12:33:44.697645080
O/CL,14000,2308,1166,24 days 10:57:53.171212121
O/C/CL,13439,3442,1113,55 days 04:41:07.063795854
O/L/C/CL,3636,676,391,78 days 17:36:53.028322440
O/RV+/CL,5084,189,260,4 days 07:45:16.561247216
O/RV/CL,3246,203,295,3 days 02:17:54.331670822
O/L+/RV/CL,3739,114,184,4 days 14:45:15.059523809
O/L+/RV+/CL,4544,72,149,5 days 13:57:32.423676012
O/C+/M/C+/CL,6462,3060,408,165 days 13:57:50.372611464
O/L/C+/CL,3203,1214,405,76 days 09:19:21.717532468


In [20]:
# Write the summary table to an Excel workbook in the same directory as the input data
summary_statistics.to_excel('../data/summary.xlsx')

# Calculate state transition matrix

In [21]:
# Count how often each state is immediately followed by another state.
# state_map maps (state, next_state) -> number of observed transitions.
events_of_interest = {"labeled", "unlabeled", "assigned", "mentioned", "reviewed", "commented", "closed"}
state_map = dict()
for performance in performances:
    # Every performance starts in 'opened'. The sequence is NOT truncated at
    # the first 'closed', so 'closed' can have outgoing transitions too.
    open_to_closed_states = ['opened'] + [
        action.name
        for action in performance.performed_actions
        if action.name in events_of_interest
    ]
    # Pair each state with its successor; zip stops at the shorter input, so
    # the last state contributes no transition and no bounds handling is needed.
    for transition in zip(open_to_closed_states, open_to_closed_states[1:]):
        state_map[transition] = state_map.get(transition, 0) + 1

In [22]:
def calc_transition_probabilities(source_state, transition_counts=None):
    """Yield the outgoing transition probabilities of one state.

    Args:
        source_state: State whose outgoing transitions are wanted.
        transition_counts: Optional mapping of (state, next_state) -> count.
            When omitted, the notebook-level `state_map` is used, which is
            the original behaviour.

    Yields:
        (next_state, probability) tuples, where probability is the count of
        that transition divided by the total count of all transitions
        leaving `source_state`. Nothing is yielded when the state has no
        outgoing transitions or their counts sum to zero.
    """
    if transition_counts is None:
        # Fall back to the global built earlier in the notebook
        transition_counts = state_map
    outgoing = [
        (pair[1], count)
        for pair, count in transition_counts.items()
        if pair[0] == source_state
    ]
    total = sum(count for _, count in outgoing)
    if not total:
        # Avoid dividing by zero for mappings that only hold zero counts
        return
    for next_state, count in outgoing:
        yield (next_state, count / total)

In [23]:
# All states in alphabetical order: the events of interest plus the initial 'opened' state
states_of_interest = sorted(events_of_interest | {'opened'})
row_dict = {}
for source_state in states_of_interest:
    # Start every target state at probability 0.0, then overwrite the observed ones
    col_dict = dict.fromkeys(states_of_interest, 0.0)
    col_dict.update(calc_transition_probabilities(source_state))
    row_dict[source_state] = col_dict
# Outer keys become columns when the frame is built, so transpose to get source states as rows
transition_matrix = pd.DataFrame(row_dict).T
transition_matrix

Unnamed: 0,assigned,closed,commented,labeled,mentioned,opened,reviewed,unlabeled
assigned,0.012367,0.166078,0.443463,0.060071,0.003534,0.0,0.155477,0.159011
closed,0.0,0.008917,0.869108,0.019533,0.082059,0.0,0.01242,0.007962
commented,0.001797,0.086438,0.544359,0.017717,0.29257,0.0,0.05047,0.006649
labeled,0.002381,0.056339,0.375505,0.315886,0.010244,0.0,0.200647,0.038997
mentioned,0.00132,0.048591,0.583516,0.031652,0.237378,0.0,0.089363,0.008181
opened,0.001866,0.076992,0.421006,0.301824,0.102306,0.0,0.090874,0.005133
reviewed,0.00259,0.187991,0.343702,0.046132,0.024,0.0,0.38869,0.006896
unlabeled,0.004864,0.130026,0.337549,0.401427,0.003567,0.0,0.042153,0.080415


In [24]:
# Write the transition matrix to an Excel workbook in the same directory as the input data
transition_matrix.to_excel('../data/transition_matrix.xlsx')