<a href="https://colab.research.google.com/github/SRI-CSL/signal-public/blob/main/colabs/sequence_alignment_of_developer_behavior.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **SIGNAL**ing Sequence Alignment of Developer Behavior

**Description:** Create sequences of developer activities and code contribution data within a given time frame.

##### **Copyright 2022 SRI International.**

This project is under the GPL3 License. 
See the [LICENSE](https://www.gnu.org/licenses/gpl-3.0.en.html) file for the full license text.

## &#9776; Dependencies

In [1]:
import os
import sys
import time
import warnings

import json
import pickle
import pathlib
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statistics import mean, median
from datetime import date, datetime

from contextlib import contextmanager
from scipy.spatial.distance import cdist, pdist, squareform

import typing as ty
from typing import List, Any, Dict, Tuple

In [2]:
# Colab-specific setup: disable the Colab dataframe formatter and enable the
# custom widget manager. If anything in this block fails (for example because
# the google.colab package cannot be imported), the notebook is assumed to be
# running locally and execution simply continues.
try:
    from google.colab import data_table, output
    data_table.disable_dataframe_formatter()
    output.enable_custom_widget_manager()
except Exception:
    print("Launched notebook locally")

In [3]:
# install gdown library for .csv files download
try:
    import gdown
except ImportError:
    !pip install gdown

In [4]:
# install the dataclass-csv library if it is missing; the names are imported
# (again) in a later cell, so nothing needs to be bound here after installing
try:
    from dataclass_csv import DataclassReader, DataclassWriter
except ImportError:
    !pip install dataclass-csv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dataclass-csv
  Downloading dataclass_csv-1.4.0-py2.py3-none-any.whl (16 kB)
Installing collected packages: dataclass-csv
Successfully installed dataclass-csv-1.4.0


In [5]:
from dataclass_csv import DataclassReader, DataclassWriter, dateformat
from dataclasses import asdict, dataclass, field, replace

In [6]:
import plotly.graph_objects as go
import plotly.express as px

## &#9997; Configuration



In [7]:
# warnings.simplefilter("ignore", np.ComplexWarning)
# Silence every warning for the remainder of the notebook.
# NOTE(review): this also hides numerical warnings (overflow, division by zero)
# and deprecation warnings raised by the helper functions below.
warnings.filterwarnings("ignore") 

In [8]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
# ask the inline backend for the 'retina' figure format
%config InlineBackend.figure_format='retina'

In [9]:
# Origin of the .csv data files used by this notebook (consumed by get_dataset):
#   True:  the file is addressed through the signal-public GitHub repository
#   False: the file is downloaded from a provided URL, especially useful when running on Colab
IS_LOCAL_FILE = False

In [10]:
# Whether to save the results produced in this Colab Notebook
# TODO: (also) enable saving in Google Drive
SAVE_RESULTS = True

In [11]:
# Dictionary of the files this notebook needs in order to work.
# It is composed of (filename, Google Drive file ID) key-value pairs;
# get_dataset() looks dataset names up here before downloading them.
#
# Major code landmarks .zip file for the
# 1. Linux Kernel: (08-15-2020.zip) https://drive.google.com/file/d/1h1AGfQkOhvgtcCR8tWVzObBSpBTURWM2/view?usp=sharing
DATA_DICT = {
    "linux-kernel-data": "1h1AGfQkOhvgtcCR8tWVzObBSpBTURWM2",
    "activity_triplets_V1_02182022": "1BUd2sAUP04Jf1Qnin0uhMeAXaN3gCyTy" 
    }

In [12]:
# date on which the notebook is executed, printed so the run can be dated
TODAYS_DATE = date.today()
print(TODAYS_DATE)

2022-09-14


## &#9881; `Helper` functions

### &#9759; Common Functions

In [13]:
# thx to https://stackoverflow.com/questions/53581278
def is_run_in_colab() -> bool:
    """Best-effort check for whether the code is executing on Google Colab.

    Returns:
      True when 'google.colab' appears in the PATH environment variable, or
      when an IPython session is active and 'google.colab' appears in the
      string form of the shell returned by get_ipython(); False otherwise.
    """
    import builtins

    # .get() avoids a KeyError in environments where PATH is not defined
    if 'google.colab' in os.environ.get('PATH', ''):
        return True

    # Query the builtins module itself: the `__builtins__` name can be a plain
    # dict (e.g. inside imported modules), where hasattr() would always be False.
    if hasattr(builtins, '__IPYTHON__'):
        from IPython import get_ipython
        return 'google.colab' in str(get_ipython())

    return False

In [14]:
def path_exists(input_path: str) -> bool:
    """Tell whether `input_path` points at an existing filesystem entry.

    Args:
      input_path: path to check

    Returns:
      The result of os.path.exists for the given path.
    """
    found = os.path.exists(input_path)
    return found

In [15]:
def check_file_status(input_path: str):
    """Print a one-line status message about the availability of a file.

    Args:
      input_path: path of the file to report on

    Three messages are possible: the file exists locally; the file is missing
    but IS_LOCAL_FILE is set (the file is reached through GitHub instead); or
    the file is missing locally.
    """
    if path_exists(input_path=input_path):
        print(f"- File {input_path.split('/')[-1]} exists locally at {input_path}!")
        return

    if not IS_LOCAL_FILE:
        print(f"- File {input_path.split('/')[-1]} does not exists locally!")
        return

    print(f"- IS_LOCAL_FILE is set to {IS_LOCAL_FILE}. The file is accessed via a public GitHub link!")

In [16]:
def download_data_from_google_drive(google_file_id: str, output_file_name: str, quiet_download: bool) -> str:
    """Download a Google Drive file into the working directory unless it is already there.

    Args:
      google_file_id: Google Drive id of the file to fetch
      output_file_name: name under which the file is stored
      quiet_download: forwarded to gdown as its `quiet` flag

    Returns:
      The relative path './<output_file_name>'.
    """
    file_path = f'./{output_file_name}'
    already_present = os.path.exists(file_path)

    if already_present:
        print(f"{output_file_name} already exists!")
    else:
        gdown.download(id=google_file_id, output=output_file_name, quiet=quiet_download)

    return file_path

In [17]:
def get_dataset(dataset_name: str, is_local_file: bool) -> str:
    """Resolve the location of a dataset, downloading it when necessary.

    Args:
      dataset_name: name of the dataset file
      is_local_file: when True, the raw GitHub URL of the file is returned;
        when False, the file is downloaded using its id from DATA_DICT

    Returns:
      The URL or local path of the dataset. Nothing is returned (None) when
      the name is not a key of DATA_DICT and a download was requested.
    """
    if is_local_file:
        return f'https://raw.githubusercontent.com/SRI-CSL/signal-public/main/data/{dataset_name}'

    if dataset_name not in DATA_DICT:
        print(f"{dataset_name} is not present in dataset dictionary! Please ensure the file name is correct!")
        return

    return download_data_from_google_drive(google_file_id=DATA_DICT[dataset_name], output_file_name=dataset_name, quiet_download=False)

### &#9759; Tensor Fusion Functions

In [18]:
def tensor_fusion(h_x: np.array, h_y: np.array) -> np.ndarray:
    """Fuse two vectors following the tensor fusion recommendation of
    Morency et al., Tutorial on Multimodal Machine Learning.

    A constant 1 is appended to the end of h_x and prepended to h_y, and the
    Kronecker product of the resulting column and row vectors is returned.

    - current implementation: Assumes 1-D vectors!

    Args:
      h_x: first vector, length N
      h_y: second vector, length M

    Returns:
      A (N + 1, M + 1) array.
    """
    # augment: 1 goes last for h_x and first for h_y
    x_aug = np.append(h_x, 1)
    y_aug = np.concatenate(([1], h_y))

    # (N + 1, ) -> (N + 1, 1) and (M + 1, ) -> (1, M + 1)
    x_column = x_aug[:, np.newaxis]
    y_row = y_aug[np.newaxis, :]

    return np.kron(x_column, y_row)

In [19]:
def vector_fusion_3D(x: np.array, y: np.array, z: np.array) -> np.ndarray:
    """Fuse three 1-D vectors into a 3-D array.

    A constant 1 is prepended to each vector, every vector is placed on its
    own axis, and the three are combined with nested Kronecker products.

    Args:
      x: vector laid out along axis 0
      y: vector laid out along axis 1
      z: vector laid out along axis 2

    Returns:
      An array of shape (len(x) + 1, len(y) + 1, len(z) + 1).
    """
    leading_one = np.ones(1)
    x_aug = np.concatenate((leading_one, x), axis=0)
    y_aug = np.concatenate((leading_one, y), axis=0)
    z_aug = np.concatenate((leading_one, z), axis=0)

    # one axis per vector: (X, 1, 1), (1, Y, 1) and (1, 1, Z)
    x_axis = x_aug[:, np.newaxis, np.newaxis]
    y_axis = y_aug[np.newaxis, :, np.newaxis]
    z_axis = z_aug[np.newaxis, np.newaxis, :]

    # Kronecker products, equivalent to x_axis * y_axis * z_axis
    yz_plane = np.kron(y_axis, z_axis)
    return np.kron(x_axis, yz_plane)

### &#9759; DataFrame Manipulation Functions

In [20]:
def select_columns_from_dataframe(input_df: pd.DataFrame, columns: List) -> pd.DataFrame:
    """Return the part of a dataframe made of the requested columns.

    Args:
      input_df: dataframe to take the columns from
      columns: names of the columns to keep

    Returns:
      A dataframe restricted to `columns`.

    Raises:
      LookupError: If at least one requested column is not in input_df
    """
    # fail early when any requested column is unknown
    all_present = set(columns).issubset(input_df.columns)
    if not all_present:
        raise LookupError(f"Input columns: {columns} do not exist in the dataframe!")

    return input_df[columns]

In [21]:
def create_sequence(input_df: pd.DataFrame, seq_columns: Tuple, idx_column: str) -> Dict[str, List[Tuple[np.datetime64, str]]]:
    """Group dataframe rows into per-key sequences of 2-tuples.

    Args:
      input_df: dataframe to read the rows from
      seq_columns: the two column names whose values form each tuple
      idx_column: column whose values become the dictionary keys

    Returns:
      A dictionary mapping every idx_column value to the list of
      (seq_columns[0], seq_columns[1]) tuples of its rows, in row order.
    """
    sequences = {}

    for _, record in input_df.iterrows():
        key = record[idx_column]
        pair = (record[seq_columns[0]], record[seq_columns[1]])
        # setdefault creates the list the first time a key is met
        sequences.setdefault(key, []).append(pair)

    return sequences

In [22]:
def get_records_in_time_window(input_df: pd.DataFrame, time_column: str, start_time: np.datetime64, end_time: np.datetime64) -> pd.DataFrame:
    """Keep the rows whose time value lies in [start_time, end_time].

    Args:
      input_df: dataframe to filter
      time_column: column holding the time values used by the filter
      start_time: lower bound of the window (inclusive)
      end_time: upper bound of the window (inclusive)

    Returns:
      The rows of input_df falling inside the window.
    """
    assert start_time <= end_time

    time_values = input_df[time_column]
    inside_window = (time_values >= start_time) & (time_values <= end_time)

    return input_df.loc[inside_window]

In [23]:
def get_records(input_df: pd.DataFrame, condition_column: str, condition_value: str) -> pd.DataFrame:
    """Keep the rows whose value in one column equals a given value.

    Args:
      input_df: dataframe to filter
      condition_column: column of input_df the equality test is applied to
      condition_value: value the column has to be equal to

    Returns:
      The matching rows of input_df.
    """
    assert condition_column in input_df.columns

    matches = input_df[condition_column] == condition_value

    return input_df.loc[matches]

In [24]:
def remove_empty_rows(input_array: np.ndarray) -> np.ndarray:
    """Drop every row that contains only zeroes.

    Args:
      input_array: 2-D array to sanitize

    Returns:
      The rows of input_array holding at least one non-zero entry.
    """
    # a row is kept as soon as one of its entries differs from zero
    has_nonzero_entry = np.any(input_array != 0, axis=1)

    return input_array[has_nonzero_entry]

In [25]:
def get_developer_data(input_df: pd.DataFrame, dev_id: int) -> Dict[str, np.ndarray]:
    """Collect the fusion vectors and leading hashes recorded for one developer.

    Args:
      input_df: dataframe with 'sender_id', 'fusion_vector' and
        'hashes_vector' columns
      dev_id: unique id of developer

    Returns:
      A dictionary with two entries:
        'fusion': ndarray built from all fusion vectors of the developer
        'hashes': ndarray with the first element of each of the developer's
          hashes_vector values
    """
    dev_rows = input_df.loc[input_df['sender_id'] == dev_id]

    return {
        'fusion': np.array(list(dev_rows.fusion_vector)),
        # only the first hash of every row is kept
        'hashes': np.array([hashes[0] for hashes in dev_rows.hashes_vector]),
    }

In [26]:
def developer_sequences(input_df: pd.DataFrame, start_period: int, end_period: int) -> Dict[int, np.ndarray]:
    """Gather the per-developer data of all records inside a period of weeks.

    Args:
      input_df: dataframe with 'start_time', 'end_time' and 'sender_id' columns
      start_period: first ISO week of the period; compared with the week of 'start_time'
      end_period: last ISO week of the period; compared with the week of 'end_time'

    Returns:
      A dictionary keyed by developer id whose values are the results of
      get_developer_data for that developer; empty when no record matches.
    """
    assert start_period <= end_period

    # 1. restrict the dataframe to the records within the desired period
    starts_in_period = input_df['start_time'].dt.isocalendar().week >= start_period
    ends_in_period = input_df['end_time'].dt.isocalendar().week <= end_period
    period_df = input_df.loc[starts_in_period & ends_in_period]

    if period_df.shape[0] == 0:
        return {}

    # 2. one entry per unique developer found in the period
    return {
        dev_id: get_developer_data(input_df=period_df, dev_id=dev_id)
        for dev_id in period_df.sender_id.unique()
    }

In [27]:
def get_revision_history_data(input_df: pd.DataFrame, year: int, start_period: int, end_period: int, hashes: List) -> pd.DataFrame:
    """Select the revision history rows of a period that match a list of hashes.

    Args:
      input_df: revision history dataframe with 'timeline' and 'commit_hash' columns
      year: year the 'timeline' value has to fall in
      start_period: first ISO week of the period (inclusive)
      end_period: last ISO week of the period (inclusive)
      hashes: commit hashes to look for

    Returns:
      The rows of input_df satisfying all four conditions.
    """
    # todo: assert that the period are positive numbers between [0, 53]
    assert start_period <= end_period

    # there should be at least one hash to search for
    assert len(hashes) > 0

    timeline = input_df['timeline']
    in_year = timeline.dt.year == year
    from_start = timeline.dt.isocalendar().week >= start_period
    until_end = timeline.dt.isocalendar().week <= end_period
    wanted_hash = input_df['commit_hash'].isin(hashes)

    return input_df.loc[in_year & from_start & until_end & wanted_hash]

### &#9759; Developer Behavior Functions

In [28]:
# Source: @Huascar, https://github.com/SRI-CSL/SIGNAL/blob/rev-history-mining/signal/rev-history-mining/git_landmark.py

def sim(x: np.ndarray, y: np.ndarray) -> float:
    """Cosine similarity: dot product of x and y divided by the product of their norms."""
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    return np.dot(x, y) / norm_product

def rev_pred(r_i: np.ndarray, alpha: float = 0.0) -> np.ndarray:
    """Summarize a sequence of vectors into a single vector.

    P_k_pred = \\alpha \\cdot P_k[0]
      + (1 - \\alpha) \\cdot (\\sum_{i=1,...,n} sim(P_k[i], P_k[0]) \\cdot P_k[i] / Normalization term)
    where Normalization term is the sum of the similarities of all P_k[i].

    Args:
      r_i: sequence of vectors; the first one is the reference vector
      alpha: weight of the reference vector in the result

    Returns:
      The weighted combination described above.
    """
    reference = np.array(r_i[0])
    followers = r_i[1:]

    # similarity of every following vector to the reference one
    weights = [sim(reference, vector) for vector in followers]

    weighted_sum = np.add.reduce([weight * np.array(vector) for weight, vector in zip(weights, followers)])
    weight_total = np.sum(weights)

    return reference * alpha + (1 - alpha) * (weighted_sum / weight_total)

In [29]:
@dataclass(frozen=True)
@dateformat("%Y-%m-%d %H:%M:%S")
class RevCommit:
  """Immutable record describing one commit of the revision history.

  load_rev_history reads instances of this class from the revision history
  CSV through DataclassReader; the dateformat decorator carries the
  "%Y-%m-%d %H:%M:%S" format.
  """
  commit_hash: str = ''
  author_name: str = ''
  # time used by load_rev_history to sort commits ("observation time")
  u_time: datetime = None
  # load_rev_history drops commits whose type is 'commitless' or 'Empty commit'
  commit_type: str = ''
  commit_size: str = ''
  activity: str = ''
  # functionality density:
  # \frac{\# additions}{\# additions + \# deletions}
  density: float = 0
  # \frac{density * size}{time}
  # a signal that could represent the complexity
  # of committed changes. e.g., how long it took
  # to make this commit as an indicator for commit hardness.
  effort: float = 0
  # Weighted importance of a commit: This metric is based on 
  # the importance (weights) of its touched files.
  # Let's assume that the total number of files in a repo is $F$. The
  # weighted $I[f]$ represents the importance of a given file $f \in F$,
  # and is non-zero only if $f$ has been touched by other commits 
  # (aka frequency of change or simply 'churn'). 
  # Commits touching only a few important files are important (or critical) commits.
  # (We consider heavily touched files as not that important.)
  # Inspired by the IDF concept, we compute the importance of a commit as follows:
  # I[commit] = 0 if commit is not touching any files (or has touched a non-source code file)
  # I[commit] = 1/sum({I[f]}_{f in F_touched_by_commit}) if commit is touching source files.
  importance: float = 0
  # percentage of files touched in repo
  coverage: float = 0
  additions: int = 0
  deletions: int = 0

In [30]:
@dataclass
class Commit:
  """Record pairing a commit hash with its files and a quality value.

  load_rev_history reads instances of this class from the commit quality CSV
  and only uses `commit_hash` and `quality`.
  """
  commit_hash: str = ''
  # ty.List[str]
  files: list = field(default_factory=list)
  quality: float = 0

In [31]:
@dataclass(frozen=True)
class c_i:
  """Immutable per-commit feature record produced by load_rev_history."""
  commit_hash: str = ''
  # placeholder default; load_rev_history replaces it with rank_calc's result
  rank: float = -0.001
  density: float = 0
  effort: float = 0
  importance: float = 0
  # let's use number of files 
  # as a proxy for coverage
  # TODO(HAS) pls verify. 
  # IF coverage does not work, then
  # we can add a new field called num_files
  # (in addition to coverage)
  coverage: float = 0
  quality: float = 0

In [32]:
def rank_calc(q_thr: float, e_thr: float, c: c_i) -> float:
  """Rank a commit by how many of its metrics fall below their thresholds.

  Args:
    q_thr: quality threshold
    e_thr: effort threshold
    c: commit record providing `quality` and `effort`

  Returns:
    0.0, -0.001 or -0.002 when zero, one or both metrics are below threshold.
  """
  below_quality = 1 if c.quality < q_thr else 0
  below_effort = 1 if c.effort < e_thr else 0
  shortfalls = below_quality + below_effort
  # thx to https://stackoverflow.com/questions/11010683
  # (adding 0.0 turns a -0.0 product into 0.0)
  return -0.001 * shortfalls + 0.0

In [33]:
@contextmanager
def cd(newdir):
  """Context manager that runs its body inside `newdir`.

  `newdir` is expanded with os.path.expanduser before the switch; the
  directory that was current on entry is restored on exit, even when the
  body raises.
  """
  original_dir = os.getcwd()
  target_dir = os.path.expanduser(newdir)
  os.chdir(target_dir)
  try:
    yield
  finally:
    # always go back to where we started
    os.chdir(original_dir)

In [34]:
def sigmoid(x: float) -> float:
  """Logistic function 1 / (1 + exp(-x)).

  Args:
    x: input value

  Returns:
    The sigmoid of x as a NumPy long double scalar.
  """
  # Extended precision keeps exp() from overflowing for large negative x.
  # np.longdouble is used instead of np.float128 because the latter name is
  # not defined on every platform, while np.longdouble always is.
  x = np.longdouble(x)
  # where long double is no wider than a double, exp() may still overflow to
  # inf; the quotient is then 0.0, so the floating point warning is silenced
  with np.errstate(over='ignore'):
    return 1.0 / (1.0 + np.exp(-x))

In [35]:
def load_rev_history(cwd: pathlib.Path, rev_history_csv: pathlib.Path, index_json: pathlib.Path, commit_qual_csv: pathlib.Path) -> ty.Tuple[ty.List[c_i], dict]:
  """Load the revision history and turn every commit into a ranked c_i record.

  Args:
    cwd: directory made current (via cd) while the files are read
    rev_history_csv: CSV file parsed into RevCommit records
    index_json: JSON file; its entries are looked up by commit hash and their
      length is used as the number of touched files
    commit_qual_csv: CSV file parsed into Commit records (hash and quality)

  Returns:
    A tuple (rev_history, timeline): rev_history is a NumPy array of c_i
    records sorted by u_time, timeline maps each kept commit hash to its u_time.
  """
  rev_history = []
  with cd(cwd):
    all_commits = []
    # load revision history on a user-specified window of size W.
    with rev_history_csv.open("r") as f:
      reader = DataclassReader(f, RevCommit)
      all_commits.extend([row for row in reader])
      
    # load commit 2 files index
    with index_json.open(encoding='utf-8') as f:
      rev_index = json.load(f)
      
    # commit hash -> quality; the first row seen for a hash wins
    qual_map = {}
    with commit_qual_csv.open("r") as f:
      r = DataclassReader(f, Commit)
      for row in r:
        if row.commit_hash not in qual_map:
          qual_map[row.commit_hash] = row.quality

    # get the quality threshold
    # both thresholds are plain means; the effort mean is taken over all
    # commits, i.e. before the synthetic commits are excluded below
    # NOTE(review): qual_threshold is a mean of raw quality values, while the
    # c_i.quality later compared against it in rank_calc has gone through
    # sigmoid() -- confirm that this is intended
    qual_threshold = np.mean(list(qual_map.values()))
    effort_threshold = np.mean([rev_com.effort for rev_com in all_commits])

    # sort commits by observation time
    sorted_commits = sorted(all_commits, key=lambda x: x.u_time)
    # exclude the synthetic commits
    sorted_commits = np.array([c for c in sorted_commits if c.commit_type not in ['commitless', 'Empty commit']])
    timeline = dict({rev_com.commit_hash: rev_com.u_time for rev_com in sorted_commits})
    
    # coverage: the larger of sigmoid(number of indexed files) and the commit's
    # own coverage; quality: sigmoid of the mapped quality (0.0 when unmapped)
    sorted_commits = [c_i(commit_hash=rev_com.commit_hash,
                          density=rev_com.density, 
                          effort=rev_com.effort, 
                          importance=rev_com.importance,
                          # Number of touched files or commit's original coverage val
                          # TODO(HAS) pls verify the use of sigmoid.
                          coverage=max(sigmoid(float(len(rev_index.get(rev_com.commit_hash, [])))), rev_com.coverage),
                          quality=sigmoid(qual_map.get(rev_com.commit_hash, 0.0)))
                      for rev_com in sorted_commits]
    # c_i is frozen, so the rank is filled in by creating modified copies
    sorted_commits = [replace(c, rank=rank_calc(qual_threshold, effort_threshold, c)) for c in sorted_commits]
    rev_history.extend(sorted_commits)
  return np.array(rev_history), timeline


def get_revision_history(outdir: pathlib.Path) -> ty.Tuple[ty.List[c_i], dict]:
  """Load the revision history stored in `outdir`.

  Args:
    outdir: directory containing git_log.py's output files, i.e.
      results.csv, commit-hash-2-files.json and commits-quality.csv

  Returns:
    The (sorted revision history, timeline) pair built by load_rev_history.

  Raises:
    ValueError: when one of the three files is missing; the first missing
      file, in the order listed above, is named in the message.
  """
  commits_csv = outdir / "results.csv"
  files_index_json = outdir / "commit-hash-2-files.json"
  quality_csv = outdir / "commits-quality.csv"

  # every input has to be present before anything is loaded
  for f in (commits_csv, files_index_json, quality_csv):
    if f.exists():
      continue
    raise ValueError(f"Failed to locate {f.name}.")

  history, timeline = load_rev_history(
    cwd=outdir,
    rev_history_csv=commits_csv,
    index_json=files_index_json,
    commit_qual_csv=quality_csv)

  return history, timeline

In [36]:
def apply_seq_kernel(c_emb: ty.List[float], kernel_radius: int = 1, epsilon: float = 0.01) -> np.ndarray:
  """Smooth a sequence of embeddings with a position-based kernel.

  Args:
    c_emb: sequence of embeddings, one per position
    kernel_radius: distance at which a weight has decayed to epsilon
    epsilon: weights below this value are set to zero

  Returns:
    The embeddings after multiplication by the row-normalized kernel matrix.
  """
  # pairwise distances between the positions 0 .. len(c_emb) - 1
  positions = np.arange(len(c_emb))[:, np.newaxis]
  position_gaps = squareform(pdist(positions))

  # exponential decay with cutoff of epsilon at the kernel_radius
  weights = np.exp((position_gaps * np.log(epsilon)) / kernel_radius)
  weights[weights < epsilon] = 0

  # every row of weights is scaled to sum to one
  weights = weights / np.sum(weights, axis=-1, keepdims=True)

  return np.dot(weights, c_emb)

In [37]:
def distance_matrix(V_k_i: np.ndarray, V_k_j: np.ndarray) -> np.ndarray:
  """Pairwise cosine similarity (one minus the cosine distance) between the rows of two arrays."""
  cosine_distances = cdist(V_k_i, V_k_j, metric='cosine')
  return 1.0 - cosine_distances

In [38]:
def scoring_matrix(a: np.ndarray, w_i: float = 1.0, w_j: float = 1.0, epsilon: float = 0.01) -> np.ndarray:
  """Build the alignment scoring matrix from a 2-D similarity matrix.

  Args:
    a: similarity matrix of shape (N, M)
    w_i: scale of the gap penalties applied along a column
    w_j: scale of the gap penalties applied along a row
    epsilon: value the gap penalty decays to for the longest gap

  Returns:
    A (N + 1, M + 1) matrix: `a` padded with a leading zero row and column,
    every other cell replaced by the best of: its value plus the diagonal
    predecessor, the best cell above minus a gap penalty, the best cell to
    the left minus a gap penalty, and zero.
  """
  n_rows, n_cols = a.shape

  # leading zero row and column
  scores = np.pad(a, ((1, 0), (1, 0)), 'constant', constant_values=0)

  # gap penalties, indexed by the length of the gap
  column_gap_penalty = [w_i * np.exp((step * np.log(epsilon)) / n_rows) for step in range(n_rows, -1, -1)]
  row_gap_penalty = [w_j * np.exp((step * np.log(epsilon)) / n_cols) for step in range(n_cols, -1, -1)]

  for i in range(1, n_rows + 1):
    for j in range(1, n_cols + 1):
      cells_above = scores[:i, j]
      cells_left = scores[i, :j]
      candidates = [
        # current value plus the diagonal predecessor
        scores[i, j] + scores[i - 1, j - 1],
        # best value above in the column minus the column gap penalty
        np.max(cells_above) - column_gap_penalty[i - np.argmax(cells_above)],
        # best value to the left in the row minus the row gap penalty
        np.max(cells_left) - row_gap_penalty[j - np.argmax(cells_left)],
        # never go below zero
        0]
      scores[i, j] = np.max(candidates)

  return scores

In [39]:
def traceback(sa: np.ndarray, k: ty.Optional[int] = 100) -> np.ndarray:
  """Extract alignment routes from a scoring matrix.

  Cells are visited from the highest to the lowest score. From each start
  cell a route is followed towards the neighbour (up, left or diagonal) with
  the best score until it meets a visited cell or a stop condition.

  Args:
    sa: scoring matrix as produced by scoring_matrix (padded by one row/column)
    k: maximum number of alignments to return; None returns all of them

  Returns:
    A list of alignments, in the order found. Each alignment is a list of
    (i, j) index tuples shifted by -1 (undoing the padding) with the last
    route element dropped; routes of length <= 1 are skipped.
    NOTE(review): despite the annotation, a plain list is returned.
  """
  # Sort scoring matrix values in descending order; Save coordinates in look up table.
  sorted_args = np.argsort(sa.flatten())[::-1]
  coords = [(i, j) for i in range(sa.shape[0]) for j in range(sa.shape[1])]
  
  # Perform traceback until all coords have been visited
  tracebacks = []
  # NOTE(review): `seen` is a list, so each membership test below is linear
  seen = []
  route = []
  
  for ind in sorted_args:
    # matrix indices
    i, j = coords[ind]
    
    # `flag` is never set to False; the loop is left through `break` only
    flag = True
    # running score of the current route
    score = sa[i, j]

    while(flag):
      # Route connects to other traceback
      if (i, j) in seen:
        tracebacks.append([route, (i, j)])
        route = []
        break
      
      route.append((i, j))
      seen.append((i, j))
      
      # Route terminates at zero
      # NOTE(review): the route is stored and reset here, but the loop is not
      # left, so the walk continues from this cell -- confirm this is intended
      if sa[i, j] == 0:
        tracebacks.append([route, []])
        route = []

      # Select path direction
      # candidates: up, left, diagonal, current cell
      # NOTE(review): when i or j is 0 the index -1 wraps around to the last
      # row/column of `sa`
      kernel = [sa[i - 1, j], sa[i,j - 1], sa[i - 1, j - 1], sa[i, j]]
      m = np.argmax(kernel)
      
      # Move to next gap
      if m == 0:
        # Terminate route if score is less than gap value
        if score > sa[i - 1, j]:
          i -= 1
          score += sa[i, j]
        else:
          tracebacks.append([route, []])
          route = []
          break
      elif m == 1:
        # Terminate route if score is less than gap value
        if score > sa[i, j - 1]:
          j -= 1
          score += sa[i, j]
        else:
          tracebacks.append([route, []])
          route = []
          break
      # Move to next hit
      elif m in [2, 3]:
        i -= 1
        j -= 1
        score += sa[i, j]
      
      # Stop at zero or if route is too long
      # NOTE(review): on this exit `route` is neither stored nor reset, so its
      # cells carry over into the route of the next start cell
      if i < 0 or j < 0:
        break


  # Return alignments with length greater than 1 in order as they are found.
  if k is None:
    k = len(tracebacks)

  alignments = [] # a collection of index tuples
  for _ in tracebacks:
    # check length of routes
    if len(_[0]) > 1:
      # shift back by one to undo the padding of the scoring matrix
      r = [(i - 1, j - 1) for (i, j) in _[0]]
      # the last cell of the route is left out
      alignments.append(r[:-1])
    if len(alignments) == k:
      break

  # print(f"alignments = {len(alignments)}; {alignments[0]}")
  return alignments

In [40]:
def score_alignment(alignment: np.ndarray, s1: np.ndarray, s2: np.ndarray, k: int) -> float:
  """Score one alignment between two sequences of vectors.

  Args:
    alignment: sequence of (index into s1, index into s2) pairs
    s1: first sequence of vectors
    s2: second sequence of vectors
    k: multiplicative weight of the alignment

  Returns:
    The first entry of the per-position score vector.
  """
  # 1 marks a position where the index changed (a hit), 0.0 a repeated index (a gap)
  row_hits = []
  col_hits = []
  row_vectors = []
  col_vectors = []

  last_row = -1
  last_col = -1

  for pair in alignment:
    row_idx, col_idx = pair[0], pair[1]

    row_hits.append(1 if row_idx != last_row else 0.0)
    if row_idx != last_row:
      last_row = row_idx

    col_hits.append(1 if col_idx != last_col else 0.0)
    if col_idx != last_col:
      last_col = col_idx

    row_vectors.append(s1[row_idx])
    col_vectors.append(s2[col_idx])

  # a position counts only when it is a hit in both sequences
  mask = np.array(row_hits) * np.array(col_hits)
  similarity = 2 - cdist(row_vectors, col_vectors, 'cosine').diagonal()
  score = (similarity * mask) / (2 * len(alignment)) * (np.sum(mask) / len(s2)) * k * len(s2)

  return score[0]

In [41]:
def print_commit_results(top_alignments: np.ndarray, top_scores: np.ndarray, input_vecs: ty.List[np.ndarray], vec_index: dict = None) -> None:
  """Print the aligned sequences of every top alignment.

  Args:
    top_alignments: alignments to print, each a sequence of (i, j) index pairs
    top_scores: score of each alignment, in the same order
    input_vecs: the two aligned sequences of vectors
    vec_index: optional mapping from a vector (as tuple) to a list of
      (sequence id, label) pairs; when given, labels are printed instead of
      the raw vectors

  Every alignment is walked in reverse order. A position whose index did not
  change with respect to the previous step is printed as 'GAP'.
  """
  assert len(input_vecs) == 2
  if len(top_alignments) > 1:
    print("Top", len(top_alignments), 'alignments:')

  def describe(seq_id: int, position: int):
    # what gets printed for the vector at `position` of sequence `seq_id`
    vector = input_vecs[seq_id][position]
    if not vec_index:
      return vector
    # a vector missing from the index yields no candidates (printed as None)
    # instead of a TypeError from iterating over None
    candidates = vec_index.get(tuple(vector), None) or []
    labels = [tpl[1] for tpl in candidates if tpl[0] == seq_id]
    return labels[0] if len(labels) > 0 else None

  for i, alignment in enumerate(top_alignments):
    ss1 = []
    ss2 = []
    prev_first = -1
    prev_second = -1
    for step in reversed(alignment):
      if step[0] != prev_first:
        ss1.append(describe(0, step[0]))
        prev_first = step[0]
      else:
        ss1.append('GAP')

      if step[1] != prev_second:
        ss2.append(describe(1, step[1]))
        prev_second = step[1]
      else:
        ss2.append('GAP')

    print('Alignment', i + 1, ':', 'Score:', top_scores[i])
    print("Seq 1:")
    print(np.array(ss1))
    print("Seq 2:")
    print(np.array(ss2), '\n\n')

In [42]:
class Aligner:
  """Local alignment of two sequences of commit vectors."""
  # Problem: Given two sequences of commits, find a correspondence 
  # mapping between the commits (in their vector form) that correspond 
  # to the same intent (meaning) within the context. In other words,
  # we are interested in finding the inter-developer-activity dependencies
  # between these two sequences. These sequences may represent trajectories
  # or even a sequence of code landmarks for all revisions in a repository.
  
  # **Planned Use Case:** given a documented social event and two 
  # repositories' revision histories, say TensorFlow and PyTorch, we want to 
  # find whether there is a semantic correspondence between the two repositories' 
  # revision histories matching the well-documented social event. The latter would
  # be verified manually by a group of humans.
  
  # This idea is inspired by the Smith-Waterman algorithm commonly used 
  # in bio-informatics for local alignment of genetic sequences.
  # See trajectories.txt for additional details.

  def __init__(self, kernel_size: int = 1) -> None:
    # radius handed to apply_seq_kernel for both sequences
    self.kernel_size = kernel_size

  @staticmethod
  def _as_alignment_array(alignments: list) -> np.ndarray:
    # Convert the list of alignments to an array that can be fancy-indexed.
    # Alignments usually differ in length; recent NumPy versions raise a
    # ValueError for such ragged input, so a 1-D object array (one alignment
    # per element) is built explicitly in that case.
    try:
      return np.array(alignments)
    except ValueError:
      packed = np.empty(len(alignments), dtype=object)
      for position, alignment in enumerate(alignments):
        packed[position] = alignment
      return packed

  def align(self, V_i: np.ndarray, V_j: np.ndarray, w: ty.Tuple[float] = (0.25,0.25)) -> None:
    """Align two sequences of vectors and store the scored alignments.

    Args:
      V_i: first sequence of vectors
      V_j: second sequence of vectors
      w: gap weights (w_i, w_j) handed to scoring_matrix

    Results are stored on the instance: `commit_vectors`, `all_alignments`,
    `alignments`, `alignment_scores` and `sorted_scores` (indices of the
    alignments from the highest to the lowest score).
    """
    # **Note:** This function currently aligns the first two sequences or partitions in P_K.
    # Future TODO(HAS) make the selection of partitions user-selectable.

    # Gather vector representation for commits in each V_k
    self.commit_vectors = [V_i, V_j]

    # Apply sequence kernels of radius kernel_size to both sequences
    v_i_1 = apply_seq_kernel(self.commit_vectors[0], self.kernel_size)
    v_j_1 = apply_seq_kernel(self.commit_vectors[1], self.kernel_size)

    # Calculate cosine similarity between the two smoothed sequences
    cos_dist = distance_matrix(v_i_1, v_j_1)

    # Calculate scoring matrix for sequence alignment
    score = scoring_matrix(cos_dist, w_i = w[0], w_j = w[1])

    # Find all alignments of len > 1
    alignments = traceback(score, k=None)

    # Earlier alignments get a larger weight: 1 - (position / total).
    # NOTE(review): the raw first sequence is scored against the smoothed
    # second sequence -- confirm that this asymmetry is intended.
    all_alignments = []
    alignment_scores = []
    for position, alignment in enumerate(alignments):
      all_alignments.append(alignment)
      alignment_scores.append(
        score_alignment(alignment, self.commit_vectors[0], v_j_1, 1 - (position / len(alignments))))

    assert len(alignment_scores) == len(all_alignments), "Should be the same, right?"

    self.all_alignments = all_alignments

    # Sort alignment scores while tracking their indices
    self.sorted_scores = np.argsort(alignment_scores)[::-1]

    # Compile top results
    self.alignments = self._as_alignment_array(alignments)
    self.alignment_scores = np.array(alignment_scores)


  def compile_top_results(self, k: int = 10) -> ty.Tuple[np.ndarray, np.ndarray]:
    """Return the k best alignments and their scores (requires a prior align() call)."""
    best = self.sorted_scores[:k].astype(int)
    top_alignments = self.alignments[best]
    top_scores = self.alignment_scores[best]

    assert len(self.commit_vectors) == 2

    return top_alignments, top_scores

### &#9759; Sequence Generation Functions

In [43]:
def create_revision_history_dataframe(rev_history: np.ndarray, rev_timeline: Dict) -> pd.DataFrame:
    """Turn the revision history records into a dataframe.

    Args:
      rev_history: commit records exposing commit_hash, rank, density, effort,
        importance, coverage and quality
      rev_timeline: mapping from commit hash to the time of the commit

    Returns:
      A dataframe with one row per record; the 'timeline' column holds the
      commit time converted to np.datetime64.
    """
    rows = [
        {
            'commit_hash': commit.commit_hash,
            'timeline': np.datetime64(rev_timeline[commit.commit_hash]),
            'rank': commit.rank,
            'density': commit.density,
            'effort': commit.effort,
            'importance': commit.importance,
            'coverage': commit.coverage,
            'quality': commit.quality,
        }
        for commit in rev_history
    ]

    return pd.DataFrame(rows)

In [44]:
def generate_and_fuse(df_activity: pd.DataFrame, 
                      df_contributions: pd.DataFrame, 
                      start_time: pd.Timestamp, 
                      end_time: pd.Timestamp, 
                      date_offset: pd.DateOffset, 
                      alpha: float = 0.5) -> pd.DataFrame:
    """Main method which fuses developer activities and code contribution vectors
       present within a given time window.

    The interval [start_time, end_time] is walked in consecutive windows of
    length date_offset. For every window with usable data, one code
    contribution summary vector is computed and fused with the activity
    summary vector of each developer active in that window.

    Args:
      df_activity: developer activity dataframe
      df_contributions: code contributions dataframe
      start_time: starting window of operations
      end_time: ending window of operations
      date_offset: offset to move the time window
      alpha: weight of the first record in both summary vectors (forwarded to
        rev_pred), default 0.5

    Returns:
      The resulting dataframe containing the fusion results, one row per
      (developer, time window) with the columns sender_id, start_time,
      end_time, hashes_vector, activities_vector and fusion_vector.
    """

    range_time = start_time + date_offset

    fusion_results_list = []

    cnt_time_range = 0
    cnt_correct_dev_df = 0
    cnt_correct_cc_df = 0

    while range_time <= end_time:
        # 0. collect all developers (D) with activity records in [start_time, range_time]
        df_developer_activity = get_records_in_time_window(
            input_df=df_activity, 
            time_column='sent_time', 
            start_time=start_time, 
            end_time=range_time)

        ## 0.1. drop all nan records from the df in step 0.
        df_developer_activity = df_developer_activity.dropna()

        ## proceed to next step if there is at least one record in the activity df
        if df_developer_activity.shape[0] > 0:
            ## 0.2. collect all the unique developers, np.ndarray
            developers = df_developer_activity.sender_id.unique()

            cnt_correct_dev_df += 1

            # 1. collect all the code contribution records in [start_time, range_time]
            df_code_contributions = get_records_in_time_window(
                input_df=df_contributions, 
                time_column='timeline', 
                start_time=start_time, 
                end_time=range_time)

            ## 1.1. drop all nan records from the df in step 1.
            df_code_contributions = df_code_contributions.dropna()

            ## proceed to next step if there is at least one record in the code contributions df
            if df_code_contributions.shape[0] > 0:
                ## 1.2. filter code contribution records to only columns of interest
                df_cc_interest = select_columns_from_dataframe(
                    input_df=df_code_contributions, 
                    columns=['commit_hash', 'rank', 'density', 'effort', 'importance', 'coverage', 'quality'])

                ## 1.2.1. create a numpy array of commit hashes within the time period of interest
                commit_hashes_array = df_cc_interest.commit_hash.to_numpy()

                ## 1.2.2. drop the 'commit_hash' column from df_cc_interest, not needed for fusion
                df_cc_interest = df_cc_interest.drop('commit_hash', axis=1)

                # 2. create code contribution summary vector (CCSV) for data collected in 1.1
                ## 2.1. convert df_cc_interest to numpy ndarray
                np_cc_interest = df_cc_interest.to_numpy()

                ## 2.2. remove all-zero rows
                np_cc_interest = remove_empty_rows(input_array=np_cc_interest)

                if len(np_cc_interest) > 0:
                    ## 2.3. summary vector (CCSV), weighted by the `alpha` argument
                    cc_summary_vector = rev_pred(r_i=np_cc_interest, alpha=alpha)

                    ## 2.4. proceed if there is no nan value in the summary vector
                    if not np.isnan(cc_summary_vector).any():
                        cnt_correct_cc_df += 1

                        # 3. for each d in D
                        for dev in developers:
                            ## 3.1. collect all activity records of d in [start_time, range_time]
                            df_dev_activities = get_records(
                                input_df=df_developer_activity, 
                                condition_column='sender_id', 
                                condition_value=dev)

                            ## 3.2. filter developer activities to only columns of interest
                            df_da_interest = select_columns_from_dataframe(
                                input_df=df_dev_activities, 
                                columns=['activity_label', 'Code Contribution', 'Knowledge Sharing', 'Patch Posting', 'Progress Control', 'Acknowledgement and Response'])

                            ## 3.2.1. create a numpy array of developer activities within the time period of interest
                            dev_activities_array = df_da_interest.activity_label.to_numpy()

                            ## 3.2.2. drop the 'activity_label' column from the df_da_interest, not needed for fusion
                            df_da_interest = df_da_interest.drop('activity_label', axis=1)

                            ## 3.3. convert the dataframe in 3.2 into a numpy ndarray
                            np_da_interest = df_da_interest.to_numpy()

                            ## 3.4. remove all-zero rows
                            np_da_interest = remove_empty_rows(input_array=np_da_interest)

                            if len(np_da_interest) > 0:
                                ## 3.5. create activity summary vector (ASV) for data collected in 3.4.
                                ac_summary_vector = rev_pred(r_i=np_da_interest, alpha=alpha)

                                ## 3.6. proceed if there is no nan value in the summary vector
                                if not np.isnan(ac_summary_vector).any():
                                    ## 3.7. fusion of (CCSV, ASV)
                                    fusion_vector = tensor_fusion(h_x=cc_summary_vector, h_y=ac_summary_vector)

                                    ## 3.8. fusion ndarray flattened, 
                                    ## see: https://numpy.org/doc/stable/reference/generated/numpy.ndarray.flatten.html
                                    fusion_flatten = fusion_vector.flatten()

                                    ## 3.9. append dictionary to fusion_results_list
                                    fusion_results_list.append(
                                        {'sender_id': dev, 
                                         'start_time': start_time, 
                                         'end_time': range_time, 
                                         'hashes_vector': commit_hashes_array.flatten(), 
                                         'activities_vector': dev_activities_array.flatten(),
                                         'fusion_vector': fusion_flatten})

        if cnt_time_range % 10 == 0:
            print(f"- Processed {cnt_time_range} time windows!")

        # slide the window: the next one starts where this one ended
        cnt_time_range += 1
        start_time = range_time
        range_time = start_time + date_offset


    print(f"- Total time windows: {cnt_time_range}.\n- Total correct dev windows: {cnt_correct_dev_df}.\n- Total correct code contribution windows: {cnt_correct_cc_df}")
    df_output_dataset = pd.DataFrame(fusion_results_list)

    return df_output_dataset

### &#9759; Alignment Functions

In [174]:
def compute_alignments(input_dict: Dict[int, Dict[str, Any]], target_dev: int, w: Tuple[float, float] = (0.25,0.25), k: int = 1) -> Tuple[Any, Any, Any, np.ndarray]:
    """Align the target developer's fusion sequence against every other developer.

    Args:
        input_dict: per-developer records keyed by developer id; each record is
            read through its 'fusion' and 'hashes' entries.
        target_dev: id of the developer whose sequence is aligned.
        w: weights forwarded unchanged to ``Aligner.align``.
        k: number of top results requested from ``Aligner.compile_top_results``.

    Returns:
        ``(top_alignments, top_scores, v_i_hashes, v_j_hashes)`` where the first
        two items come from ``Aligner.compile_top_results``, ``v_i_hashes`` is the
        target developer's 'hashes' entry and ``v_j_hashes`` is the concatenation
        of the 'hashes' entries of all other developers (in ``input_dict`` order).

    Raises:
        AssertionError: if ``target_dev`` is not a key of ``input_dict``.
        ValueError: if ``input_dict`` holds no developer other than ``target_dev``.
    """
    assert target_dev in input_dict, f"developer {target_dev} is not present in input_dict"

    target_record = input_dict[target_dev]
    v_i = target_record['fusion']
    v_i_hashes = target_record['hashes']

    # everything the target is compared against, in dictionary order
    other_devs = [dev for dev in input_dict if dev != target_dev]
    if not other_devs:
        # np.concatenate would fail with an unrelated-looking message; be explicit
        raise ValueError(
            f"cannot align developer {target_dev}: input_dict contains no other developer")

    # the same developer order is used for both arrays so positions in v_j map to v_j_hashes
    v_j = np.concatenate([input_dict[dev]['fusion'] for dev in other_devs])
    v_j_hashes = np.concatenate([input_dict[dev]['hashes'] for dev in other_devs])

    # the kernel spans the whole target sequence
    commit_aligner = Aligner(kernel_size=len(v_i))
    commit_aligner.align(v_i, v_j, w=w)
    top_alignments, top_scores = commit_aligner.compile_top_results(k=k)

    return top_alignments, top_scores, v_i_hashes, v_j_hashes

In [144]:
def hover_content(input_df: pd.DataFrame) -> List[str]:
    """Build one HTML hover label per row of ``input_df``.

    Each label lists the row's norm_value, effort, density, importance,
    coverage and quality, formatted with three decimals and separated by
    ``<br />``. The labels are returned in row order.
    """
    return [
        f"norm: {record['norm_value']:.3f}<br />effort: {record['effort']:.3f}<br />density: {record['density']:.3f}<br />importance: {record['importance']:.3f}<br />coverage: {record['coverage']:.3f}<br />quality: {record['quality']:.3f}"
        for _, record in input_df.iterrows()
    ]

In [169]:
def plot_alignments(xx, yy, target_x, target_y, target_hover, remainder_x, remainder_y, remainder_hover):
    """Render the alignment figure with plotly and show it.

    ``xx`` and ``yy`` are parallel sequences of ``(x, y)`` points; each pair is
    joined by a light-gray segment that is hidden from the legend. The target
    series (red, 'Selected Developer') and the remainder series (blue,
    'All Developers') are drawn afterwards, each with its own hover texts.
    Nothing is returned; the figure is displayed via ``fig.show()``.
    """
    figure = go.Figure()

    # connectors first, so the two developer curves are added after them
    for target_point, other_point in zip(xx, yy):
        connector = go.Scatter(
            x=[target_point[0], other_point[0]],
            y=[target_point[1], other_point[1]],
            line={"color": "lightgray"},
            showlegend=False,
        )
        figure.add_trace(connector)

    # (legend name, line color, x values, y values, hover texts)
    curves = (
        ('Selected Developer', "red", target_x, target_y, target_hover),
        ('All Developers', "blue", remainder_x, remainder_y, remainder_hover),
    )
    for legend_name, line_color, xs, ys, hover_texts in curves:
        figure.add_trace(
            go.Scatter(
                x=xs,
                y=ys,
                name=legend_name,
                line={"color": line_color},
                hovertext=hover_texts,
                hoverinfo="text",
            )
        )

    figure.show()

In [173]:
# Constant added to the selected developer's norm values before plotting.
# NOTE(review): presumably lifts the red curve above the blue one so the two
# do not overlap — confirm the intended value.
_TARGET_NORM_OFFSET = 0.35


def _index_rows_by_hash(revisions: pd.DataFrame) -> Dict[str, Dict[str, Any]]:
    """Map each commit hash to its row index and norm value (a later duplicate overwrites an earlier one)."""
    return {
        row.commit_hash: {'index': idx, 'norm_value': row.norm_value}
        for idx, row in revisions.iterrows()
    }


def compute_and_visualize_alignments(input_df: pd.DataFrame, target_year: int, period_start: int, period_end: int, target_dev_id: int, top_k: int) -> None:
    """Align one developer against all others for a span of weeks and plot the result.

    Steps: filter ``input_df`` to the requested year/weeks, build per-developer
    sequences with ``developer_sequences``, align them with
    ``compute_alignments``, print the alignments, and draw them with
    ``plot_alignments`` on top of the revision-history norm values.

    This function reads the notebook-level ``df_revision_history`` frame, which
    must already carry the 'norm_value' column plus the columns used by
    ``hover_content``.

    Args:
        input_df: fusion dataset with 'start_time', 'end_time' (datetime) and
            'sender_id' columns.
        target_year: year the windows must fall in; also forwarded to
            ``get_revision_history_data``.
        period_start: first ISO week (inclusive), compared against 'start_time'.
        period_end: last ISO week (inclusive), compared against 'end_time'.
        target_dev_id: id of the developer to align against everyone else.
        top_k: number of top alignments to compute and draw.
    """
    start_iso = input_df['start_time'].dt.isocalendar()
    end_iso = input_df['end_time'].dt.isocalendar()

    # Week numbers repeat every year, so the year has to be part of the filter;
    # otherwise windows of other years leak in and their hashes are missing from
    # the revision data fetched for `target_year` below.
    # NOTE(review): the ISO year is paired with the ISO week here — confirm that
    # get_revision_history_data uses the same year convention.
    in_period = (
        (start_iso.year == target_year)
        & (end_iso.year == target_year)
        & (start_iso.week >= period_start)
        & (end_iso.week <= period_end)
    )
    tmp_df = input_df.loc[in_period]

    no_devs = tmp_df.sender_id.unique()

    print(f"Year: {target_year}\nStarting Week: {period_start}\nEnding Week: {period_end}\nResulting dataframe records: {tmp_df.shape[0]}\nNo. of unique devs in this period: {len(no_devs)}")

    print("- Producing developer fusion sequences...")
    dev_fusion_seq_dict = developer_sequences(input_df=tmp_df, start_period=period_start, end_period=period_end)

    print("- Computing alignments...")
    ta_0, ts_0, vi0_h, vj0_h = compute_alignments(input_dict=dev_fusion_seq_dict, target_dev=target_dev_id, k=top_k)

    print("- Resulting alignments:")
    for alignment, score in zip(ta_0, ts_0):
        print(f"-- Alignments at index: {alignment}\talignment score: {score:.3f}")
        # each tuple is (position in the target hashes, position in the other developers' hashes)
        for tpl in alignment:
            print(tpl, vi0_h[tpl[0]], vj0_h[tpl[1]])

    print("- Visualizing results")
    # target developer: reset_index returns a fresh frame, so the offset below
    # never touches the frame handed back by get_revision_history_data
    D0 = get_revision_history_data(
        input_df=df_revision_history,
        year=target_year,
        start_period=period_start,
        end_period=period_end,
        hashes=list(set(vi0_h))).reset_index(drop=True)
    D0['norm_value'] = D0['norm_value'] + _TARGET_NORM_OFFSET

    res_d0 = _index_rows_by_hash(D0)

    # remainder of developers
    A = get_revision_history_data(
        input_df=df_revision_history,
        year=target_year,
        start_period=period_start,
        end_period=period_end,
        hashes=vj0_h).reset_index(drop=True)

    res_A = _index_rows_by_hash(A)

    # end points of the connector drawn for every aligned pair
    xx = []
    yy = []
    for alignment in ta_0:
        for tpl in alignment:
            target_point = res_d0[vi0_h[tpl[0]]]
            other_point = res_A[vj0_h[tpl[1]]]
            xx.append((target_point['index'], target_point['norm_value']))
            yy.append((other_point['index'], other_point['norm_value']))

    # visualization via plotly, with per-point hover texts
    plot_alignments(
        xx=xx,
        yy=yy,
        target_x=D0.index,
        target_y=D0.norm_value,
        target_hover=hover_content(input_df=D0),
        remainder_x=A.index,
        remainder_y=A.norm_value,
        remainder_hover=hover_content(input_df=A))

## &#9749; Download data

### &#9759; Linux Kernel Data

In [46]:
# linux kernel (.zip) data files
# The Google Drive file id is looked up in DATA_DICT; the helper's return value
# is kept because the "Load Data" section opens it with zipfile.ZipFile.
linux_kernel_data = download_data_from_google_drive(
    google_file_id=DATA_DICT['linux-kernel-data'], 
    output_file_name="linux-kernel-data.zip", 
    quiet_download=False)

Downloading...
From: https://drive.google.com/uc?id=1h1AGfQkOhvgtcCR8tWVzObBSpBTURWM2
To: /content/linux-kernel-data.zip
100%|██████████| 84.6M/84.6M [00:02<00:00, 37.5MB/s]


### &#9759; Developer Activities Data

In [47]:
# activity dataframe
# The Google Drive file id is looked up in DATA_DICT; the helper's return value
# is kept because the "Load Data" section passes it to pd.read_csv.
activities_data = download_data_from_google_drive(
    google_file_id=DATA_DICT['activity_triplets_V1_02182022'], 
    output_file_name="activity_triplets_V1_02182022.csv", 
    quiet_download=False)

Downloading...
From: https://drive.google.com/uc?id=1BUd2sAUP04Jf1Qnin0uhMeAXaN3gCyTy
To: /content/activity_triplets_V1_02182022.csv
100%|██████████| 9.53M/9.53M [00:00<00:00, 38.8MB/s]


## &#128722; Load Data

### &#9759; Linux Kernel Data

In [48]:
# Extract the downloaded archive into ./linux_kernel.
# The path returned by the download helper is used as-is (the same way
# `activities_data` is handed to pd.read_csv); prefixing it with './' would
# turn an absolute path into a wrong relative one.
with zipfile.ZipFile(linux_kernel_data, 'r') as lk_data:
    lk_data.extractall('./linux_kernel')

In [49]:
path_linux_kernel_data = os.path.join(os.getcwd(), 'linux_kernel')
path_linux_kernel_data

'/content/linux_kernel'

In [50]:
os.listdir(path_linux_kernel_data)

['augmented_processed_dev_df_AUG-NOV-2020_1.csv',
 '20220525_augmented_processed_dev_df_AUG-NOV-2020.csv',
 'commit-hash-2-files_1.json',
 'authors_1.csv',
 'commit-summary-series_1.csv',
 'revision-quality_2.csv',
 'authors_aliases.csv',
 'query_results.csv',
 'authors-karma_1.csv',
 'results.csv',
 'commits-quality_2.csv',
 'revision-quality_1.csv',
 'code_landmarks_1.csv',
 'commit-summary-series.csv',
 'results_1.csv',
 'P_K_preds.json',
 'commits-quality_1.csv',
 'authors-karma_3.csv',
 'commits-quality.csv',
 'authors_2.csv',
 'authors_3.csv',
 '__MACOSX',
 'P_K_preds_1.json',
 'authors_dedup.csv',
 'commit-hash-2-files.json',
 'authors.csv',
 'authors-karma.csv',
 'authors-karma_2.csv',
 'revision-quality.csv',
 'code_landmarks.csv',
 'query_results_1.csv',
 'augmented_processed_dev_df_AUG-NOV-2020_v1_20220528.csv']

#### &#9758; LK Results Data

In [51]:
df_lk_results = pd.read_csv(os.path.join(path_linux_kernel_data, 'results.csv'))

In [52]:
df_lk_results['u_time'] = pd.to_datetime(df_lk_results['u_time'], utc=True)

In [53]:
df_lk_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27268 entries, 0 to 27267
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   commit_hash  27268 non-null  object             
 1   author_name  27268 non-null  object             
 2   u_time       27268 non-null  datetime64[ns, UTC]
 3   commit_type  27268 non-null  object             
 4   commit_size  27268 non-null  object             
 5   activity     27268 non-null  object             
 6   density      27268 non-null  float64            
 7   effort       27268 non-null  float64            
 8   importance   27268 non-null  float64            
 9   coverage     27268 non-null  float64            
 10  additions    27268 non-null  int64              
 11  deletions    27268 non-null  int64              
dtypes: datetime64[ns, UTC](1), float64(4), int64(2), object(5)
memory usage: 2.5+ MB


In [54]:
df_lk_results.describe()

Unnamed: 0,density,effort,importance,coverage,additions,deletions
count,27268.0,27268.0,27268.0,27268.0,27268.0,27268.0
mean,0.282763,0.220208,0.032981,0.072243,16.282309,7.187252
std,0.367667,0.263625,0.096999,0.129588,321.087928,76.016061
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.5,0.500061,0.02,0.1,4.0,2.0
max,1.0,1.0,1.0,1.0,47514.0,7518.0


In [55]:
df_lk_results.head()

Unnamed: 0,commit_hash,author_name,u_time,commit_type,commit_size,activity,density,effort,importance,coverage,additions,deletions
0,11740ef4482914fcd8c9814ef7ceb7085715e554,Andy Grover,2011-01-13 11:40:31+00:00,Mixed commit,Big,Fix code,0.98,0.500003,0.009091,0.056604,49,1
1,d6c8204659eb1846c444997ee28fe9d7e5442f4e,Russell King,2016-08-31 00:49:53+00:00,Mixed commit,Small,Fix code,0.666667,0.5,0.019231,0.018868,2,1
2,658829dfe75c49e879e0c4c9cbcd3bd1e4fbdcf5,Geliang Tang,2017-05-06 08:37:20+00:00,Deletion commit,Small,Delete old code,0.0,0.0,0.166667,0.037736,0,2
3,22648c989cb8305f51b96b5962df8674697bb2ab,Siva Durga Prasad Paladugu,2018-02-08 02:02:45+00:00,Mixed commit,Small,Fix code,0.384615,0.500001,0.333333,0.018868,5,8
4,d9e5582c4bb219f3459e39f65410f0e5128fbe91,Baolin Wang,2018-04-24 05:06:12+00:00,Mixed commit,Big,Fix code,0.550847,0.500019,0.003279,0.169811,65,53


In [56]:
df_lk_results.commit_type.value_counts()

Mixed commit       8953
Empty commit       8275
commitless         6669
Addition commit    2596
Deletion commit     769
Single commit         5
Initial commit        1
Name: commit_type, dtype: int64

#### &#9758; Revision History Data

In [57]:
path_lk = pathlib.Path(path_linux_kernel_data)

In [58]:
revision_history, revision_timeline = get_revision_history(outdir=path_lk)

In [59]:
df_revision_history = create_revision_history_dataframe(rev_history=revision_history, rev_timeline=revision_timeline)

In [60]:
df_revision_history['timeline'] = pd.to_datetime(df_revision_history['timeline'], utc=True)

In [61]:
df_revision_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12324 entries, 0 to 12323
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   commit_hash  12324 non-null  object             
 1   timeline     12324 non-null  datetime64[ns, UTC]
 2   rank         12324 non-null  float64            
 3   density      12324 non-null  float64            
 4   effort       12324 non-null  float64            
 5   importance   12324 non-null  float64            
 6   coverage     12324 non-null  float64            
 7   quality      12324 non-null  float128           
dtypes: datetime64[ns, UTC](1), float128(1), float64(5), object(1)
memory usage: 866.7+ KB


In [62]:
df_revision_history.head()

Unnamed: 0,commit_hash,timeline,rank,density,effort,importance,coverage,quality
0,11740ef4482914fcd8c9814ef7ceb7085715e554,2011-01-13 11:40:31+00:00,-0.001,0.98,0.500003,0.009091,0.952574,0.731059
1,d6c8204659eb1846c444997ee28fe9d7e5442f4e,2016-08-31 00:49:53+00:00,-0.001,0.666667,0.5,0.019231,0.731059,0.731059
2,658829dfe75c49e879e0c4c9cbcd3bd1e4fbdcf5,2017-05-06 08:37:20+00:00,-0.002,0.0,0.0,0.166667,0.880797,0.731059
3,22648c989cb8305f51b96b5962df8674697bb2ab,2018-02-08 02:02:45+00:00,-0.001,0.384615,0.500001,0.333333,0.731059,0.731059
4,d9e5582c4bb219f3459e39f65410f0e5128fbe91,2018-04-24 05:06:12+00:00,-0.001,0.550847,0.500019,0.003279,0.999877,0.731059


### &#9759; Activities Data

In [64]:
df_activities = pd.read_csv(activities_data, sep='\t')

In [65]:
# convert 'sent_time' to pandas datetime64 time format
df_activities['sent_time'] = pd.to_datetime(df_activities['sent_time'], utc=True)

# sort the content of the DataFrame by 'sent_time'
df_activities = df_activities.sort_values(by=['sent_time'])

In [66]:
df_activities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27411 entries, 24091 to 3614
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype              
---  ------                        --------------  -----              
 0   Unnamed: 0                    27411 non-null  int64              
 1   Unnamed: 0.1                  27411 non-null  int64              
 2   Unnamed: 0.1.1                27411 non-null  int64              
 3   sender_id                     27411 non-null  int64              
 4   sent_time                     27411 non-null  datetime64[ns, UTC]
 5   Code Contribution             27411 non-null  float64            
 6   Knowledge Sharing             27411 non-null  float64            
 7   Patch Posting                 27411 non-null  float64            
 8   Progress Control              27411 non-null  float64            
 9   Acknowledgement and Response  27411 non-null  float64            
 10  Composite Index               2

In [67]:
df_activities.describe()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,sender_id,Code Contribution,Knowledge Sharing,Patch Posting,Progress Control,Acknowledgement and Response,Composite Index,Rank,comb_2,comb_3,is_triage,is_bug_fix,is_controversial
count,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0,27411.0
mean,13705.0,13705.0,13705.0,466.76141,0.249106,0.002233,0.106345,0.032836,0.031109,0.421628,13706.0,1.30827,3.640911,0.49243,0.375835,0.119332
std,7913.018451,7913.018451,7913.018451,535.754602,0.069497,0.006065,0.054297,0.021204,0.015424,0.095375,7913.018451,0.981164,0.85643,0.499952,0.484346,0.324184
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042519,1.0,0.0,0.0,0.0,0.0,0.0
25%,6852.5,6852.5,6852.5,39.0,0.260254,0.000728,0.062329,0.024287,0.022645,0.375485,6853.5,1.0,3.0,0.0,0.0,0.0
50%,13705.0,13705.0,13705.0,254.0,0.269938,0.001165,0.12623,0.029335,0.025764,0.449596,13706.0,1.0,4.0,0.0,0.0,0.0
75%,20557.5,20557.5,20557.5,737.0,0.278295,0.002111,0.153634,0.034573,0.029421,0.486896,20558.5,1.0,4.0,1.0,1.0,0.0
max,27410.0,27410.0,27410.0,2223.0,0.317105,0.263279,0.176351,0.158647,0.084617,0.695572,27411.0,9.0,9.0,1.0,1.0,1.0


In [68]:
df_activities.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,sender_id,sent_time,Code Contribution,Knowledge Sharing,Patch Posting,Progress Control,Acknowledgement and Response,...,comb_2,comb_3,is_triage,is_bug_fix,is_controversial,activity_label,project_thread,sendername_thread,triplet_one,triplet_two
24091,24091,24091,24091,0,2020-08-20 09:35:52+00:00,0.262744,0.002416,0.000969,0.032951,0.022704,...,2,5,0,0,1,Code Contribution,project_name_undefined_unknown,Greg_Kroah-Hartman_unknown,Code_Contribution_project_name_undefined_unknown,Code_Contribution_Greg_Kroah-Hartman_unknown
24807,24807,24807,24807,0,2020-08-20 09:35:55+00:00,0.243353,0.001691,0.000833,0.028051,0.021198,...,2,5,0,0,1,Code Contribution,project_name_undefined_unknown,Greg_Kroah-Hartman_unknown,Code_Contribution_project_name_undefined_unknown,Code_Contribution_Greg_Kroah-Hartman_unknown
22403,22403,22403,22403,0,2020-08-20 09:36:02+00:00,0.283699,0.000956,0.000653,0.03624,0.022395,...,2,5,0,0,1,Code Contribution,project_name_undefined_unknown,Greg_Kroah-Hartman_unknown,Code_Contribution_project_name_undefined_unknown,Code_Contribution_Greg_Kroah-Hartman_unknown
24838,24838,24838,24838,0,2020-08-20 09:36:06+00:00,0.249099,0.000994,0.000669,0.014546,0.022657,...,3,5,0,0,1,Code Contribution,project_name_undefined_unknown,Greg_Kroah-Hartman_unknown,Code_Contribution_project_name_undefined_unknown,Code_Contribution_Greg_Kroah-Hartman_unknown
23953,23953,23953,23953,0,2020-08-20 09:36:09+00:00,0.282001,0.001464,0.000143,0.016604,0.023747,...,3,5,0,0,1,Code Contribution,project_name_undefined_unknown,Greg_Kroah-Hartman_unknown,Code_Contribution_project_name_undefined_unknown,Code_Contribution_Greg_Kroah-Hartman_unknown


## &#9763; Sequence Generation and Fusion

**np.kron** computes the Kronecker product, a composite array made of blocks of the second array scaled by the first (see Ref. 1).

**np.kron** assumes that the number of dimensions of `a` and `b` are the same, if necessary prepending the smallest with ones (see Ref. 1).

In [69]:
# Parameters
# the processing range runs from the earliest to the latest activity timestamp
START_TIME = df_activities.sent_time.min()
END_TIME = df_activities.sent_time.max()

print(f"Operating time range: {START_TIME} - {END_TIME}")

# width of each time window: generate_and_fuse advances its window by this offset
# refer to: https://pandas.pydata.org/docs/reference/api/pandas.tseries.offsets.DateOffset.html
# for different offsets, e.g., seconds, minutes, days, years,...
DATE_OFFSET = pd.DateOffset(hours=12)

# passed to generate_and_fuse as `alpha`
# NOTE(review): its meaning is defined by rev_pred — confirm before tuning
ALPHA=0.5

Operating time range: 2020-08-20 09:35:52+00:00 - 2020-11-23 14:16:07+00:00


In [70]:
df_output = generate_and_fuse(
    df_activity=df_activities, 
    df_contributions=df_revision_history, 
    start_time=START_TIME, 
    end_time=END_TIME, 
    date_offset=DATE_OFFSET, 
    alpha=ALPHA)

- Processed 0 time windows!
- Processed 10 time windows!
- Processed 20 time windows!
- Processed 30 time windows!
- Processed 40 time windows!
- Processed 50 time windows!
- Processed 60 time windows!
- Processed 70 time windows!
- Processed 80 time windows!
- Processed 90 time windows!
- Processed 100 time windows!
- Processed 110 time windows!
- Processed 120 time windows!
- Processed 130 time windows!
- Processed 140 time windows!
- Processed 150 time windows!
- Processed 160 time windows!
- Processed 170 time windows!
- Processed 180 time windows!
- Total time windows: 190.
- Total correct dev windows: 91.
- Total correct code contribution windows: 86


In [71]:
df_output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3382 entries, 0 to 3381
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   sender_id          3382 non-null   int64              
 1   start_time         3382 non-null   datetime64[ns, UTC]
 2   end_time           3382 non-null   datetime64[ns, UTC]
 3   hashes_vector      3382 non-null   object             
 4   activities_vector  3382 non-null   object             
 5   fusion_vector      3382 non-null   object             
dtypes: datetime64[ns, UTC](2), int64(1), object(3)
memory usage: 158.7+ KB


In [72]:
df_output.head()

Unnamed: 0,sender_id,start_time,end_time,hashes_vector,activities_vector,fusion_vector
0,0,2020-08-20 09:35:52+00:00,2020-08-20 21:35:52+00:00,"[59c0d31988fb366189502a8ac66b7fe1486b7e40, 4a5...","[Code Contribution, Code Contribution, Code Co...","[-0.0010764808756718475061, -0.000287765372909..."
1,1,2020-08-20 09:35:52+00:00,2020-08-20 21:35:52+00:00,"[59c0d31988fb366189502a8ac66b7fe1486b7e40, 4a5...","[Code Contribution, Code Contribution, Code Co...","[-0.0010764808756718475061, -0.000286236232147..."
2,3,2020-08-20 09:35:52+00:00,2020-08-20 21:35:52+00:00,"[59c0d31988fb366189502a8ac66b7fe1486b7e40, 4a5...","[Code Contribution, Code Contribution]","[-0.0010764808756718475061, -0.000273482397709..."
3,4,2020-08-20 09:35:52+00:00,2020-08-20 21:35:52+00:00,"[59c0d31988fb366189502a8ac66b7fe1486b7e40, 4a5...","[Code Contribution, Code Contribution, Code Co...","[-0.0010764808756718475061, -0.000291447955136..."
4,7,2020-08-20 09:35:52+00:00,2020-08-20 21:35:52+00:00,"[59c0d31988fb366189502a8ac66b7fe1486b7e40, 4a5...","[Code Contribution, Code Contribution, Code Co...","[-0.0010764808756718475061, -0.000293893192276..."


In [73]:
df_output.sender_id.value_counts().sort_values()

2183     1
2139     1
2137     1
2053     1
105      1
        ..
142     31
139     32
101     36
8       37
39      59
Name: sender_id, Length: 993, dtype: int64

In [74]:
# 7. save the dataframe to a .csv file
# NOTE: the file is tab-separated despite its .csv extension, and the DataFrame
# index is written too (to_csv default), so read it back with sep='\t' and
# index_col=0 to avoid an extra 'Unnamed: 0' column.
if SAVE_RESULTS:
    print("Saving dataframe...")
    path_output_file = f"fusion_sequences-{str(TODAYS_DATE)}.csv"
    df_output.to_csv(path_output_file, sep='\t')
    
    # report whether the file was actually written
    check_file_status(input_path=path_output_file)

Saving dataframe...
- File fusion_sequences-2022-09-14.csv exists locally at fusion_sequences-2022-09-14.csv!


## &#127919; Alignment

In [152]:
# add the week of the year column as 'period_week' to df_revision_history
df_revision_history['period_week'] = df_revision_history.timeline.dt.isocalendar().week

In [153]:
# compute the Euclidean norm of each row over the metrics rank ... quality
# The columns are selected by name (not by position) so the result does not
# depend on the column order, and the norm is computed for all rows at once
# instead of one Python call per row. np.longdouble keeps the extended
# precision of the 'quality' column.
NORM_COLUMNS = ['rank', 'density', 'effort', 'importance', 'coverage', 'quality']
df_revision_history['norm_value'] = np.linalg.norm(
    df_revision_history[NORM_COLUMNS].to_numpy(dtype=np.longdouble), axis=1)

In [154]:
df_revision_history.head()

Unnamed: 0,commit_hash,timeline,rank,density,effort,importance,coverage,quality,period_week,norm_value
0,11740ef4482914fcd8c9814ef7ceb7085715e554,2011-01-13 11:40:31+00:00,-0.001,0.98,0.500003,0.009091,0.952574,0.731059,2,1.628598
1,d6c8204659eb1846c444997ee28fe9d7e5442f4e,2016-08-31 00:49:53+00:00,-0.001,0.666667,0.5,0.019231,0.731059,0.731059,35,1.328047
2,658829dfe75c49e879e0c4c9cbcd3bd1e4fbdcf5,2017-05-06 08:37:20+00:00,-0.002,0.0,0.0,0.166667,0.880797,0.731059,18,1.156733
3,22648c989cb8305f51b96b5962df8674697bb2ab,2018-02-08 02:02:45+00:00,-0.001,0.384615,0.500001,0.333333,0.731059,0.731059,6,1.256159
4,d9e5582c4bb219f3459e39f65410f0e5128fbe91,2018-04-24 05:06:12+00:00,-0.001,0.550847,0.500019,0.003279,0.999877,0.731059,17,1.444875


In [165]:
# plotly likes float32 data types
df_revision_history = df_revision_history.astype({"norm_value": 'float32'})

In [166]:
df_revision_history.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12324 entries, 0 to 12323
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype              
---  ------       --------------  -----              
 0   commit_hash  12324 non-null  object             
 1   timeline     12324 non-null  datetime64[ns, UTC]
 2   rank         12324 non-null  float64            
 3   density      12324 non-null  float64            
 4   effort       12324 non-null  float64            
 5   importance   12324 non-null  float64            
 6   coverage     12324 non-null  float64            
 7   quality      12324 non-null  float128           
 8   period_week  12324 non-null  UInt32             
 9   norm_value   12324 non-null  float32            
dtypes: UInt32(1), datetime64[ns, UTC](1), float128(1), float32(1), float64(5), object(1)
memory usage: 975.0+ KB


In [175]:
compute_and_visualize_alignments(
    input_df=df_output, 
    target_year=2020, 
    period_start=34, 
    period_end=35, 
    target_dev_id=0, 
    top_k=3)

Year: 2020
Starting Week: 34
Ending Week: 35
Resulting dataframe records: 310
No. of unique devs in this period: 197
- Producing developer fusion sequences...
- Computing alignments...
- Resulting alignments:
-- Alignments at index: [(4, 299)]	alignment score: 0.988
(4, 299) 50df0eebbd49b894df7e5e3945d66cd80c322284 06be67266a0c9a6a1ffb330a4ab50c2f21612e2b
-- Alignments at index: [(4, 9), (3, 8), (2, 7), (1, 6), (0, 5)]	alignment score: 0.970
(4, 9) 50df0eebbd49b894df7e5e3945d66cd80c322284 5bf01b571cf45db12a34f53b1ce96e044c030ae6
(3, 8) 204361a77f4018627addd4a06877448f088ddfc0 3ad1b1e16dbff695f430b7d7ac0b6e98c02065c2
(2, 7) dca5612f8eb9d0cf1dc254eb2adff1f16a588a7d 59c0d31988fb366189502a8ac66b7fe1486b7e40
(1, 6) 5e0b17b026eb7c6de9baa9b0d45a51b05f05abe1 59c0d31988fb366189502a8ac66b7fe1486b7e40
(0, 5) 59c0d31988fb366189502a8ac66b7fe1486b7e40 50df0eebbd49b894df7e5e3945d66cd80c322284
-- Alignments at index: [(4, 126)]	alignment score: 0.954
(4, 126) 50df0eebbd49b894df7e5e3945d66cd80c322284 5

In [176]:
compute_and_visualize_alignments(
    input_df=df_output, 
    target_year=2020, 
    period_start=34, 
    period_end=40, 
    target_dev_id=0, 
    top_k=1)

Year: 2020
Starting Week: 34
Ending Week: 40
Resulting dataframe records: 1156
No. of unique devs in this period: 536
- Producing developer fusion sequences...
- Computing alignments...
- Resulting alignments:
-- Alignments at index: [(12, 656), (11, 655), (10, 654), (9, 653), (8, 652), (7, 651), (6, 650), (5, 649), (4, 648), (3, 647), (2, 646), (1, 645), (0, 644)]	alignment score: 0.989
(12, 656) 36f30e486dce22345c2dd3a3ba439c12cd67f6ba 41af0d2ea1b03282b9f6da8b610ac30768893518
(11, 655) bc21a291fc11bbd60868c45b9f5a79ceed97fd4e 36f30e486dce22345c2dd3a3ba439c12cd67f6ba
(10, 654) b40341fad6cc2daa195f8090fd3348f18fff640a e27fec66f0a94e35a35548bd0b29ae616e62ec62
(9, 653) ed46cd1d4cc4b2cf05f31fe25fc68d1a9d3589ba e27fec66f0a94e35a35548bd0b29ae616e62ec62
(8, 652) c49a94405b39d3e3293da98f621fe4243f3cc4fa e27fec66f0a94e35a35548bd0b29ae616e62ec62
(7, 651) 22881adf85934f220764636f0ec79a6124e93f64 36cfec73595ccbaf245b8d6ab31dadbff3962346
(6, 650) bc9b9c5ab9d8d16157737db539929d57562926e9 d5be89a8d1

In [177]:
compute_and_visualize_alignments(
    input_df=df_output, 
    target_year=2020, 
    period_start=34, 
    period_end=40, 
    target_dev_id=3, 
    top_k=1)

Year: 2020
Starting Week: 34
Ending Week: 40
Resulting dataframe records: 1156
No. of unique devs in this period: 536
- Producing developer fusion sequences...
- Computing alignments...
- Resulting alignments:
-- Alignments at index: [(2, 535), (1, 534), (0, 533)]	alignment score: 0.998
(2, 535) 36f30e486dce22345c2dd3a3ba439c12cd67f6ba 36f30e486dce22345c2dd3a3ba439c12cd67f6ba
(1, 534) ed46cd1d4cc4b2cf05f31fe25fc68d1a9d3589ba ed46cd1d4cc4b2cf05f31fe25fc68d1a9d3589ba
(0, 533) 59c0d31988fb366189502a8ac66b7fe1486b7e40 bc9b9c5ab9d8d16157737db539929d57562926e9
- Visualizing results


## &#128218; References

1. numpy.kron, see [HERE](https://numpy.org/doc/stable/reference/generated/numpy.kron.html)
2. Kronecker product, see [HERE](https://en.wikipedia.org/wiki/Kronecker_product)
3. Louis-Philippe Morency and Tadas Baltrusaitis. Tutorial on Multimodal Machine Learning, see [HERE](https://www.cs.cmu.edu/~morency/MMML-Tutorial-ACL2017.pdf)
4. Baltrušaitis, T., Ahuja, C. and Morency, L.P., 2018. **Multimodal machine learning: A survey and taxonomy**. *IEEE transactions on pattern analysis and machine intelligence*, 41(2), pp.423-443., see [HERE](https://arxiv.org/abs/1705.09406)