# Load Modules

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from gensim.models import Word2Vec

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Model

from tensorflow.keras.backend import expand_dims
from tensorflow.keras.layers import Conv2D, BatchNormalization, Reshape, MaxPooling2D, Flatten, Concatenate, Dropout
from tensorflow.keras.layers import Dense

import keras
import tensorflow_addons as tfa

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# Variables & Dictionaries

In [None]:
# Width of one word vector; used for the second dimension of the embedding
# matrix and as the output size of the Keras Embedding layer further down.
# NOTE(review): presumably has to equal the vector size of the pre-trained
# Word2Vec model that is loaded later — confirm when changing it.
EMBEDDING_DIM = 20
# NOTE(review): not referenced anywhere else in the visible notebook — confirm
# whether this vocabulary cap is still needed.
MAX_NUM_WORDS = 250000

In [None]:
def calc_max_seq_len(data_df):
    """Return the combined sequence length budget for an issue's text.

    The length of a text is approximated by its number of space-separated
    tokens (spaces + 1); missing texts count as length 0. For both the
    'title' and the 'description' column the 95th percentile of these lengths
    is taken (rounded up to an observed value via interpolation='higher'),
    printed, and the two percentiles are added.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame with string columns 'title' and 'description'.

    Returns
    -------
    Sum of the title and description 95th-percentile lengths.
    """
    def _p95_token_count(column):
        # Built-in int replaces the np.int alias, which NumPy has removed.
        token_counts = (data_df[column].str.count(' ') + 1).fillna(0).astype(int)
        return token_counts.quantile(0.95, interpolation='higher')

    max_title_len = _p95_token_count('title')
    print(max_title_len)

    max_desc_len = _p95_token_count('description')
    print(max_desc_len)

    return max_title_len + max_desc_len

In [None]:
# Name of the issue-tracker dataset to process; it is inserted into every
# input and output file path of this notebook.
SOURCE = "apache"
# Available datasets with the sizes noted by the original author
# (NOTE(review): the unit of the sizes and the meaning of the '##' markers are
# not stated — confirm):
## 'apache', 243k
# 'mojang', 200k
## 'redhat', 106k
# 'jira', 98k

# 'mongodb', 38k
# 'qt', 36k

# 'sakai', 19k
# 'hyperledger', 16k
# 'mariadb', 15k
# 'spring', 14k
# 'jiraecosystem', 11k

# 'sonatype', 4k, small
# 'jfrog', 3k, small
# 'inteldaos', 3k, too small, DOES NOT WORK without properties

# 'secondlife', 630, too small
## 'mindville', 44, too small, DOES NOT WORK

# When True, the 'links_plus_<source>.csv' file is loaded instead of
# 'links_<source>.csv' and "_plus" is appended to SOURCE before results are
# written. NOTE(review): NL presumably stands for "Non-Link" — confirm.
NL = True

In [None]:
# Maps the raw link-type names found in the issue trackers to one canonical
# link-type label. Raw names that are not listed here become NaN when the
# mapping is applied with Series.map.
type_dict = {
    'Backports': 'Backport',

    'Blocked': 'Block',
    'Blocker': 'Block',
    'Blocks': 'Block',

    'Bonfire Testing': 'Bonfire Testing',
    'Bonfire testing': 'Bonfire Testing',
    'Git Code Review': 'Bonfire Testing',
    'Testing': 'Bonfire Testing',

    'Causality': 'Cause',
    'Cause': 'Cause',
    'Caused': 'Cause',
    'Problem/Incident': 'Cause',

    'Child-Issue': 'Parent-Child',
    'Parent Feature': 'Parent-Child',
    'Parent/Child': 'Parent-Child',
    'multi-level hierarchy [GANTT]': 'Parent-Child',
    'Parent-Relation': 'Parent-Child',

    'Cloners': 'Clone',
    'Cloners (old)': 'Clone',

    'Collection': 'Incorporate',
    'Container': 'Incorporate',
    'Contains(WBSGantt)': 'Incorporate',
    'Incorporate': 'Incorporate',
    'Incorporates': 'Incorporate',
    'Part': 'Incorporate',
    'PartOf': 'Incorporate',
    'Superset': 'Incorporate',

    'Completes': 'Fix',
    'Fixes': 'Fix',
    'Resolve': 'Fix',

    'Depend': 'Depend',
    'Dependency': 'Depend',
    'Dependent': 'Depend',
    'Depends': 'Depend',
    'Gantt Dependency': 'Depend',
    'dependent': 'Depend',

    'Derived': 'Derive',

    'Detail': 'Detail',

    'Documentation': 'Documented',
    'Documented': 'Documented',

    'Duplicate': 'Duplicate',

    'Epic': 'Epic',
    'Epic-Relation': 'Epic',

    'Finish-to-Finish link (WBSGantt)': 'finish-finish',
    'Gantt End to End': 'finish-finish',
    'Gantt: finish-finish': 'finish-finish',
    'finish-finish [GANTT]': 'finish-finish',

    'Gantt End to Start': 'finish-start',
    'Gantt: finish-start': 'finish-start',
    'finish-start [GANTT]': 'finish-start',

    'Gantt Start to Start': 'start-start',

    'Gantt: start-finish': 'start-finish',

    'Follows': 'Follow',
    'Sequence': 'Follow',

    # Both spellings share one canonical label; mapping 'Implements' to itself
    # would split the same relation into two separate classes.
    'Implement': 'Implement',
    'Implements': 'Implement',

    'Issue split': 'Split',
    'Split': 'Split',
    'Work Breakdown': 'Split',

    'Preceded By': 'Precede',

    'Reference': 'Relate',
    'Relate': 'Relate',
    'Related': 'Relate',
    'Relates': 'Relate',
    'Relationship': 'Relate',

    'Regression': 'Breaks',

    'Replacement': 'Replace',

    'Required': 'Require',

    'Supercedes': 'Supercede',
    'Supersede': 'Supercede',
    'Supersession': 'Supercede',

    'Subtask': 'Subtask',

    'Test': 'Test',
    'Tested': 'Test',

    'Trigger': 'Trigger',

    'Non-Link': 'Non-Link',

    # Raw names that carry a numeric prefix.
    '1 - Relate': 'Relate',
    '5 - Depend': 'Depend',
    '3 - Duplicate': 'Duplicate',
    '4 - Incorporate': 'Incorporate',
    '2 - Cloned': 'Clone',
    '6 - Blocks': 'Block',
    '7 - Git Code Review': 'Bonfire Testing',

    'Verify': 'Verify',
}

# Groups canonical link types into coarser categories
# (canonical link type -> category), in the order listed.
cat_dict = dict([
    ('Block', 'Causal'),
    ('Bonfire Testing', 'Workflow'),
    ('Breaks', 'Causal'),
    ('Cause', 'Causal'),
    ('Clone', 'General'),
    ('Depend', 'Causal'),
    ('Detail', 'Workflow'),
    ('Documented', 'Workflow'),
    ('Duplicate', 'General'),
    ('Epic', 'Epic'),
    ('Fix', 'Workflow'),
    ('Follow', 'Causal'),
    ('Incorporate', 'Split'),
    ('Parent-Child', 'Split'),
    ('Relate', 'General'),
    ('Replace', 'General'),
    ('Require', 'Causal'),
    ('Split', 'Split'),
    ('Subtask', 'Split'),
    ('Supercede', 'Causal'),
    ('Trigger', 'Workflow'),
    ('finish-start', 'Causal'),
    ('Non-Link', 'Non-Link'),
    ('Verify', 'Workflow'),
])

## Load Data

In [None]:
# Read the plain link table of the selected source and report its size
# (rows = links, columns = features).
filename = '../../data/processed/links_' + SOURCE.lower() + '.csv'
links = pd.read_csv(filename, sep=";", index_col=0, encoding="UTF-8", low_memory=False)
print("Number of Links: " + str(links.shape[0]))
print("Feature Size: "+str(links.shape[1]))

In [None]:
# Distribution of the raw link types in the plain link table.
links['linktype'].value_counts()

In [None]:
# Load the issue table (indexed by issue_id) and report its size.
filename = '../../data/processed/issues_' + SOURCE.lower() + '.csv'
issues = pd.read_csv(filename, sep=";", index_col=['issue_id'], encoding="UTF-8", low_memory=False)
print("Number of Issues: " + str(issues.shape[0]))
print("Feature Size: " + str(issues.shape[1]))

# Load the link table: the "links_plus" variant when NL is set, otherwise the
# plain link file. Everything except the file prefix is identical.
filename = ('../../data/processed/links_plus_' if NL else '../../data/processed/links_') + SOURCE.lower() + '.csv'
links = pd.read_csv(filename, sep=";", index_col=0, encoding="UTF-8", low_memory=False)
print("Number of Links: " + str(links.shape[0]))
print("Feature Size: "+str(links.shape[1]))

In [None]:
# Input length of the text models: sum of the 95th-percentile title and
# description word counts over all issues (calc_max_seq_len also prints both).
MAX_SEQUENCE_LENGTH = calc_max_seq_len(issues)
print(MAX_SEQUENCE_LENGTH)

## Issue Preprocessing

In [None]:
# Load the pre-computed per-issue text arrays and attach one array per issue.
# NOTE(review): presumably integer token-id sequences of length
# MAX_SEQUENCE_LENGTH, stored in the same row order as `issues` — the
# assignment is purely positional, so confirm against the notebook that
# writes this file.
# NOTE(review): this path uses SOURCE as-is, while the CSV and Word2Vec paths
# use SOURCE.lower() — confirm this is intended for non-lowercase sources.
text_data = np.load('w2v/text_data_'+SOURCE+'.npy')
issues['text_emb']=list(text_data)

## Links Preprocessing

In [None]:
# Raw link-type distribution of the link table that is used from here on.
links['linktype'].value_counts()

In [None]:
# Translate raw link types to canonical labels via type_dict; raw types that
# are not in type_dict become NaN and are not counted below.
links['mappedtype'] = links['linktype'].map(type_dict)
links['mappedtype'].value_counts()

In [None]:
# Which canonical types cover at least 1% of all links?
links['mappedtype'].value_counts() >= 0.01 * len(links)

In [None]:
# Keep only canonical link types that account for at least 1% of all links.
linktypes = (links.mappedtype.value_counts()>=len(links)*0.01).rename_axis('mappedtype').reset_index(name='valid')
valid_types = set(linktypes[linktypes['valid']==True]['mappedtype'])

# .copy() makes all_data an independent frame; without it the column
# assignment below writes into a slice of `links` and pandas emits
# SettingWithCopyWarning.
all_data = links[(links["mappedtype"].isin(valid_types))].copy()

# Integer class id per canonical type, numbered in order of first appearance.
all_data['label'] = all_data['mappedtype'].factorize()[0].astype(int)

In [None]:
# One row per distinct (mappedtype, label) pair, ordered by label id.
category_id_df = all_data[['mappedtype', 'label']].drop_duplicates().sort_values('label')
# Lookup tables in both directions: canonical type -> label id ...
category_to_id = dict(category_id_df.values)
# ... and label id -> canonical type.
id_to_category = dict(category_id_df[['label', 'mappedtype']].values)

# Report the number of classes and the id -> name mapping.
print("Categories: "+str(len(id_to_category)))
print("Categories: "+str(id_to_category))

## Features / Embedding Matrix

In [None]:
# Pre-trained Word2Vec model of the selected source.
model_CBOW = Word2Vec.load('w2v/'+SOURCE.lower()+'W2V.model')

# Row i of the matrix is the vector of vocabulary entry i
# (model_CBOW.wv.index_to_key[i]). KeyedVectors stores its vectors in exactly
# that order, so the whole matrix is copied in one step instead of one lookup
# per word. The assignment raises ValueError if the model's shape does not
# match (vocabulary size, EMBEDDING_DIM).
embedding_matrix = np.zeros((len(model_CBOW.wv), EMBEDDING_DIM))
embedding_matrix[:] = model_CBOW.wv.vectors

In [None]:
# Attach the text array of both linked issues to every link. The first merge
# adds 'text_emb' for issue_id_1; the second adds it for issue_id_2 and the
# suffixes turn the two columns into 'text_emb_1' / 'text_emb_2'.
# Both merges use the default inner join.
issue_feat_data = issues[['text_emb']]

all_data = (
    all_data
    .merge(issue_feat_data, left_on='issue_id_1', right_on='issue_id')
    .merge(issue_feat_data, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))
)

In [None]:
# Sanity check: per column, does the merged frame contain any missing value?
all_data.isna().any()

# Deep Learning Models

In [None]:
# Tag SOURCE for NL runs so that their output files get a "_plus" suffix.
# The endswith guard keeps the cell idempotent: re-running it must not append
# the suffix a second time. (The former bare expression `SOURCE + "_LT"` had
# no effect and was dropped; the "LT" tag is part of the output file names.)
if NL and not SOURCE.endswith("_plus"):
    SOURCE = SOURCE + "_plus"

In [None]:
print("Creating functions.")
def plot_history(history):
    """Plot the training curves recorded by Keras, one figure per metric.

    For every training metric in ``history.history`` (every key that does not
    start with 'val_') the metric is plotted over the epochs. If a matching
    'val_<metric>' series was recorded it is drawn into the same figure, and
    the legend lists exactly the curves that were drawn.

    Parameters
    ----------
    history : object with a ``history`` dict mapping metric name -> list of
        per-epoch values (as returned by ``Model.fit``).
    """
    recorded = history.history
    train_metrics = [key for key in recorded if not key.startswith('val_')]
    for metric in train_metrics:
        print(metric)
        plt.plot(recorded[metric])
        curve_labels = ['train']

        # Validation values are only present when fit() was given validation data.
        val_metric = 'val_' + metric
        if val_metric in recorded:
            plt.plot(recorded[val_metric])
            curve_labels.append('validation')

        plt.title('model '+ metric)
        plt.ylabel(metric)
        plt.xlabel('epoch')
        plt.legend(curve_labels, loc='upper left')
        plt.show()

### Embedding Model

In [None]:
# Switch TensorFlow to graph mode before any Keras layer is created.
print("Disabling eager execution.")
tf.compat.v1.disable_eager_execution()

print("Models incoming.")
# Frozen embedding layer: one row per Word2Vec vocabulary entry, initialised
# from embedding_matrix and excluded from training (trainable=False).
embedding_layer = Embedding(len(model_CBOW.wv),
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# Small wrapper model: sequence of MAX_SEQUENCE_LENGTH inputs -> embedded sequence.
# NOTE(review): the inputs are presumably Word2Vec vocabulary indices — confirm.
text_in = Input(shape = (MAX_SEQUENCE_LENGTH,), name = 'Text_Input')
text_out = embedding_layer(text_in)

# Reused as the first stage of SCCNN_Model.
text_embedding = Model(inputs = [text_in], outputs = [text_out], name = 'Text_Output')
text_embedding.summary()

### SC-CNN Model

In [None]:
def _sccnn_branch(embedded, kernel_height, embedding_dim, branch_name):
    """Build one SC-CNN branch on top of the embedded text.

    A first convolution spanning ``kernel_height`` tokens and the full
    embedding width is batch-normalised and reshaped so its 100 filters take
    the place of the embedding axis. Three second-level convolutions (heights
    2, 3 and 4, 200 filters each) then run over that result; each is max-pooled
    over its whole sequence axis and flattened.

    Parameters
    ----------
    embedded : tensor of shape (batch, MAX_SEQUENCE_LENGTH, embedding_dim, 1).
    kernel_height : number of tokens covered by the first convolution.
    embedding_dim : width of one word vector.
    branch_name : layer name of the first convolution; the second-level
        convolutions are named ``branch_name + '1'`` .. ``branch_name + '3'``.

    Returns
    -------
    Tensor of shape (batch, 600): the three pooled outputs concatenated in
    order of kernel height.
    """
    first_filters = 100
    second_filters = 200

    conv = Conv2D(filters=first_filters,
                  kernel_size=(kernel_height, embedding_dim),
                  strides=(1, 1),
                  activation='relu',
                  name=branch_name
                  )(embedded)
    conv = BatchNormalization(axis=-1)(conv)

    # A 'valid' convolution of height k shortens the sequence by k-1 steps.
    # (steps, 1, filters) -> (steps, filters, 1) so the filters become the
    # "width" axis for the second-level convolutions.
    conv_rs = Reshape((MAX_SEQUENCE_LENGTH - (kernel_height - 1), first_filters, 1))(conv)

    pooled_outputs = []
    for position, height in enumerate((2, 3, 4), start=1):
        sub_conv = Conv2D(filters=second_filters,
                          kernel_size=(height, first_filters),
                          strides=(1, 1),
                          activation='relu',
                          name=branch_name + str(position)
                          )(conv_rs)
        # Pool window covers the entire remaining sequence axis (global max).
        sub_pooled = MaxPooling2D(pool_size=(sub_conv.shape[1], 1), padding='valid')(sub_conv)
        pooled_outputs.append(Flatten()(sub_pooled))

    return Concatenate(axis=-1)(pooled_outputs)


def SCCNN_Model():
    """Build the SC-CNN text encoder for a single issue.

    The input sequence is embedded with the shared ``text_embedding`` model and
    fed to three branches (A, B, C) whose first convolutions cover 2, 3 and 4
    tokens. The branch outputs are concatenated in the order A, B, C.

    The first-level kernel width is read from the embedding output instead of
    being hard-coded, so it follows the embedding dimension.

    Returns
    -------
    keras Model named 'SC-CNN_Model' mapping an int32 sequence of length
    MAX_SEQUENCE_LENGTH to a vector of 1800 features.
    """
    text_in = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='Text_Input')

    text_out = text_embedding([text_in])
    embedding_dim = int(text_out.shape[-1])

    # Add a channel axis so the 2D convolutions can be applied.
    text_out = expand_dims(text_out, axis=-1)

    branch_specs = ((2, "BranchA"), (3, "BranchB"), (4, "BranchC"))
    branches = [
        _sccnn_branch(text_out, kernel_height, embedding_dim, branch_name)
        for kernel_height, branch_name in branch_specs
    ]

    conv_concat = Concatenate(axis=-1)(branches)

    issue_model = Model(inputs = [text_in], outputs = [conv_concat], name = 'SC-CNN_Model')

    return issue_model

### Complete Model

In [None]:
def get_model():
    """Build the two-input network that embeds a pair of issues.

    Both issue texts go through the same SC-CNN encoder (shared weights). The
    two encodings are concatenated and passed through four blocks of
    Dropout(0.5) -> Dense(relu) -> BatchNormalization with 512, 256, 128 and
    64 units.

    Returns
    -------
    keras Model with inputs [Text_issue_a, Text_issue_b] and a 64-dimensional
    output.
    """
    issue_inputs = [
        Input(shape = (MAX_SEQUENCE_LENGTH,), dtype='int32', name=input_name)
        for input_name in ("Text_issue_a", "Text_issue_b")
    ]

    # One encoder instance applied to both inputs.
    encoder = SCCNN_Model()
    encodings = [encoder([issue_input]) for issue_input in issue_inputs]

    hidden = Concatenate()(encodings)

    for width in (512, 256, 128, 64):
        hidden = Dropout(0.5)(hidden)
        hidden = Dense(units = width, activation = 'relu')(hidden)
        hidden = BatchNormalization(axis=-1)(hidden)

    return Model(inputs=issue_inputs, outputs=hidden)

### Train Model

In [None]:
# Hold out 20% of the link pairs for testing; the fixed random_state keeps the
# split reproducible. The split is not stratified by label.
train, test = train_test_split(all_data, test_size = 0.2, random_state = 9)
print(len(train))
print(len(test))

In [None]:
def train_model(dataset):
    """Train the pair-embedding network with a triplet loss.

    Builds a fresh model via ``get_model``, compiles it with the Adam optimizer
    and ``tfa.losses.TripletSemiHardLoss``, and fits it on the two text inputs
    of ``dataset`` with the link label as target. 10% of the data is used for
    validation and training stops early once 'val_loss' has not improved for
    5 epochs. The loss curves are plotted afterwards.

    No class weights are computed: the previous calculation was never passed
    to ``fit`` and was therefore removed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'text_emb_1', 'text_emb_2' and 'label'.

    Returns
    -------
    The trained keras Model.
    """
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=2)

    model = get_model()

    model.compile(optimizer='adam',
                  loss=tfa.losses.TripletSemiHardLoss())

    # Stack the per-row arrays into one 2D array per input.
    train_issue_1 = np.array(dataset['text_emb_1'].values.tolist())
    train_issue_2 = np.array(dataset['text_emb_2'].values.tolist())

    history = model.fit([train_issue_1, train_issue_2],
                        y=dataset['label'],
                        callbacks=[callback],
                        validation_split=0.1,
                        batch_size=128, epochs=64, verbose=2)

    plot_history(history)

    return model

In [None]:
%%time
# Train the embedding network on the training split; the %%time cell magic
# reports how long the cell took.
class_model = train_model(train)

## Prediction

In [None]:
# Stack the per-row text arrays of the training pairs into 2D arrays and
# compute the learned embedding of every training pair.
issue_1_train = np.array(train['text_emb_1'].values.tolist())
issue_2_train = np.array(train['text_emb_2'].values.tolist())

results_train = class_model.predict([issue_1_train, issue_2_train])

In [None]:
# Same as for the training pairs: stack the text arrays of the test pairs and
# compute their learned embeddings.
issue_1_test = np.array(test['text_emb_1'].values.tolist())
issue_2_test = np.array(test['text_emb_2'].values.tolist())

results_test = class_model.predict([issue_1_test, issue_2_test])

In [None]:
# Persist the learned pair embeddings of both splits.
np.save('embeddings/train_embed_LT_'+SOURCE+'.npy', results_train)
np.save('embeddings/test_embed_LT_'+SOURCE+'.npy', results_test)

In [None]:
%%time
# Train a random forest on the learned pair embeddings to predict the link
# label; the %%time cell magic reports how long the cell took.
from sklearn.ensemble import RandomForestClassifier

# Trees are limited to depth 25; random_state fixes the forest's randomness.
clf = RandomForestClassifier(max_depth=25, random_state=0)

clf.fit(results_train, train['label'])

In [None]:
# Class probabilities and hard predictions of the random forest for the test pairs.
probs = clf.predict_proba(results_test)
preds = clf.predict(results_test)

# assign() returns a new frame instead of writing into `test` in place; `test`
# is a slice produced by train_test_split, so in-place column assignment
# triggers SettingWithCopyWarning. The column keeps its historical name
# 'preds_SVM' although the classifier is a random forest, because later cells
# and the exported CSV use that name.
test = test.assign(preds_SVM=preds)

In [None]:
# Export the identifying columns of both splits (plus the predictions for the
# test split) as CSV.
# NOTE(review): these file names have no '_' between 'LT' and SOURCE, unlike
# the '.npy' and results files ('..._LT_' + SOURCE) — confirm whether
# downstream code relies on the current names before changing them.
train[['name', 'linktype', 'issue_id_1', 'issue_id_2', 'mappedtype', 'label']].to_csv('embeddings/train_df_LT'+SOURCE+'.csv')
test[['name', 'linktype', 'issue_id_1', 'issue_id_2', 'mappedtype', 'label', 'preds_SVM']].to_csv('embeddings/test_df_LT'+SOURCE+'.csv')

In [None]:
# Label ids in ascending order; category_id_df is sorted by label, so
# target_names below lines up with these ids one-to-one.
label_ids = category_id_df['label'].to_list()

# Passing `labels` explicitly keeps the report aligned with target_names even
# if a class is missing from the test split (without it scikit-learn raises a
# ValueError about mismatching target_names in that case).
class_rep = classification_report(test['label'], test['preds_SVM'], labels=label_ids,
                                  output_dict=True, target_names=category_id_df.mappedtype.to_list())
class_rep_df = pd.DataFrame(class_rep).transpose()
print(class_rep_df)

# confusion_matrix is called with (predicted, true) and transposed afterwards,
# so rows of conf_mat_df are true classes and columns are predicted classes.
# Explicit `labels` guarantee that row/column position i stands for label id i,
# which the renaming via id_to_category relies on.
conf_mat = confusion_matrix(test['preds_SVM'], test['label'], labels=label_ids)
conf_mat_df = pd.DataFrame(conf_mat).transpose()
conf_mat_df.rename(index=id_to_category, inplace=True)
conf_mat_df.rename(columns=id_to_category, inplace=True)
print(conf_mat_df)

class_rep_df.to_csv('results/class_rep_LT_'+SOURCE+"_SCCNN.csv")
conf_mat_df.to_csv('results/conf_mat_LT_'+SOURCE+"_SCCNN.csv")