# Load Modules

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from gensim.models import Word2Vec

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Model

from tensorflow.keras.backend import expand_dims
from tensorflow.keras.layers import Conv2D, BatchNormalization, Reshape, MaxPooling2D, Flatten, Concatenate, Dropout
from tensorflow.keras.layers import Dense

import keras
import tensorflow_addons as tfa

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

 The versions of TensorFlow you are currently using is 2.7.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


# Variables & Dictionaries

In [2]:
EMBEDDING_DIM = 20
MAX_NUM_WORDS = 250000

In [3]:
def calc_max_seq_len(data_df):
    titles = data_df['title']
    title_lengths = (titles.str.count(' ')+1).fillna(0).astype(np.int)
    max_title_len = title_lengths.quantile(0.95, interpolation='higher')
    print(max_title_len)

    descriptions = data_df['description']
    desc_lengths = (descriptions.str.count(' ')+1).fillna(0).astype(np.int)
    max_desc_len = desc_lengths.quantile(0.95, interpolation='higher')
    print(max_desc_len)
    
    return max_title_len + max_desc_len

In [4]:
SOURCE = "redhat"
# 'apache', 243k, 
# 'redhat', 106k
## 'jira', 98k
## 'mojang', 200k

## 'mongodb', 38k
## 'qt', 36k

## 'sakai', 19k
## 'hyperledger', 16k 
## 'mariadb', 15k
## 'spring', 14k
## 'jiraecosystem', 11k

## 'sonatype', 4k, small
## 'jfrog', 3k, small
## 'inteldaos', 3k too small DOES NOT WORK without properties

## 'secondlife', 630 too small
## 'mindville', 44 too small DOES NOT WORK

NL = True

In [5]:
type_dict={'Backports': 'Backport', 
           
    'Blocked': 'Block',
    'Blocker': 'Block',
    'Blocks': 'Block',
           
    'Bonfire Testing': 'Bonfire Testing', 
    'Bonfire testing': 'Bonfire Testing', 
    'Git Code Review': 'Bonfire Testing', 
    'Testing': 'Bonfire Testing',
           
    'Causality': 'Cause', 
    'Cause': 'Cause',
    'Caused': 'Cause', 
    'Problem/Incident': 'Cause',
           
    'Child-Issue': 'Parent-Child', 
    'Parent Feature': 'Parent-Child',
    'Parent/Child': 'Parent-Child',
    'multi-level hierarchy [GANTT]': 'Parent-Child',
    'Parent-Relation': 'Parent-Child',
           
    'Cloners': 'Clone', 
    'Cloners (old)': 'Clone', 
           
    'Collection': 'Incorporate', 
    'Container': 'Incorporate',
    'Contains(WBSGantt)': 'Incorporate', 
    'Incorporate': 'Incorporate', 
    'Incorporates': 'Incorporate', 
    'Part': 'Incorporate',
    'PartOf': 'Incorporate',
    'Superset': 'Incorporate', 
           
    'Completes': 'Fix', 
    'Fixes': 'Fix',
    'Resolve': 'Fix',
           
    'Depend': 'Depend', 
    'Dependency': 'Depend', 
    'Dependent': 'Depend', 
    'Depends': 'Depend', 
    'Gantt Dependency': 'Depend',
    'dependent': 'Depend',
           
    'Derived': 'Derive',
           
    'Detail': 'Detail', 
           
    'Documentation': 'Documented',
    'Documented': 'Documented',
    
    'Duplicate': 'Duplicate',
           
    'Epic': 'Epic', 
    'Epic-Relation': 'Epic',
    
    'Finish-to-Finish link (WBSGantt)': 'finish-finish', 
    'Gantt End to End': 'finish-finish', 
    'Gantt: finish-finish': 'finish-finish',
    'finish-finish [GANTT]': 'finish-finish', 
    
    'Gantt End to Start': 'finish-start', 
    'Gantt: finish-start': 'finish-start',
    'finish-start [GANTT]': 'finish-start',

    'Gantt Start to Start': 'start-start', 
    
    'Gantt: start-finish': 'start-finish', 
    
    'Follows': 'Follow', 
    'Sequence': 'Follow', 
    
    'Implement': 'Implement', 
    'Implements': 'Implements', 
    
    'Issue split': 'Split',
    'Split': 'Split',
    'Work Breakdown': 'Split',
    
    'Preceded By': 'Precede', 
    
    'Reference': 'Relate',
    'Relate': 'Relate',
    'Related': 'Relate', 
    'Relates': 'Relate',
    'Relationship': 'Relate',
    
    'Regression': 'Breaks', 
    
    'Replacement': 'Replace',
    
    'Required': 'Require', 
    
    'Supercedes': 'Supercede',
    'Supersede': 'Supercede',
    'Supersession': 'Supercede', 
    
    'Subtask': 'Subtask',
    
    'Test': 'Test', 
    'Tested': 'Test',
    
    'Trigger': 'Trigger', 
    
    'Non-Link': 'Non-Link',
          
    '1 - Relate': 'Relate',
'Subtask': 'Subtask',
'5 - Depend':   'Depend',          
'3 - Duplicate': 'Duplicate',          
'4 - Incorporate': 'Incorporate',        
'2 - Cloned': 'Clone',               
'6 - Blocks': 'Block',                
'7 - Git Code Review': 'Bonfire Testing',
          'Verify': 'Verify'}

cat_dict = {'Block': 'Causal',
    'Bonfire Testing': 'Workflow',
    'Breaks': 'Causal',
    'Cause': 'Causal',
    'Clone': 'General',
    'Depend': 'Causal',
    'Detail': 'Workflow',
    'Documented': 'Workflow',
    'Duplicate': 'General',
    'Epic': 'Epic',
    'Fix': 'Workflow',
    'Follow': 'Causal',
    'Incorporate': 'Split',
    'Parent-Child': 'Split',
    'Relate': 'General',
    'Replace': 'General',
    'Require': 'Causal',
    'Split': 'Split',
    'Subtask': 'Split',
    'Supercede': 'Causal',
    'Trigger': 'Workflow',
    'finish-start': 'Causal',
    'Non-Link': 'Non-Link',
    'Verify': 'Workflow'
           }

## Load Data

In [6]:
#Loading Issues & Links
filename = '../../data/processed/issues_'+SOURCE.lower()+'.csv'
issues = pd.read_csv(filename, encoding="UTF-8", low_memory=False, index_col=['issue_id'], sep=";")
print("Number of Issues: " + str(len(issues)))
print("Feature Size: " + str(len(list(issues.columns.values))))

if NL:
    filename = '../../data/processed/links_plus_'+SOURCE.lower()+'.csv'
    links = pd.read_csv(filename, encoding="UTF-8", low_memory=False, index_col=0, sep=";")
    print("Number of Links: " + str(len(links)))
    print("Feature Size: "+str(len(list(links.columns.values))))
else:
    filename = '../../data/processed/links_'+SOURCE.lower()+'.csv'
    links = pd.read_csv(filename, encoding="UTF-8", low_memory=False, index_col=0, sep=";")
    print("Number of Links: " + str(len(links)))
    print("Feature Size: "+str(len(list(links.columns.values))))

Number of Issues: 353000
Feature Size: 10
Number of Links: 125367
Feature Size: 5


In [7]:
MAX_SEQUENCE_LENGTH = calc_max_seq_len(issues)
print(MAX_SEQUENCE_LENGTH)

15
282
297


## Issue preprocess

In [8]:
text_data = np.load('w2v/text_data_'+SOURCE+'.npy')
issues['text_emb']=list(text_data)

## Links preprocess

In [9]:
links.linktype.value_counts()

Related                          31006
Subtask                          24928
Blocks                           18186
Cloners                          16969
Superset                         10661
Duplicate                         5913
Non-Link                          5698
Sequence                          5129
Causality                         3168
Cloners (old)                     1504
Documentation                      670
Parent-Relation                    621
multi-level hierarchy [GANTT]      251
finish-start [GANTT]               184
Account                            166
Issue split                        162
Gantt: finish-start                 46
Gantt: start-finish                 40
finish-finish [GANTT]               40
Gantt: finish-finish                20
start-finish [GANTT]                 4
Gantt: start-start                   1
Name: linktype, dtype: int64

In [10]:
links['mappedtype'] = links['linktype'].map(type_dict)
links.mappedtype.value_counts()

Relate           31006
Subtask          24928
Clone            18473
Block            18186
Incorporate      10661
Duplicate         5913
Non-Link          5698
Follow            5129
Cause             3168
Parent-Child       872
Documented         670
finish-start       230
Split              162
finish-finish       60
start-finish        40
Name: mappedtype, dtype: int64

In [11]:
links.mappedtype.value_counts()>=len(links)*0.01

Relate            True
Subtask           True
Clone             True
Block             True
Incorporate       True
Duplicate         True
Non-Link          True
Follow            True
Cause             True
Parent-Child     False
Documented       False
finish-start     False
Split            False
finish-finish    False
start-finish     False
Name: mappedtype, dtype: bool

In [12]:
linktypes = (links.mappedtype.value_counts()>=len(links)*0.01).rename_axis('mappedtype').reset_index(name='valid')
valid_types = set(linktypes[linktypes['valid']==True]['mappedtype'])

all_data = links[(links["mappedtype"].isin(valid_types))]

all_data['label'] = all_data['mappedtype'].factorize()[0].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_data['label'] = all_data['mappedtype'].factorize()[0].astype(int)


In [13]:
category_id_df = all_data[['mappedtype', 'label']].drop_duplicates().sort_values('label')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['label', 'mappedtype']].values)

print("Categories: "+str(len(id_to_category)))
print("Categories: "+str(id_to_category))

Categories: 9
Categories: {0: 'Block', 1: 'Incorporate', 2: 'Cause', 3: 'Clone', 4: 'Relate', 5: 'Subtask', 6: 'Follow', 7: 'Duplicate', 8: 'Non-Link'}


## Features / Embedding Matrix

In [14]:
model_CBOW = Word2Vec.load('w2v/'+SOURCE+'W2V.model')

embedding_matrix = np.zeros((len(model_CBOW.wv), EMBEDDING_DIM))
for i in range(len(model_CBOW.wv)):
    embedding_vector = model_CBOW.wv[model_CBOW.wv.index_to_key[i]]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector  

In [15]:
issue_feat_data = issues[['text_emb']]

all_data = all_data.merge(issue_feat_data, left_on='issue_id_1', right_on='issue_id')
all_data = all_data.merge(issue_feat_data, left_on='issue_id_2', right_on='issue_id', suffixes=('_1', '_2'))

In [16]:
## Sanity Check
all_data.isnull().any()

name          False
linktype      False
issue_id_1    False
issue_id_2    False
issues        False
mappedtype    False
label         False
text_emb_1    False
text_emb_2    False
dtype: bool

# Deep Learning Models

In [17]:
SOURCE + "_LT"
if NL:
    SOURCE = SOURCE + "_plus"

In [18]:
print("Creating functions.")
def plot_history(history):
#     keys = history.history.keys()
    for i in list(history.history)[0:2]:
        print(i)
        # list all data in history
        # summarize history for accuracy
        plt.plot(history.history[i])
#         plt.plot(history.history['val_'+i])
        plt.title('model '+ i)
        plt.ylabel(i)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

Creating functions.


### Embedding Model

In [19]:
print("Disabling eager execution.")
tf.compat.v1.disable_eager_execution()

print("Models incoming.")
embedding_layer = Embedding(len(model_CBOW.wv),
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# from keras.layers import concatenate
text_in = Input(shape = (MAX_SEQUENCE_LENGTH,), name = 'Text_Input')
text_out = embedding_layer(text_in)

text_embedding = Model(inputs = [text_in], outputs = [text_out], name = 'Text_Output')
text_embedding.summary()

Disabling eager execution.
Models incoming.
Model: "Text_Output"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Text_Input (InputLayer)     [(None, 297)]             0         
                                                                 
 embedding (Embedding)       (None, 297, 20)           12859000  
                                                                 
Total params: 12,859,000
Trainable params: 0
Non-trainable params: 12,859,000
_________________________________________________________________


### SC-CNN Model

In [20]:
def DCCNN_Model():
    text_in = Input(shape=(MAX_SEQUENCE_LENGTH, 20, 2), dtype='float32', name='Text_Input')  

     # A branch
    conv_a = Conv2D(filters=100,
                    kernel_size=(1,20),
                    strides=(1,1),
                    activation='relu',
                    name="BranchA"
                   )(text_in)
    
    conv_a = BatchNormalization(axis=-1)(conv_a)

    conv_a_rs = Reshape((MAX_SEQUENCE_LENGTH,100,1))(conv_a)

    conv_a_1 = Conv2D(filters = 200,
                    kernel_size = (1,100),
                    strides=(1,1),
                    activation = 'relu',
                    name="BranchA1"
                    )(conv_a_rs)

    pooled_conv_a_1 = MaxPooling2D(pool_size=(conv_a_1.shape[1], 1), padding='valid')(conv_a_1)
#     pooled_conv_a_1 = GlobalMaxPooling2D()(conv_a_1)
    pooled_conv_a_1 = Flatten()(pooled_conv_a_1)

    conv_a_2 = Conv2D(filters = 200,
                    kernel_size = (2,100),
                    activation = 'relu',
                    strides=(1,1),
                    name="BranchA2"
                    )(conv_a_rs)

    pooled_conv_a_2 = MaxPooling2D(pool_size=(conv_a_2.shape[1], 1), padding='valid')(conv_a_2)
#     pooled_conv_a_2 = GlobalMaxPooling2D()(conv_a_2)
    pooled_conv_a_2 = Flatten()(pooled_conv_a_2)

    conv_a_3 = Conv2D(filters = 200,
                    kernel_size = (3,100),
                    activation = 'relu',
                    strides=(1,1),
                    name="BranchA3"
                    )(conv_a_rs)

    pooled_conv_a_3 = MaxPooling2D(pool_size=(conv_a_3.shape[1], 1), padding='valid')(conv_a_3)
#     pooled_conv_a_3 = GlobalMaxPooling2D()(conv_a_3)
    pooled_conv_a_3 = Flatten()(pooled_conv_a_3)
    
    A = Concatenate(axis=-1)([pooled_conv_a_1,pooled_conv_a_2])
    A = Concatenate(axis=-1)([A,pooled_conv_a_3])
    
    # B branch
    conv_b = Conv2D(filters=100,
                    kernel_size=(2,20),
                    activation='relu',
                    strides=(1,1),
                    name="BranchB"
                    )(text_in)

    conv_b = BatchNormalization(axis=-1)(conv_b)

    conv_b_rs = Reshape((MAX_SEQUENCE_LENGTH-1,100,1))(conv_b)

    conv_b_1 = Conv2D(filters = 200,
                    kernel_size = (1,100),
                    activation = 'relu',
                    strides=(1,1),
                    name="BranchB1"
                    )(conv_b_rs)

    pooled_conv_b_1 = MaxPooling2D(pool_size=(conv_b_1.shape[1], 1), padding='valid')(conv_b_1)
#     pooled_conv_b_1 = GlobalMaxPooling2D()(conv_b_1)
    pooled_conv_b_1 = Flatten()(pooled_conv_b_1)

    conv_b_2 = Conv2D(filters = 200,
                    kernel_size = (2,100),
                    activation = 'relu',
                    strides=(1,1),
                    name="BranchB2"
                    )(conv_b_rs)

    pooled_conv_b_2 = MaxPooling2D(pool_size=(conv_b_2.shape[1], 1), padding='valid')(conv_b_2)
#     pooled_conv_b_2 = GlobalMaxPooling2D()(conv_b_2)
    pooled_conv_b_2 = Flatten()(pooled_conv_b_2)

    conv_b_3 = Conv2D(filters = 200,
                    kernel_size = (3,100),
                    activation = 'relu',
                    strides=(1,1),
                    name="BranchB3"
                    )(conv_b_rs)

    pooled_conv_b_3 = MaxPooling2D(pool_size=(conv_b_3.shape[1], 1), padding='valid')(conv_b_3)
#     pooled_conv_b_3 = GlobalMaxPooling2D()(conv_b_3)
    pooled_conv_b_3 = Flatten()(pooled_conv_b_3)
    
    B = Concatenate(axis=-1)([pooled_conv_b_1,pooled_conv_b_2])
    B = Concatenate(axis=-1)([B,pooled_conv_b_3])

    # C branch
    conv_c = Conv2D(filters=100,
                    kernel_size=(3,20),
                    activation='relu',
                    strides=(1,1),
                    name="BranchC"
                    )(text_in)
    conv_c = BatchNormalization(axis=-1)(conv_c)

    conv_c_rs = Reshape((MAX_SEQUENCE_LENGTH-2,100,1))(conv_c)

    conv_c_1 = Conv2D(filters = 200,
                    kernel_size = (1,100),
                    activation = 'relu',
                    strides=(1,1),
                    name="BranchC1"
                    )(conv_c_rs)


    pooled_conv_c_1 = MaxPooling2D(pool_size=(conv_c_1.shape[1], 1), padding='valid')(conv_c_1)
#     pooled_conv_c_1 = GlobalMaxPooling2D()(conv_c_1)
    pooled_conv_c_1 = Flatten()(pooled_conv_c_1)

    conv_c_2 = Conv2D(filters = 200,
                    kernel_size = (2,100),
                    activation = 'relu',
                    strides=(1,1),
                    name="BranchC2"
                    )(conv_c_rs)

    pooled_conv_c_2 = MaxPooling2D(pool_size=(conv_c_2.shape[1], 1), padding='valid')(conv_c_2)
#     pooled_conv_c_2 = GlobalMaxPooling2D()(conv_c_2)
    pooled_conv_c_2 = Flatten()(pooled_conv_c_2)

    conv_c_3 = Conv2D(filters = 200,
                    kernel_size = (3,100),
                    activation = 'relu',
                    strides=(1,1),
                    name="BranchC3"
                    )(conv_c_rs)

    pooled_conv_c_3 = MaxPooling2D(pool_size=(conv_c_3.shape[1], 1), padding='valid')(conv_c_3)
#     pooled_conv_c_3 = GlobalMaxPooling2D()(conv_c_3)
    pooled_conv_c_3 = Flatten()(pooled_conv_c_3)
    
    C = Concatenate(axis=-1)([pooled_conv_c_1,pooled_conv_c_2])
    C = Concatenate(axis=-1)([C,pooled_conv_c_3])

    conv_concat = Concatenate(axis=-1)([A,B])
    conv_concat = Concatenate(axis=-1)([conv_concat,C])

    link_model = Model(inputs = [text_in], outputs = [conv_concat], name = 'DC-CNN_Model')

#     issue_model.summary()
    return link_model

### Complete Model

In [21]:
def get_model():
    text_in_a = Input(shape = (MAX_SEQUENCE_LENGTH,), dtype='int32', name="Text_issue_a")
    text_in_b = Input(shape = (MAX_SEQUENCE_LENGTH,), dtype='int32', name="Text_issue_b")
    
    text_out_a = text_embedding([text_in_a])
    text_out_b = text_embedding([text_in_b])
    
    text_out_a = Reshape((MAX_SEQUENCE_LENGTH,20,1))(text_out_a)
    text_out_b = Reshape((MAX_SEQUENCE_LENGTH,20,1))(text_out_b)

    link = Concatenate(axis=-1)([text_out_a, text_out_b])
    
    print(link.shape)
    
    link_model = DCCNN_Model()
    
    merged_vector = link_model([link])

    concat = Dropout(0.5)(merged_vector)
    concat = Dense(units = 512, 
                activation = 'relu',
                )(concat)
    concat = BatchNormalization(axis=-1)(concat)
    
    concat = Dropout(0.5)(concat)
    concat = Dense(units = 256, 
                activation = 'relu',
                )(concat)
    concat = BatchNormalization(axis=-1)(concat)
    
    concat = Dropout(0.5)(concat)
    concat = Dense(units = 128, 
                activation = 'relu',
                )(concat)
    concat = BatchNormalization(axis=-1)(concat)
    
    concat = Dropout(0.5)(concat)
    concat = Dense(units = 64, 
                activation = 'relu',
                )(concat)
    concat = BatchNormalization(axis=-1)(concat)
       
    model = Model(inputs=[text_in_a, text_in_b], outputs=concat)    
    return model

### Train Model

In [22]:
train, test = train_test_split(all_data, test_size = 0.2, random_state = 9)
print(len(train))
print(len(test))

98529
24633


In [23]:
def train_model(dataset):
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=2)
    
    class_weight = compute_class_weight(
                      class_weight='balanced',
                      classes=range(len(id_to_category)),
                      y=dataset['label']
                    )

    model = get_model()
    
    model.compile(optimizer='adam',
#                       Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0), 
                      loss=tfa.losses.TripletSemiHardLoss())

    train_issue_1 = dataset['text_emb_1']
    train_issue_1 = np.array(train_issue_1.values.tolist())

    train_issue_2 = dataset['text_emb_2']
    train_issue_2 = np.array(train_issue_2.values.tolist())
    
    history = model.fit([train_issue_1, train_issue_2], 
                        y=dataset['label'], 
                            callbacks=[callback], 
                            validation_split=0.1, 
#                         class_weight = dict(enumerate(class_weight)), 
                        batch_size=128, epochs=64, verbose=2)
    
    plot_history(history)
    
    return model

In [None]:
%%time
class_model = train_model(train)

(None, 297, 20, 2)
Instructions for updating:
Colocations handled automatically by placer.
Train on 88676 samples, validate on 9853 samples
Epoch 1/64


2022-02-24 10:44:06.564320: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-02-24 10:44:07.119411: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22841 MB memory:  -> device: 0, name: Quadro RTX 6000, pci bus id: 0000:3e:00.0, compute capability: 7.5
2022-02-24 10:44:10.062331: I tensorflow/stream_executor/cuda/cuda_dnn.cc:366] Loaded cuDNN version 8302
  updates = self.state_updates


88676/88676 - 1062s - loss: 0.8449 - val_loss: 0.7336 - 1062s/epoch - 12ms/sample
Epoch 2/64
88676/88676 - 1042s - loss: 0.7770 - val_loss: 0.7379 - 1042s/epoch - 12ms/sample
Epoch 3/64
88676/88676 - 1043s - loss: 0.7431 - val_loss: 0.7261 - 1043s/epoch - 12ms/sample
Epoch 4/64
88676/88676 - 1042s - loss: 0.7106 - val_loss: 0.6947 - 1042s/epoch - 12ms/sample
Epoch 5/64
88676/88676 - 1042s - loss: 0.6851 - val_loss: 0.6823 - 1042s/epoch - 12ms/sample
Epoch 6/64
88676/88676 - 1042s - loss: 0.6595 - val_loss: 0.6816 - 1042s/epoch - 12ms/sample
Epoch 7/64
88676/88676 - 1042s - loss: 0.6371 - val_loss: 0.6364 - 1042s/epoch - 12ms/sample
Epoch 8/64
88676/88676 - 1043s - loss: 0.6158 - val_loss: 0.6219 - 1043s/epoch - 12ms/sample
Epoch 9/64
88676/88676 - 1043s - loss: 0.5997 - val_loss: 0.6170 - 1043s/epoch - 12ms/sample
Epoch 10/64


## Prediction

In [None]:
issue_1_train = train['text_emb_1']
issue_1_train = np.array(issue_1_train.values.tolist())

issue_2_train = train['text_emb_2']
issue_2_train = np.array(issue_2_train.values.tolist())

results_train = class_model.predict([issue_1_train, issue_2_train])

In [None]:
issue_1_test = test['text_emb_1']
issue_1_test = np.array(issue_1_test.values.tolist())

issue_2_test = test['text_emb_2']
issue_2_test = np.array(issue_2_test.values.tolist())

results_test = class_model.predict([issue_1_test, issue_2_test])

In [None]:
np.save(file='embeddings/train_embed_LT_'+SOURCE+'_DCCNN.npy', arr=results_train)
np.save(file='embeddings/test_embed_LT_'+SOURCE+'_DCCNN.npy', arr=results_test)

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=25, random_state=0)

clf.fit(results_train, train['label'])

In [None]:
probs = clf.predict_proba(results_test)
preds = clf.predict(results_test)

test['preds_SVM'] = preds

In [None]:
train[['name', 'linktype', 'issue_id_1', 'issue_id_2', 'mappedtype', 'label']].to_csv('embeddings/train_df_LT'+SOURCE+'_DCCNN.csv')
test[['name', 'linktype', 'issue_id_1', 'issue_id_2', 'mappedtype', 'label', 'preds_SVM']].to_csv('embeddings/test_df_LT'+SOURCE+'_DCCNN.csv')

In [None]:
class_rep = classification_report(test['label'], test['preds_SVM'], output_dict=True, target_names=category_id_df.mappedtype.to_list())
class_rep_df = pd.DataFrame(class_rep).transpose()
print(class_rep_df)

conf_mat = confusion_matrix(test['preds_SVM'], test['label'])
conf_mat_df = pd.DataFrame(conf_mat).transpose()
conf_mat_df.rename(index=id_to_category, inplace=True)
conf_mat_df.rename(columns=id_to_category, inplace=True)
print(conf_mat_df)

class_rep_df.to_csv('results/class_rep_LT_'+SOURCE+"_DCCNN.csv")
conf_mat_df.to_csv('results/conf_mat_LT_'+SOURCE+"_DCCNN.csv")