## Import Libraries

In [1]:
import numpy as np
import pandas as pd

from keras.layers import Conv1D, Dense, Embedding, Flatten, MaxPooling1D
from keras.models import Sequential
from keras.preprocessing import text, sequence

Using TensorFlow backend.


## Reading Trace Matrix - EasyClinic Project

In [2]:
# Load the EasyClinic trace matrix from its CSV file.
trace_df = pd.read_csv(
    '../data/EasyClinic/EasyClinicDataset/oracle/output/trace_matrix.csv'
)

# Preview the first rows (5 is the pandas default for head()).
trace_df.head(n=5)

Unnamed: 0,artf_1,artf_2,link
0,CC_114_SRC,CC_123_TRG,0
1,CC_114_SRC,CC_124_TRG,0
2,CC_114_SRC,CC_122_TRG,0
3,CC_114_SRC,CC_126_TRG,0
4,CC_114_SRC,CC_127_TRG,0


In [3]:
# Count how many artifact pairs are labelled as linked (1) vs. not linked (0).
# `.get(label, 0)` keeps the cell working when one of the two classes is
# absent, where plain `counts[label]` would raise a KeyError.
counts = trace_df.link.value_counts()
no_link_count = int(counts.get(0, 0))
link_count = int(counts.get(1, 0))
total = no_link_count + link_count

# Class proportions as percentages; an empty trace matrix yields 0.0 instead
# of a division by zero.
no_link_prop = no_link_count / total * 100 if total else 0.0
link_prop = link_count / total * 100 if total else 0.0

print('No Links Amount: {}'.format(no_link_count))
print('Links Amount: {}'.format(link_count))
print('----')
print('Total Combinations: {}'.format(total))
print('----')
# '.3g' keeps three significant digits like the previous '{:2.3}' spec, but
# renders 100.0 as '100' rather than switching to scientific notation
# ('1e+02').
print('No Link Prop: {:.3g}%'.format(no_link_prop))
print('Link Prop: {:.3g}%'.format(link_prop))

No Links Amount: 23528
Links Amount: 952
----
Total Combinations: 24480
----
No Link Prop: 96.1%
Link Prop: 3.89%


## Reading Artifacts Description Dataframe

In [4]:
# Load the artifact descriptions from their CSV file.
artifacts_df = pd.read_csv(
    '../data/EasyClinic/EasyClinicDataset/oracle/output/artifacts_descriptions.csv'
)

# Report the (rows, columns) shape before previewing the frame.
shape_text = str(artifacts_df.shape)
print("Artifacts Dataframe Shape: " + shape_text)

artifacts_df.head(n=5)

Artifacts Dataframe Shape: (313, 2)


Unnamed: 0,artf_name,artf_description
0,CC_114_SRC,Class GUILogin Date: 18/09/2003 \r\n ...
1,CC_115_SRC,Class GUIPrincipale Date: 18/09/2003 \r\n ...
2,CC_116_SRC,Class GUIAnagrafica Date: 18/09/2003 \r\n ...
3,CC_117_SRC,Class GUIPrenotazioni Date: 18/09/2003 \r\n ...
4,CC_118_SRC,Class GUICartellaClinica Date: 18/09/2003 \n ...


## Preprocess Text to Create Word Embedding

In [6]:
# Extract the description texts as an array, substituting the placeholder
# "Invalid" for any missing description.
list_sentences_train = (
    artifacts_df.loc[:, "artf_description"]
    .fillna(value="Invalid")
    .values
)
list_sentences_train.shape

(313,)

In [15]:
# Upper bound on the vocabulary size handed to the tokenizer.
MAX_FEATURES = 20000

# Longest description, measured in whitespace-separated tokens.
# NOTE(review): this count comes from str.split(); the Keras tokenizer may
# split the same text differently (e.g. around punctuation) - confirm that no
# sequence gets truncated when this value is used as the padding length.
MAX_LEN = max(len(sentence.split()) for sentence in list_sentences_train)

In [16]:
# Fit the tokenizer vocabulary on every artifact description.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts([sentence for sentence in list_sentences_train])

# Mapping from each word to the integer index assigned by the tokenizer.
word_index = tokenizer.word_index
unique_token_count = len(word_index)
print('Found %s unique tokens.' % unique_token_count)

# Training data: encode each description as a sequence of word indices, then
# pad the sequences to a common length of MAX_LEN.
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = sequence.pad_sequences(sequences=list_tokenized_train, maxlen=MAX_LEN)
X_t.shape

Found 1326 unique tokens.


(313, 698)

## Model

In [None]:
# Define the model: embedding -> 1D convolution -> max pooling -> dense head
# ending in a single sigmoid unit.

# The embedding needs one row per index the tokenizer can emit. Keras'
# Tokenizer numbers words from 1 (0 is left for padding) and only emits
# indices below `num_words`, so the input dimension is bounded by both values.
vocab_size = min(MAX_FEATURES, len(word_index) + 1)
# Input sequences are padded to MAX_LEN, so that is the embedding input length.
max_length = MAX_LEN

model = Sequential()
model.add(Embedding(vocab_size, 400, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()