### Author: Satwik Ram K

### Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path

### Loading Dataset

In [2]:
path = Path(r"/mnt/ml-fileshare/GitHub/dewatson/data/input/consolidated_data_20211012.xlsx")

In [3]:
dataset = pd.read_excel(path, engine = "openpyxl", usecols = ["Clean Description", "DO Type", "Priority"])

In [4]:
dataset.columns

Index(['DO Type', 'Priority', 'Clean Description'], dtype='object')

In [5]:
dataset.head(3)

Unnamed: 0,DO Type,Priority,Clean Description
0,Obligation,Moderate,"In performing the Services, IBM shall follow t..."
1,Obligation,Moderate,"In performing the Services, IBM shall follow t..."
2,Obligation,Moderate,"In performing the Services, IBM shall follow t..."


### Dropping Missing Values

In [6]:
dataset.dropna(inplace = True)

### Cleaning up the Priority labels

In [7]:
def priority(x):

    if x in ["3 - Moderate", "moderate"]:
        return "Moderate"

    elif x in ["2 - High", "2-High", "high"]:
        return "High"
    
    elif x in ["4 - Low", "low"]:
        return "Low"

    elif x in ["1 - Critical", "critical"]:
        return "Critical"

    else: return x

In [8]:
dataset["Priority"] =  dataset["Priority"].apply(lambda x: priority(x))

In [9]:
dataset["Priority"].unique()

array(['Moderate', 'High', 'Low', 'Critical', 'TBD', 'All'], dtype=object)

### Taking Each Class samples

In [10]:
d = dataset[dataset["DO Type"] == "Deliverable"].sample(frac = 0.2, random_state = 5).head(5)
o = dataset[dataset["DO Type"] == "Obligation"].sample(frac = 0.2, random_state = 5).head(5)
n = dataset[dataset["DO Type"] == "Neither"].sample(frac = 1, random_state = 5).head(5)
cd = dataset[dataset["DO Type"] == "Critical Deliverable"].sample(frac = 0.2, random_state = 5).head(5)
p1 = dataset[dataset["Priority"] == "High"].sample(frac = 0.2, random_state = 5).head(5)
p2 = dataset[dataset["Priority"] == "Moderate"].sample(frac = 0.2, random_state = 5).head(5)
p3 = dataset[dataset["Priority"] == "Low"].sample(frac = 0.2, random_state = 5).head(5)

### Combining the dataset

In [11]:
data = pd.concat([d, o, n, cd, p1, p2, p3])

In [12]:
data["DO Type"].unique()

array(['Deliverable', 'Obligation', 'Critical Deliverable'], dtype=object)

### Taking X, Y1, and Y2

In [13]:
X = data["Clean Description"]
y1 = data["DO Type"]
y2 = data["Priority"]

### Tokenization

In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
vocab_size = 10000
embedding_dim = 16
max_length = 150

In [16]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token = "<oov>")

In [17]:
tokenizer.fit_on_texts(X)

In [18]:
X = tokenizer.texts_to_sequences(X)

In [19]:
X = pad_sequences(X, maxlen = max_length, truncating = "post")

### Label Encoding the targets

In [20]:
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
le2 = LabelEncoder()

In [21]:
y1 = le1.fit_transform(y1)
y2 = le2.fit_transform(y2)

In [22]:
le1.classes_

array(['Critical Deliverable', 'Deliverable', 'Obligation'], dtype=object)

In [23]:
le2.classes_

array(['Critical', 'High', 'Low', 'Moderate'], dtype=object)

### Building the model

In [24]:
def build_do_type(inputs):     
    
    x = tf.keras.layers.Embedding(10000, 16, input_length = 150)(inputs)

    x = tf.keras.layers.GlobalAveragePooling1D()(x)

    x = tf.keras.layers.Dense(512, activation = "relu")(x)

    x = tf.keras.layers.Dense(3, activation = "softmax", name = "do_outputs")(x)

    return x

def build_priority(inputs):

    x = tf.keras.layers.Embedding(10000, 16, input_length = 150)(inputs)

    x = tf.keras.layers.GlobalAveragePooling1D()(x)

    x = tf.keras.layers.Dense(512, activation = "relu")(x)

    x = tf.keras.layers.Dense(4, activation = "softmax", name = "priority_outputs")(x)

    return x


In [25]:
inputs = tf.keras.Input(shape = (150,), dtype='int32')

model1 = build_do_type(inputs)
model2 = build_priority(inputs)

### Merging two models into single!

In [26]:
model = tf.keras.Model(
    inputs = inputs,
    outputs = [model1, model2],
    name = "multi-output")

### Model Summary

In [27]:
model.summary()

Model: "multi-output"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 150)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 150, 16)      160000      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 16)      160000      input_1[0][0]                    
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 16)           0           embedding[0][0]                  
_______________________________________________________________________________________

### Defining the loss functions

In [28]:
losses = {
	"do_outputs": "sparse_categorical_crossentropy",
	"priority_outputs": "sparse_categorical_crossentropy",
}

### Model Compiling

In [29]:
model.compile(optimizer = "adam", loss = losses, metrics = ["accuracy"])

In [30]:
y2, y1

(array([3, 3, 1, 3, 1, 3, 1, 3, 3, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3, 3,
        3, 3, 3, 2, 2, 2, 2, 2]),
 array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 1, 2, 2]))

### Model Training

In [31]:
history =  model.fit(x = X, y = {"do_outputs": y1, "priority_outputs": y2}, epochs = 20, verbose = 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Model Prediction

In [32]:
y_pred = model.predict(X)

In [33]:
do_output = y_pred[0]
priority_output = y_pred[1]

In [34]:
do_output = np.argmax(do_output, axis = -1)
priority_output = np.argmax(priority_output, axis = -1)

In [35]:
do_output, y1

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]),
 array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 1, 2, 2]))

In [36]:
priority_output, y2

(array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3]),
 array([3, 3, 1, 3, 1, 3, 1, 3, 3, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 3, 3,
        3, 3, 3, 2, 2, 2, 2, 2]))

### Classification Report

In [37]:
from sklearn.metrics import classification_report

In [38]:
print(classification_report(y1, do_output))

print(classification_report(y2, priority_output))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.50      1.00      0.67        15
           2       0.00      0.00      0.00        10

    accuracy                           0.50        30
   macro avg       0.17      0.33      0.22        30
weighted avg       0.25      0.50      0.33        30

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         5
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         5
           3       0.37      1.00      0.54        11

    accuracy                           0.37        30
   macro avg       0.09      0.25      0.13        30
weighted avg       0.13      0.37      0.20        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
