In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import time
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from multiprocessing import Pool
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection  import train_test_split
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.svm import SVC
import string
import pickle
from tqdm import tqdm_notebook

In [4]:
x = pd.read_csv('data/x.csv')
x = x['utterance']
y = pd.read_csv('data/y.csv')

##### X

In [5]:
words = ''.join(x).lower().split()

#words = [ word.lower() for word in words if len(word)>1 and word.isalpha()]
vocab = set(words)
word2int = dict((c,i) for i,c in enumerate(vocab))
int2word = dict((i,c) for i,c in enumerate(vocab))

#word2vector
x_vec = []
for u in tqdm_notebook(x):
    vec = []
    for w in u.split():
        try:
            vec.append(word2int[w.lower()])
        except:
            continue
    x_vec.append(vec)

n = None
x_vec = pad_sequences(x_vec, maxlen=n)

In [20]:
m = len(x_vec)
d = 5 #
embedding_layer = layers.Embedding(np.max(x_vec)+1, d)
x_embed = embedding_layer(x_vec)
x_embed.shape

TensorShape([10020, 869, 5])

##### y

In [58]:
y_ = y[['O', 'CQ', 'PA', 'OQ', 'PF', 'FD', 'GG', 'IR', 'RQ', 'NF', 'FQ', 'JK']]
y_.values

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [59]:
x_train, x_val, y_train, y_val = train_test_split(x_embed.numpy(), y_.values, test_size=0.1)
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.11)

In [94]:
def generate_model(n_filter = 20,n = 869, f=5, d=5, f_p=2):
    inputs = keras.Input(shape=(n,d), name='digits')
    x = tf.reshape(inputs, [-1, n, d, 1]) 
    x = layers.Conv2D(n_filter, [f,d], name='conv_1')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu", name="relu_1")(x)
    x = layers.MaxPool2D(pool_size=(2,1), name="pool_1")(x)
    x = layers.Conv2D(n_filter, [f,1], name='conv_2')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu", name="relu_2")(x)
    x = layers.MaxPool2D(pool_size=((n+1-f)/2+1-f,1), name="pool_2")(x)
    x = layers.Flatten()(x)
    outputs = layers.Dense(12, activation='sigmoid', name='dense_1')(x)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

model = generate_model()
# Instantiate an optimizer.
optimizer = keras.optimizers.Adam()
# Instantiate a loss function.
loss_fn = keras.losses.CategoricalHinge()
# Prepare the training dataset.
batch_size = 20
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_dataset = train_dataset.shuffle(buffer_size=10000).batch(batch_size)

In [95]:
model.summary()

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
digits (InputLayer)          [(None, 869, 5)]          0         
_________________________________________________________________
tf_op_layer_Reshape_15 (Tens [(None, 869, 5, 1)]       0         
_________________________________________________________________
conv_1 (Conv2D)              (None, 865, 1, 20)        520       
_________________________________________________________________
batch_normalization (BatchNo (None, 865, 1, 20)        80        
_________________________________________________________________
relu_1 (Activation)          (None, 865, 1, 20)        0         
_________________________________________________________________
pool_1 (MaxPooling2D)        (None, 432, 1, 20)        0         
_________________________________________________________________
conv_2 (Conv2D)              (None, 428, 1, 20)        202

In [97]:
epochs = 40
test_acc = []

inter_mediate_stats = []

for epoch in range(epochs):
    print('Start of epoch %d' % (epoch,))

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):

    # Open a GradientTape to record the operations run
    # during the forward pass, which enables autodifferentiation.
        with tf.GradientTape() as tape:

            # Run the forward pass of the layer.
            # The operations that the layer applies
            # to its inputs are going to be recorded
            # on the GradientTape.
            logits = model(x_batch_train, training=True)  # Logits for this minibatch
            
            # Compute the loss value for this minibatch.
            # print(y_batch_train.shape, logits.shape)
            loss_value = loss_fn(y_batch_train, logits,)
        
        # Use the gradient tape to automatically retrieve
        # the gradients of the trainable variables with respect to the loss.
        grads = tape.gradient(loss_value, model.trainable_weights)

        # Run one step of gradient descent by updating
        # the value of the variables to minimize the loss.
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
    
        # Log every 100 batches.
        if step % 50 == 0:
            #print('Training loss (for one batch) at step %s: %s' % (step, float(loss_value)))
            #print('Seen so far: %s samples' % ((step + 1) * batch_size))

            logits_test = model(x_test, training=False)
            loss_test = loss_fn(y_test, logits_test)
            m = tf.keras.metrics.CategoricalAccuracy()
            _ = m.update_state(y_test, logits_test)
            test_acc.append(float(m.result().numpy()))

            print('Test accuracy at step %s: %s' % (step, float(m.result().numpy())))


Start of epoch 0
Test accuracy at step 0: 0.22379031777381897
Test accuracy at step 50: 0.2177419364452362
Test accuracy at step 100: 0.2016129046678543
Test accuracy at step 150: 0.21169355511665344
Test accuracy at step 200: 0.22379031777381897
Test accuracy at step 250: 0.2036290317773819
Test accuracy at step 300: 0.20967741310596466
Test accuracy at step 350: 0.2167338728904724
Test accuracy at step 400: 0.24596774578094482
Start of epoch 1
Test accuracy at step 0: 0.24596774578094482
Test accuracy at step 50: 0.2338709682226181
Test accuracy at step 100: 0.22278225421905518
Test accuracy at step 150: 0.2479838728904724
Test accuracy at step 200: 0.06552419066429138
Test accuracy at step 250: 0.23084677755832672
Test accuracy at step 300: 0.20564515888690948
Test accuracy at step 350: 0.2489919364452362
Test accuracy at step 400: 0.19556452333927155
Start of epoch 2
Test accuracy at step 0: 0.1895161271095276
Test accuracy at step 50: 0.2530241906642914
Test accuracy at step 100: 

Test accuracy at step 400: 0.1713709682226181
Start of epoch 19
Test accuracy at step 0: 0.1723790317773819
Test accuracy at step 50: 0.25806450843811035
Test accuracy at step 100: 0.2510080635547638
Test accuracy at step 150: 0.25907257199287415
Test accuracy at step 200: 0.2510080635547638
Test accuracy at step 250: 0.24596774578094482
Test accuracy at step 300: 0.24697580933570862
Test accuracy at step 350: 0.22479838132858276
Test accuracy at step 400: 0.25705644488334656
Start of epoch 20
Test accuracy at step 0: 0.25705644488334656
Test accuracy at step 50: 0.24596774578094482
Test accuracy at step 100: 0.2348790317773819
Test accuracy at step 150: 0.2479838728904724
Test accuracy at step 200: 0.21875
Test accuracy at step 250: 0.22883065044879913
Test accuracy at step 300: 0.2328629046678543
Test accuracy at step 350: 0.2358870953321457
Test accuracy at step 400: 0.20866934955120087
Start of epoch 21
Test accuracy at step 0: 0.20564515888690948
Test accuracy at step 50: 0.212701

Test accuracy at step 400: 0.2479838728904724
Start of epoch 38
Test accuracy at step 0: 0.2479838728904724
Test accuracy at step 50: 0.2479838728904724
Test accuracy at step 100: 0.23891128599643707
Test accuracy at step 150: 0.23991934955120087
Test accuracy at step 200: 0.2510080635547638
Test accuracy at step 250: 0.24495968222618103
Test accuracy at step 300: 0.2489919364452362
Test accuracy at step 350: 0.2489919364452362
Test accuracy at step 400: 0.2489919364452362
Start of epoch 39
Test accuracy at step 0: 0.2489919364452362
Test accuracy at step 50: 0.24495968222618103
Test accuracy at step 100: 0.24294355511665344
Test accuracy at step 150: 0.24596774578094482
Test accuracy at step 200: 0.25
Test accuracy at step 250: 0.24495968222618103
Test accuracy at step 300: 0.24596774578094482
Test accuracy at step 350: 0.2530241906642914
Test accuracy at step 400: 0.2479838728904724


In [103]:
logits_test

<tf.Tensor: shape=(992, 12), dtype=float32, numpy=
array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>