In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from string import punctuation
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt

In [2]:
def preprocess(text):
    tokens = text.lower().split()
    tokens = [token.strip(punctuation) for token in tokens]
    return tokens

In [3]:
from tensorflow.keras import backend as K
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [4]:
quora = pd.read_csv('quora.csv')

In [5]:
vocab = Counter()

for text in quora.question_text:
    vocab.update(preprocess(text))

In [6]:
filtered_vocab = set()

for word in vocab:
    if vocab[word] > 5:
        filtered_vocab.add(word)

In [7]:
word2id = {'UNK':1, 'PAD':0}

for word in filtered_vocab:
    word2id[word] = len(word2id)

In [8]:
id2word = {i:word for word, i in word2id.items()}

In [9]:
X = []

for text in quora.question_text:
    tokens = preprocess(text)
    ids = [word2id.get(token, 1) for token in tokens]
    X.append(ids)

In [10]:
MAX_LEN = max(len(x) for x in X)
MEAN_LEN = np.median([len(x) for x in X])
MAX_LEN, MEAN_LEN

(134, 11.0)

In [11]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_LEN)
y = quora.target.values
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, stratify=y)

In [12]:
checkpoint = \
    tf.keras.callbacks.ModelCheckpoint(
        'model.weights', monitor='val_f1', verbose=1,
        save_weights_only=True, save_best_only=True,
        mode='max', save_freq='epoch',
    )

early_stop = \
    tf.keras.callbacks.EarlyStopping(
        monitor='val_f1', min_delta=0.01, patience=3,
        verbose=1, mode='max',
    )

In [19]:
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))

embeddings = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=10)(inputs, )

conv1 = tf.keras.layers.Conv1D(kernel_size=10, filters=32)(embeddings)

concat = tf.keras.layers.Flatten()(conv1)
dense = tf.keras.layers.Dense(64, activation='relu')(concat)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[f1],
)

In [20]:
model.fit(
    X_train, y_train, 
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=10,
    callbacks=[checkpoint, early_stop],
)

Epoch 1/10
Epoch 00001: val_f1 did not improve from 0.61353
Epoch 2/10
Epoch 00002: val_f1 did not improve from 0.61353
Epoch 3/10
Epoch 00003: val_f1 did not improve from 0.61353
Epoch 4/10
Epoch 00004: val_f1 did not improve from 0.61353
Epoch 5/10
Epoch 00005: val_f1 did not improve from 0.61353
Epoch 00005: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f14c8fc6e80>

In [28]:
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))

embeddings = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=10)(inputs, )

conv1 = tf.keras.layers.Conv1D(kernel_size=10, filters=32)(embeddings)
conv2 = tf.keras.layers.Conv1D(kernel_size=1, filters=128, strides=2, activation='relu')(conv1)

concat = tf.keras.layers.Flatten()(conv2)
dense = tf.keras.layers.Dense(64, activation='relu')(concat)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[f1],
)
model.fit(
    X_train, y_train, 
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=10,
    callbacks=[checkpoint, early_stop],
)

Epoch 1/10
Epoch 00001: val_f1 did not improve from 0.61429
Epoch 2/10
Epoch 00002: val_f1 did not improve from 0.61429
Epoch 3/10
Epoch 00003: val_f1 did not improve from 0.61429
Epoch 4/10
Epoch 00004: val_f1 did not improve from 0.61429
Epoch 5/10
Epoch 00005: val_f1 did not improve from 0.61429
Epoch 6/10
Epoch 00006: val_f1 did not improve from 0.61429
Epoch 7/10
Epoch 00007: val_f1 did not improve from 0.61429
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f13c81c5780>

In [14]:
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))

embeddings = tf.keras.layers.Embedding(input_dim=len(word2id), output_dim=5)(inputs, )

conv1 = tf.keras.layers.Conv1D(kernel_size=10, filters=32)(embeddings)
conv2 = tf.keras.layers.Conv1D(kernel_size=1, filters=128, strides=2, activation='relu')(conv1)

concat = tf.keras.layers.Flatten()(conv2)
dense = tf.keras.layers.Dense(64, activation='relu')(concat)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=[f1],
)
model.fit(
    X_train, y_train, 
    validation_data=(X_valid, y_valid),
    batch_size=256,
    epochs=10,
    callbacks=[checkpoint, early_stop],
)

Epoch 1/10


UnknownError: 2 root error(s) found.
  (0) Unknown:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node model_1/conv1d_2/conv1d (defined at <ipython-input-14-a692864e4232>:24) ]]
	 [[gradient_tape/model_1/embedding_1/embedding_lookup/Reshape/_36]]
  (1) Unknown:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node model_1/conv1d_2/conv1d (defined at <ipython-input-14-a692864e4232>:24) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_2273]

Function call stack:
train_function -> train_function
