<a href="https://colab.research.google.com/github/Vakhranev/Compling/blob/master/%D0%A1%D0%B2%D1%91%D1%80%D1%82%D0%BE%D1%87%D0%BD%D1%8B%D0%B5_%D0%BD%D0%B5%D0%B9%D1%80%D0%BE%D0%BD%D0%BD%D1%8B%D0%B5_%D1%81%D0%B5%D1%82%D0%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!unzip data.zip

Archive:  data.zip
  inflating: quora.csv               
  inflating: __MACOSX/._quora.csv    


In [8]:
!pip install pandas scikit-learn matplotlib



In [0]:
import tensorflow as tf

In [0]:
import pandas as pd
import numpy as np
from string import punctuation
from sklearn.model_selection import train_test_split
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
from tensorflow.keras import backend as K
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [0]:
quora = pd.read_csv('quora.csv')

In [0]:
vocab = Counter()

for sym in quora.question_text:
    sym = sym.lower()
    vocab.update(sym)

In [14]:
len(vocab)

1949

In [0]:
vocab

In [0]:
filtered_vocab = set()

for sym in vocab:
    if vocab[sym] > 5:
        filtered_vocab.add(sym)

In [17]:
len(filtered_vocab)

413

In [0]:
sym2id = {'UNK':1, 'PAD':0}

for sym in filtered_vocab:
    sym2id[sym] = len(sym2id)

In [0]:
sym2id

In [0]:
id2sym = {i:sym for sym, i in sym2id.items()}

In [0]:
id2sym

In [0]:
X = []

for text in quora.question_text:
    text = text.lower()
    ids = [sym2id.get(sym, 1) for sym in text]
    X.append(ids)

In [0]:
X

In [0]:
MAX_LEN = max(len(x) for x in X)

In [0]:
MEAN_LEN = np.median([len(x) for x in X])

In [26]:
MAX_LEN, MEAN_LEN

(1017, 60.0)

In [0]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=MAX_LEN)

In [28]:
X.shape

(1306122, 1017)

In [0]:
y = quora.target.values

In [0]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.05, stratify=y)

In [0]:
checkpoint = tf.keras.callbacks.ModelCheckpoint('model.weights', 
                                                monitor='val_f1',
                                                verbose=1,
                                                save_weights_only=True,
                                                save_best_only=True,
                                                mode='max',
                                                save_freq='epoch'
                                               )

early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_f1', 
                                              min_delta=0.01,
                                              patience=3,
                                              verbose=1, 
                                              mode='max',
                                              )

In [0]:
inputs = tf.keras.layers.Input(shape=(MAX_LEN,))

embeddings = tf.keras.layers.Embedding(input_dim=len(sym2id), output_dim=100)(inputs)
drop1 = tf.keras.layers.Dropout(0.3)(embeddings)
conv1 = tf.keras.layers.Conv1D(kernel_size=3, filters=128, strides=1, 
                                               kernel_regularizer='l2',
                                              activation='relu')(embeddings)
conv2 = tf.keras.layers.Conv1D(kernel_size=5, filters=128, strides=2, 
                                               kernel_regularizer='l2',
                                              activation='relu')(conv1)

drop2 = tf.keras.layers.Dropout(0.5)(conv2)

flatten = tf.keras.layers.Flatten()(drop2)
dense = tf.keras.layers.Dense(50, activation='relu')(flatten)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=[f1])

In [0]:
model.fit(X_train, y_train, 
          validation_data=(X_valid, y_valid),
          batch_size=2000,
          epochs=10,
          callbacks=[checkpoint, early_stop])

Epoch 1/10
Epoch 00001: val_f1 improved from -inf to 0.00000, saving model to model.weights
Epoch 2/10
Epoch 00002: val_f1 improved from 0.00000 to 0.10958, saving model to model.weights
Epoch 3/10
Epoch 00003: val_f1 improved from 0.10958 to 0.20701, saving model to model.weights
Epoch 4/10
Epoch 00004: val_f1 improved from 0.20701 to 0.38433, saving model to model.weights
Epoch 5/10
Epoch 00005: val_f1 improved from 0.38433 to 0.43041, saving model to model.weights
Epoch 6/10
Epoch 00006: val_f1 did not improve from 0.43041
Epoch 7/10