In [1]:
import io
import re
import sys
import json
import math
import random
import string
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import logging
import multiprocessing
import gensim

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import activations

from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import Tokenizer

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split



In [2]:
# Root folder for the Word2Vec model, the dataset and the checkpoints used below
# (presumably a Google Drive mounted in Colab — confirm).
# It ends with "/" because the other cells build file paths by plain string concatenation.
PATH = "/content/drive/MyDrive/Colab Notebooks/ХакМэрМосквы2023/"

In [3]:
# Dataset label tag -> human-readable (Russian) class name, in class-index order.
classes = dict([
    ("__label__NORMAL", "Нормальный"),
    ("__label__INSULT", "Оскорбление"),
    ("__label__THREAT", "Угроза"),
    ("__label__OBSCENITY", "Непристойность"),
])

# Label tag -> integer index, following the insertion order of `classes`.
classes_key = {label: index for index, label in enumerate(classes)}


In [4]:
# Echo the label -> index mapping to check the assigned indices.
classes_key

{'__label__NORMAL': 0,
 '__label__INSULT': 1,
 '__label__THREAT': 2,
 '__label__OBSCENITY': 3}

In [5]:
# Load a previously saved gensim Word2Vec model; the text helpers below use `word2vec.wv`
# for vocabulary membership checks and vector lookup.
# NOTE(review): gensim's load() appears to rely on pickle — only load model files from a trusted source.
word2vec = Word2Vec.load(PATH + "Models/w2v/tweets_model.w2v")


In [6]:
def split(input_text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' ', outlen=50):
    """Tokenize a text into exactly `outlen` vocabulary words.

    The text is lower-cased, every character listed in `filters` is replaced
    by the `split` separator, and the result is cut on that separator. Only
    non-empty tokens present in the global `word2vec.wv` vocabulary are kept.

    Args:
        input_text: Raw text to tokenize.
        filters: Characters treated as separators.
        split: Separator string the text is cut on.
        outlen: Length of the returned list.

    Returns:
        A list of `outlen` strings: the known tokens in order, padded on the
        right with empty strings or truncated when there are too many.
    """
    separator_table = str.maketrans(dict.fromkeys(filters, split))
    normalized = input_text.lower().translate(separator_table)

    known_tokens = []
    for token in normalized.split(split):
        # Drop the empty pieces produced by consecutive separators and
        # any word the embedding model has no vector for.
        if token and token in word2vec.wv:
            known_tokens.append(token)

    # A negative count yields no padding; the slice then truncates.
    padding = [""] * (outlen - len(known_tokens))
    return (known_tokens + padding)[:outlen]


def vectorizator(x, outlen=50):
    """Tokenize every text in `x` into a fixed-length row of words.

    Args:
        x: Iterable of raw text strings.
        outlen: Number of tokens per row; forwarded to `split` so that the
            requested length is actually applied.

    Returns:
        A NumPy string array with one row per text and `outlen` columns
        (rows are padded with empty strings by `split`).
    """
    return np.array([split(elem, outlen=outlen) for elem in x], dtype=np.str_)


def embedding(x, maxlen=50):
    """Turn rows of tokens into a dense tensor of word vectors.

    Args:
        x: Sequence of token rows (for example the output of `vectorizator`).
        maxlen: Number of vector slots per row.

    Returns:
        A float32 array of shape (len(x), maxlen, word2vec.wv.vector_size).
        For each row, the vectors of the tokens found in `word2vec.wv` are
        packed from slot 0 onwards; unused slots stay zero. Known tokens
        beyond `maxlen` are dropped instead of overflowing the array.
    """
    wv = word2vec.wv
    result = np.zeros(shape=[len(x), maxlen, wv.vector_size], dtype=np.float32)

    for i, text in enumerate(x):

        index = 0
        for word in text:

            # Stop once the row is full so a long text cannot index past maxlen.
            if index >= maxlen:
                break

            # Unknown tokens are skipped, so known vectors stay contiguous.
            if word in wv:
                result[i, index, :] = wv[word]
                index += 1

    return result

In [57]:

# Number of LSTM units per direction.
unit = 64

# Each sample is a sequence of 50 token vectors with 200 components each.
# NOTE(review): 200 has to equal word2vec.wv.vector_size and 50 the token length
# produced by the text helpers — confirm.
inp = layers.Input(shape=(50, 200))

# The LSTM is wrapped in Bidirectional, hence the 2 * unit = 128 features in the summary below.
x = layers.Bidirectional(layers.LSTM(unit))(inp)
# Four sigmoid outputs, one per class.
out = layers.Dense(4, activation=activations.sigmoid)(x)

model = keras.Model(inputs=inp, outputs=out)

# Save weights only, and only when the monitored training loss improves.
# PATH already ends with "/", so the sub-path is appended without a leading
# slash, like the other path concatenations in this notebook.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=PATH + "tmp/checkpoint/Модерация/",
    save_weights_only=True,
    monitor='loss',
    mode='min',
    save_best_only=True)

model.compile(optimizer=optimizers.Adam(), loss=losses.binary_crossentropy, metrics=['accuracy'])

model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 50, 200)]         0         
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              135680    
 nal)                                                            
                                                                 
 dense_4 (Dense)             (None, 4)                 516       
                                                                 
Total params: 136,196
Trainable params: 136,196
Non-trainable params: 0
_________________________________________________________________


In [58]:
# model.load_weights(model_checkpoint_callback.filepath)

In [59]:
class TextBatchGenerator():
    """Stream (features, labels) batches from a CSV file read in chunks.

    The first CSV column is treated as the text and all remaining columns as
    the label values. The chunked reader is created once in `__init__`, so
    the batches can be consumed only once: calling the instance again
    continues from wherever the previous generator stopped.
    """

    def __init__(self, path_to_data, batch_size) -> None:
        """Open `path_to_data` as a chunked reader with `batch_size` rows per chunk."""
        super().__init__()
        self.chunks = pd.read_csv(path_to_data, delimiter=',', chunksize=batch_size)

    def __call__(self):
        """Yield one `(X_batch, y_batch)` pair per CSV chunk.

        `X_batch` is the text column passed through `vectorizator` and
        `embedding`; `y_batch` is a NumPy array of the remaining columns.
        """
        # Iterating the reader already advances it chunk by chunk. Calling
        # get_chunk() inside the loop as well would throw away every other
        # chunk and raise at the end of the file.
        for chunk in self.chunks:
            X_batch = chunk.iloc[:, 0]
            X_batch = vectorizator(X_batch)
            X_batch = embedding(X_batch)

            y_batch = chunk.iloc[:, 1:]
            y_batch = np.array(y_batch)

            yield X_batch, y_batch


In [60]:


# Build the batch generator once; this same generator object is also consumed by the cells below.
dataset = TextBatchGenerator(PATH + "toxic.csv", batch_size=50)()

# 10 epochs x 20 steps per epoch request 200 batches of up to 50 rows from the generator.
# NOTE(review): the generator is created once and is not restarted between epochs, so the
# CSV presumably has to contain enough rows for all requested steps — confirm.
history = model.fit(dataset, epochs=10, callbacks=[model_checkpoint_callback], steps_per_epoch=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Drain every item still left in `dataset`.
# Extending the list with an item appends the item's elements one by one, so for
# (X_batch, y_batch) pairs the list ends up flat: [X_0, y_0, X_1, y_1, ...].
X_test = []

for data in dataset:
    X_test.extend(data)

In [None]:
# Scores the model on the first two entries of X_test only (inputs at index 0, labels at index 1).
# NOTE(review): if X_test holds more batches than that, they are not evaluated here — confirm this is intended.
model.evaluate(X_test[0], X_test[1])

In [None]:
# Echo the values recorded during model.fit (the plotting cell reads its "loss" and "accuracy" entries).
history.history

In [None]:

# Create both panels in a single call. Creating a full-size Axes first and then adding
# plt.subplot(1, 2, k) panels on top leaves a stray empty Axes behind them on recent
# Matplotlib versions, which no longer auto-remove overlapped Axes.
fig, ax = plt.subplots(1, 2, figsize=(12, 3))

# Left panel: training loss per epoch.
ax[0].set_title("Функция потерь")
ax[0].set_xlabel("Эпоха обучения")
ax[0].set_ylabel("Значение")
ax[0].plot(history.history["loss"], label="Функция потерь")

# Right panel: training accuracy per epoch.
ax[1].set_title("Точность")
ax[1].set_xlabel("Эпоха обучения")
ax[1].set_ylabel("Значение")
ax[1].plot(history.history["accuracy"], label="Точность")


In [None]:
# Hand-written sample texts used to eyeball the model's predictions.
texts1 = np.array([
    "Просто говно параша какое-то Ужас!!! Верните деньги, сервис просто невыносим.",
    "Плохой постамат, сотрудники совершенно не спсобны выполнять свою работу. Расположение не удобное, сложно найти вход.",
    "Очень хороший сервис, приятные сотрудники. Всё быстро и качественно.",
    "Замечательный отель. Наивысшая оценка. Просто поражает как сотрудники справляются со своей работой",
])

# One reference value per sample text.
# NOTE(review): the meaning of these 0/1 values is not defined anywhere in this notebook — confirm.
labels1 = np.array([0, 1, 1, 1])

In [None]:

def predict(texts):
    """Run the global `model` on raw texts.

    The texts go through `vectorizator` and then `embedding` before being
    passed to `model.predict` with progress output disabled.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        Whatever `model.predict` returns for the embedded texts.
    """
    return model.predict(embedding(vectorizator(texts)), verbose=0)


# Score the sample texts and show, side by side, each text, the model outputs and the reference value.
# NOTE(review): column_stack merges text and numbers into one array, so the numeric
# columns are presumably displayed as strings — confirm.
predicted = predict(texts1)
pd.DataFrame(np.column_stack([texts1, predicted, labels1]))

In [None]:
# Same vectorize -> embed -> predict steps as predict(), written out inline;
# additionally keeps the embedded inputs in `vectors`.
vectors = embedding(vectorizator(texts1))
predicted = model.predict(vectors, verbose=0)
pd.DataFrame(np.column_stack([texts1, predicted, labels1]))

In [None]:
# Mean of all model outputs for the first half of the sample texts vs. the second half
# (no axis is given, so every output column is averaged together).
np.mean(predicted[:len(predicted) // 2]), np.mean(predicted[len(predicted) // 2:])