In [None]:
import io
import re
import sys
import math
import random
import string
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
import logging
import multiprocessing
import gensim

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import activations
from tensorflow.keras import backend

from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import Tokenizer

from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split


In [None]:
# Base directory that every other path in this notebook is appended to (word2vec
# model, checkpoints, CSV data sets).
# NOTE(review): hard-coded absolute path, presumably a Google Drive mounted in
# Colab -- confirm before running elsewhere.
PATH = "/content/drive/MyDrive/Colab Notebooks/ХакМэрМосквы2023/"

In [None]:
# Load a previously saved gensim Word2Vec model. Its `wv` attribute is used below
# both as the vocabulary filter (in split) and as the source of word vectors
# (in embedding).
word2vec = Word2Vec.load(PATH + "Models/w2v/tweets_model.w2v")


In [None]:
def split(input_text, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' ', outlen=50):
    """Tokenise a text into exactly ``outlen`` in-vocabulary words.

    The text is lower-cased, every character listed in ``filters`` is replaced
    by the ``split`` separator, and the result is cut on that separator. Only
    non-empty tokens that are present in ``word2vec.wv`` are kept.

    Args:
        input_text: The raw text to tokenise.
        filters: Characters that are treated as separators.
        split: Separator string used both as replacement and as split point.
        outlen: Exact length of the returned list.

    Returns:
        A list of ``outlen`` strings: the kept tokens in their original order,
        right-padded with empty strings, or truncated when there are too many.
    """
    # One translation table turns every filtered character into the separator.
    separator_table = str.maketrans(dict.fromkeys(filters, split))
    normalized = input_text.lower().translate(separator_table)

    known_words = []
    for token in normalized.split(split):
        # Consecutive separators produce empty tokens; skip those and any
        # word the embedding model does not know.
        if token and token in word2vec.wv:
            known_words.append(token)

    padding = [""] * (outlen - len(known_words))
    return (known_words + padding)[:outlen]


def vectorizator(x, outlen=50):
    """Tokenise a collection of texts into a fixed-width array of words.

    Args:
        x: Iterable of raw text strings.
        outlen: Number of word slots per text; forwarded to ``split`` so that
            every row is padded or truncated to this length.

    Returns:
        A ``numpy`` string array with one row per text and ``outlen`` columns.
    """
    # `outlen` must be passed on; otherwise split() falls back to its own
    # default and the caller's value is silently ignored.
    return np.array([split(elem, outlen=outlen) for elem in x], dtype=np.str_)


def embedding(x, maxlen=50):
    """Convert rows of words into a dense tensor of word2vec vectors.

    Words found in ``word2vec.wv`` are written to consecutive positions of the
    output row; unknown words (including the empty padding string, assuming it
    is not in the vocabulary) are skipped, so the remaining positions stay zero.

    Args:
        x: Sequence of rows, each row an iterable of word strings (for example
            the array returned by ``vectorizator``).
        maxlen: Number of positions per row in the output. Known words beyond
            this limit are dropped.

    Returns:
        A ``float32`` array of shape ``(len(x), maxlen, word2vec.wv.vector_size)``.
    """
    vectors = word2vec.wv
    result = np.zeros(shape=[len(x), maxlen, vectors.vector_size], dtype=np.float32)

    for i, text in enumerate(x):
        index = 0
        for word in text:
            if index >= maxlen:
                # The row is full; writing further would index past the end.
                break
            if word in vectors:
                result[i, index, :] = vectors[word]
                index += 1

    return result

In [None]:
class TransformerBlock(layers.Layer):
    """Single transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is wrapped as dropout -> residual add -> layer
    normalisation.

    Args:
        embed_dim: Size of the last input axis; also used as the attention
            ``key_dim`` and as the output width of the feed-forward net.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward net.
        rate: Dropout rate applied after each sub-layer.
        **kwords: Forwarded to ``layers.Layer``.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwords):
        super().__init__(**kwords)

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)

        # Position-wise feed-forward net: expand to ff_dim, project back to embed_dim.
        hidden = layers.Dense(ff_dim, activation="relu")
        projection = layers.Dense(embed_dim)
        self.ffn = keras.Sequential([hidden, projection])

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply the block to ``inputs``; ``training`` toggles the dropout layers."""
        # Self-attention: the same tensor is used as query and as value.
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)

        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)


class TokenAndPositionEmbedding(layers.Layer):
    """Add a learned positional embedding to an already-embedded sequence.

    Despite the name, this layer holds no token embedding: ``call`` only adds
    the position vectors to its input.

    Args:
        maxlen: Number of positions; ``call`` always produces exactly this many
            position vectors, so the input's sequence axis must be compatible.
        embed_dim: Size of each position vector.
        **kwords: Forwarded to ``layers.Layer``.
    """

    def __init__(self, maxlen, embed_dim, **kwords):
        super().__init__(**kwords)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
        self.maxlen = maxlen

    def call(self, x):
        """Return ``x`` plus the embedding of positions ``0 .. maxlen - 1``."""
        position_ids = tf.range(0, self.maxlen, 1)
        return x + self.pos_emb(position_ids)



In [None]:

# NOTE(review): `unit` is not referenced anywhere else in the visible notebook.
unit = 256

# The model consumes already-embedded texts: 50 positions x 200 features each
# (presumably 200 == word2vec.wv.vector_size -- TODO confirm).
# NOTE(review): the name `input` shadows the Python builtin of the same name.
input = layers.Input(shape=(50, 200))

# Positional embedding (maxlen=50, embed_dim=200) followed by one transformer
# block with embed_dim=200, num_heads=2, ff_dim=2.
# NOTE(review): ff_dim=2 is a very narrow feed-forward layer -- confirm it is
# intentional; changing it alters the weight shapes stored in the checkpoint.
x = TokenAndPositionEmbedding(50, 200)(input)
x = TransformerBlock(200, 2, 2)(x)
# Collapse the sequence axis with a max, then map to a single sigmoid score.
x = layers.GlobalMaxPooling1D()(x)
output = layers.Dense(1, activation=activations.sigmoid)(x)

model = keras.Model(inputs=input, outputs=output)


In [None]:

# Binary cross-entropy matches the single sigmoid output; Adam uses its default
# settings. summary() prints the layer table shown below.
model.compile(optimizer=optimizers.Adam(), loss=losses.binary_crossentropy, metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 50, 200)]         0         
                                                                 
 token_and_position_embeddin  (None, 50, 200)          10000     
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 50, 200)          323202    
 merBlock)                                                       
                                                                 
 global_max_pooling1d (Globa  (None, 200)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense_2 (Dense)             (None, 1)                 201   

In [None]:


# Checkpoint callback: stores weights only, and only when the monitored training
# `loss` reaches a new minimum.
# NOTE(review): PATH already ends with "/", so the resulting path contains "//".
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=PATH + "/tmp/checkpoint/Тон/",
    save_weights_only=True,
    monitor='loss',
    mode='min',
    save_best_only=True)



In [None]:
# Restore weights from the same location the checkpoint callback writes to.
# NOTE(review): this cell runs before the training cell, so it presumably relies
# on a checkpoint saved by an earlier session -- confirm one exists.
model.load_weights(model_checkpoint_callback.filepath)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7f7444ca87c0>

In [None]:
class TextBatchGenerator():
    """Stream a CSV file as (features, targets) batches for training.

    The file is read lazily in chunks of ``batch_size`` rows, so the whole data
    set never has to fit in memory. The underlying reader is an iterator: one
    instance yields each chunk once.

    Args:
        path_to_data: Path (or file-like object) passed to ``pandas.read_csv``.
        batch_size: Number of CSV rows per batch.
    """

    def __init__(self, path_to_data, batch_size) -> None:
        super().__init__()
        self.chunks = pd.read_csv(path_to_data, delimiter=',', chunksize=batch_size)

    def __call__(self, X_it, y_it):
        """Yield ``(X_batch, y_batch)`` for every chunk of the file.

        Args:
            X_it: Positional index of the text column.
            y_it: Positional index of the label column.

        Yields:
            ``X_batch``: result of ``embedding(vectorizator(texts))`` for the chunk.
            ``y_batch``: labels mapped through ``(y - 1) * 0.25`` (presumably
            1-5 ratings scaled into [0, 1] -- TODO confirm against the data).
        """
        # Iterating the reader already advances it; calling get_chunk() as well
        # would discard every other chunk and raise StopIteration at the end.
        for chunk in self.chunks:
            X_batch = chunk.iloc[:, X_it]
            X_batch = vectorizator(X_batch)
            X_batch = embedding(X_batch)

            y_batch = chunk.iloc[:, y_it]
            y_batch = np.array(y_batch - 1) * 0.25

            yield X_batch, y_batch


In [None]:

# Number of passes over the CSV. A new generator is built for every pass,
# presumably because the chunked reader can only be consumed once.
epochs = 30

for _ in range(epochs):
    # NOTE(review): this call does not match TextBatchGenerator as defined in
    # this notebook -- __init__ requires `batch_size` and __call__ requires both
    # `X_it` and `y_it`. The intended batch size and label column are not
    # visible here, so the call is left as-is; it must be fixed before this
    # cell can run.
    dataset = TextBatchGenerator(PATH + "sentiment_up.csv")(0)

    # One Keras epoch of 485 steps per pass; the checkpoint callback saves the
    # weights whenever the training loss improves.
    model.fit(dataset, epochs=1, callbacks=[model_checkpoint_callback], steps_per_epoch=485)

ValueError: ignored

In [None]:
# Read the reviews, drop rows with missing values and keep the first 59*2 = 118.
# NOTE(review): the next cell reassigns `texts1` and `labels1` with hand-written
# samples, so the values loaded here are discarded when the notebook runs top
# to bottom.
data1 = pd.read_csv(PATH + "reviews.csv").dropna()[:59*2]

texts1 = data1["text"].values
labels1 = data1["mark"].values


In [None]:
# Hand-written smoke-test samples: two negative reviews followed by two
# positive ones, with the matching expected labels (0 = negative, 1 = positive).
texts1 = np.array([
    "Просто говно параша какое-то Ужас!!! Верните деньги, сервис просто невыносим.",
    "Плохой постамат, сотрудники совершенно не спсобны выполнять свою работу. Расположение не удобное, сложно найти вход.",
    "Очень хороший сервис, приятные сотрудники. Всё быстро и качественно.",
    "Замечательный отель. Наивысшая оценка. Просто поражает как сотрудники справляются со своей работой",
])
labels1 = np.array([0, 0, 1, 1])

In [None]:

def predict(texts):
    """Score raw texts with the global ``model``.

    The texts are tokenised with ``vectorizator``, turned into word vectors
    with ``embedding`` and passed to ``model.predict`` with progress output
    disabled.

    Args:
        texts: Iterable of raw text strings.

    Returns:
        Whatever ``model.predict`` returns for the embedded batch.
    """
    return model.predict(embedding(vectorizator(texts)), verbose=0)


# Score the sample texts and show text, predicted score and expected label side
# by side (column_stack puts the three arrays into one table).
predicted = predict(texts1)
pd.DataFrame(np.column_stack([texts1, predicted, labels1]))

In [None]:
# Mean predicted score of the first half of the samples vs. the second half.
# With the hand-written texts1 (labels 0, 0, 1, 1) that is negative vs. positive.
np.mean(predicted[:len(predicted) // 2]), np.mean(predicted[len(predicted) // 2:])