In [2]:
!pip install pymorphy2

Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[?25l[K     |██████                          | 10 kB 17.0 MB/s eta 0:00:01[K     |███████████▉                    | 20 kB 11.5 MB/s eta 0:00:01[K     |█████████████████▊              | 30 kB 8.3 MB/s eta 0:00:01[K     |███████████████████████▋        | 40 kB 4.0 MB/s eta 0:00:01[K     |█████████████████████████████▌  | 51 kB 4.4 MB/s eta 0:00:01[K     |████████████████████████████████| 55 kB 2.1 MB/s 
Collecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 4.4 MB/s 
[?25hInstalling collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import plot_roc_curve

import re
# import pymorphy2
from collections import Counter
from wordcloud import WordCloud
from tqdm import tqdm

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Read the labelled comments from the Excel sheet, naming the data columns explicitly.
real_data = pd.read_excel(
    '/content/comments2.xlsx',
    header=1,
    comment='#',
    index_col=0,
    names=['text', 'Toxic'],
)
real_data.head()

Unnamed: 0,text,Toxic
0,Путин мpazь продажная и россияне это знают,1
1,в принципе всё та же частушка для контингента ...,0
2,"в итоге парад без Герасимова, без авиации, без...",0
3,Сейчас начнется хрюканье от либерах и хохлов.,1
4,"Гибель - горе, война - плохо, но войну начать ...",1


In [6]:
# The notebook's import cell has `import pymorphy2` commented out, so bring the
# module into scope here; otherwise MorphAnalyzer() raises NameError on a fresh kernel.
import pymorphy2

# Matches runs of lowercase Cyrillic letters (including "ё").
TOKEN_RE = re.compile(r'[а-яё]+')
# NLTK's Russian stop-word list, used by remove_stopwords.
russian_stopwords = stopwords.words("russian")
# Shared morphological analyzer, used by lemmatizing.
lemmatizer = pymorphy2.MorphAnalyzer()

def tokenize_text(txt, min_lenght_token=2):
    """Lowercase the input and return its TOKEN_RE matches of sufficient length.

    The input is passed through ``str()`` first, so non-string values are
    tokenized by their string form. Only matches with at least
    ``min_lenght_token`` characters are kept, in order of appearance.
    """
    lowered = str(txt).lower()
    kept = []
    for candidate in TOKEN_RE.findall(lowered):
        if len(candidate) >= min_lenght_token:
            kept.append(candidate)
    return kept

def remove_stopwords(tokens):
    """Return a list of the tokens not present in ``russian_stopwords``, order preserved."""
    return [token for token in tokens if token not in russian_stopwords]

def lemmatizing(tokens):
    """Map each token to the normal form of the first parse returned by ``lemmatizer``."""
    lemmas = []
    for token in tokens:
        first_parse = lemmatizer.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

def text_cleaning(txt):
    """Tokenize, lemmatize and stop-word-filter ``txt``; return the result joined by single spaces."""
    # Stop words are filtered after lemmatization, so the check is made on lemmas.
    lemmas = lemmatizing(tokenize_text(txt))
    return ' '.join(remove_stopwords(lemmas))

In [7]:
# Register progress_apply on pandas objects.
tqdm.pandas()

# Build a new frame with the cleaned text so real_data keeps the raw comments.
cleaned_text = real_data['text'].progress_apply(text_cleaning)
df_token = real_data.assign(text=cleaned_text)
df_token

100%|██████████| 510/510 [00:01<00:00, 315.66it/s]


Unnamed: 0,text,Toxic
0,путин продажный россиянин это знать,1
1,принцип всё частушка контингент всё плохой всё...,0
2,итог парад герасимов авиация объявление мобили...,0
3,начаться хрюканье либер хохлов,1
4,гибель гора война плохо война начать посылать ...,1
...,...,...
857,отлично всё против янки герой,0
858,сколько ещё кривой сляпать продукт толкать пом...,1
859,найти ещё фото фронтовик кстати род донбасс па...,0
860,ещё семейный пара додуматься разместить,0


In [8]:
df = df_token.copy()
# Rows whose cleaned text is the empty string.
empty = df.loc[df['text'].eq('')]
print('Number of empty texts: ', len(empty))
df = df.drop(index=empty.index)

Number of empty texts:  2


In [9]:
# Count rows that repeat an earlier row in every column.
duplicate_rows = df.duplicated().sum()
print('Number of duplicates:', duplicate_rows)

Number of duplicates: 1


In [10]:
# Keep the first occurrence of each fully identical row.
df = df.drop_duplicates(keep='first')

In [11]:
# Rows whose text occurs again later in the frame.
repeated_later = df['text'].duplicated(keep='last')
comment_duplicated = df.loc[repeated_later]

# Drop repeated comment texts, keeping the first occurrence of each.
df = df.drop_duplicates(subset=['text'], keep='first')

remaining = df.duplicated(subset='text').sum()
print('Number of duplicates:', remaining)

Number of duplicates: 0


In [12]:
# Independent copy of the cleaned, de-duplicated frame; the models below read from it.
text_tomod = df.copy(deep=True)
text_tomod.head(n=5)

Unnamed: 0,text,Toxic
0,путин продажный россиянин это знать,1
1,принцип всё частушка контингент всё плохой всё...,0
2,итог парад герасимов авиация объявление мобили...,0
3,начаться хрюканье либер хохлов,1
4,гибель гора война плохо война начать посылать ...,1


In [13]:
# Replace the index with a fresh 0..n-1 range; the old labels are discarded.
text_tomod = text_tomod.reset_index(drop=True)

In [27]:
# Write the frame to Drive as CSV; the index is not written.
text_tomod.to_csv(
    '/content/drive/MyDrive/test_data.csv',
    index=False,
)

In [14]:
!pip3 install tensorflow_text>=2.0.0rc0

In [15]:
import tensorflow_hub as hub
import tensorflow_text
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout, Concatenate
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.callbacks import ModelCheckpoint

# USE-CNN

In [40]:
# Multilingual Universal Sentence Encoder from TF Hub, wrapped as a trainable Keras layer.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-multilingual/3"
embed = hub.KerasLayer(
    module_url,
    name='MUSE_embedding',
    trainable=True,
)

In [41]:
def build_model(embed):
  """Build and compile a binary classifier on top of the given embedding layer.

  The model takes scalar string inputs, applies ``embed`` and ends in a single
  sigmoid unit. It is compiled with Adam(2e-5), binary cross-entropy loss and
  the accuracy metric.
  """
  classifier = Sequential()
  classifier.add(Input(shape=[], dtype=tf.string))
  classifier.add(embed)
  classifier.add(Dense(1, activation='sigmoid'))

  classifier.compile(
      optimizer=Adam(2e-5),
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )
  return classifier

In [42]:
# Instantiate the classifier on top of the USE embedding layer and print its layer table.
model = build_model(embed)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 MUSE_embedding (KerasLayer)  (None, 512)              68927232  
                                                                 
 dense (Dense)               (None, 1)                 513       
                                                                 
Total params: 68,927,745
Trainable params: 68,927,745
Non-trainable params: 0
_________________________________________________________________


In [43]:
# Restore previously saved weights from Drive into the model built above.
model.load_weights('/content/drive/MyDrive/models_best/model.h5')

In [141]:
# Work on a copy so the prediction column is not written into text_tomod itself
# (plain assignment would only alias the same DataFrame).
test = text_tomod.copy()
test_pred = model.predict(test['text'].values)
# The model ends in a single sigmoid unit, so predict yields one score per row;
# round to a 0/1 label and flatten the (n, 1) array to a 1-D column.
test['pred_use'] = test_pred.round().astype(int).ravel()

In [142]:
# Per-class precision/recall/F1 of the 'pred_use' labels against the 'Toxic' column.
print(classification_report(test['Toxic'].values, test['pred_use'].values))

              precision    recall  f1-score   support

           0       0.81      0.73      0.77       298
           1       0.67      0.75      0.71       209

    accuracy                           0.74       507
   macro avg       0.74      0.74      0.74       507
weighted avg       0.75      0.74      0.74       507



# USE-T

In [16]:
# "Large" multilingual Universal Sentence Encoder from TF Hub, wrapped as a trainable Keras layer.
module_url_trans = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
embed_trans = hub.KerasLayer(
    module_url_trans,
    name='MUSE_embedding',
    trainable=True,
)

In [17]:
def build_model_tr(embed):
  """Return a compiled binary classifier: string input -> ``embed`` -> one sigmoid unit.

  Compiled with Adam(2e-5), binary cross-entropy loss and the accuracy metric.
  """
  stack = [
      Input(shape=[], dtype=tf.string),
      embed,
      Dense(1, activation='sigmoid'),
  ]
  net = Sequential(stack)
  net.compile(
      Adam(2e-5),
      loss='binary_crossentropy',
      metrics=['accuracy'],
  )
  return net

In [18]:
# Instantiate the classifier on top of the large USE embedding layer and print its layer table.
model_trans = build_model_tr(embed_trans)
model_trans.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 MUSE_embedding (KerasLayer)  (None, 512)              85213184  
                                                                 
 dense (Dense)               (None, 1)                 513       
                                                                 
Total params: 85,213,697
Trainable params: 85,213,697
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Copy rather than alias, so columns added to test_tr do not appear in text_tomod.
test_tr = text_tomod.copy()

In [21]:
# Restore previously saved weights from Drive, then score every cleaned comment.
model_trans.load_weights('/content/drive/MyDrive/models_best/model_trans.h5')
test_pred_trans = model_trans.predict(test_tr['text'].values)

In [22]:
# Round the predicted scores to integer 0/1 labels and store them as a new column.
test_tr['pred_trans'] = test_pred_trans.round().astype(int)

In [23]:
# Show ten rows (positions 70-79) to compare labels and predictions.
test_tr.iloc[70:80]

Unnamed: 0,text,Toxic,pred_trans
70,самолёт судный день смочь поднять небо москва ...,1,1
71,кстати понять парад делать шойгу независимый а...,1,1
72,поздравлять мирный небо пение птица аромат цве...,0,0
73,слава советский воин позор русский солдат опук...,1,1
74,кринж повод нато навалить программа минимум вы...,0,0
75,всё помнить это путин главный патриот всий рус...,0,1
76,ципсо обосраться мобилизация рофлан бало,1,1
77,право,0,0
78,хороший делать новый успех год обсасывать побе...,1,1
79,либерахи лентач ваш мобилизация фейк который в...,1,1


In [24]:
# Per-class precision/recall/F1 of the 'pred_trans' labels against the 'Toxic' column.
print(classification_report(test_tr['Toxic'].values, test_tr['pred_trans'].values))

              precision    recall  f1-score   support

           0       0.82      0.74      0.78       298
           1       0.67      0.76      0.71       209

    accuracy                           0.75       507
   macro avg       0.74      0.75      0.75       507
weighted avg       0.76      0.75      0.75       507



# BERT

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.1-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 25.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 58.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.6.0 p

In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
from transformers import AdamW, BertForSequenceClassification
import io
from sklearn.metrics import accuracy_score

In [5]:
# Read back the CSV written from text_tomod earlier in this notebook.
test_data = pd.read_csv('/content/drive/MyDrive/test_data.csv')

In [6]:
# Load the tokenizer that ships with the multilingual BERT checkpoint.
# NOTE(review): a *cased* checkpoint is combined with do_lower_case=True; this presumably
# mirrors the setting used when the saved weights were trained — confirm before changing it.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

# model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [7]:
# Prefer CUDA when torch can see a GPU; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')

print('Using GPU' if device != torch.device('cpu') else 'Using CPU')

Using GPU


In [None]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
# Tokenize every sentence and map its tokens to their word IDs.
input_ids = []
attention_masks = []

for sent in test_data['text'].values:
    encoded_dict = tokenizer.encode_plus(
        sent,                          # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=200,                # Pad & truncate every sentence to 200 tokens.
        padding='max_length',          # Replaces the deprecated pad_to_max_length=True.
        truncation=True,
        return_attention_mask=True,    # Construct attention masks.
        return_tensors='pt',           # Return PyTorch tensors.
    )

    # Collect the encoded sentence and its attention mask
    # (the mask simply differentiates padding from non-padding).
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Concatenate the per-sentence tensors along dim 0 into one tensor each.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# Build the label tensor from the underlying array rather than the Series object.
labels = torch.tensor(test_data['Toxic'].values)

# Print sentence 0, now as a list of IDs.
print('Original: ', test_data['text'][0])
print('Token IDs:', input_ids[0])

In [9]:
# Bundle ids, masks and labels so each dataset item is an (input_ids, attention_mask, label) triple.
test_bert = TensorDataset(input_ids, attention_masks, labels)

In [None]:
# Instantiate multilingual BERT with a 2-label sequence-classification head;
# attention weights and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)
# Move the model to the device chosen by the device-selection cell instead of
# hard-coding CUDA, so this cell also runs on a CPU-only runtime.
model.to(device)

In [11]:
# test_bert

# Batches are drawn in dataset order, so predictions line up with the rows of test_bert.
batch_size = 32

prediction_sampler = SequentialSampler(test_bert)
prediction_dataloader = DataLoader(
    test_bert,
    batch_size=batch_size,
    sampler=prediction_sampler,
)

In [12]:
PATH = '/content/drive/MyDrive/models_best/mbert'

# Restore the saved weights. map_location remaps the checkpoint's tensors onto
# `device`, so a checkpoint saved on a GPU also loads on a CPU-only runtime.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH, map_location=device))
# Put model in evaluation mode
model.eval()

# Tracking variables: one array of logits / labels per batch.
predictions , true_labels = [], []

# Predict
for batch in prediction_dataloader:
  # Move every tensor of the batch to the selected device
  batch = tuple(t.to(device) for t in batch)

  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch

  # Telling the model not to compute or store gradients, saving memory and
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None,
                      attention_mask=b_input_mask)

  # No labels are passed to the model, so the first output holds the logits.
  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

    DONE.


In [13]:
# Predicted class for each sentence: index of the largest logit in each row of a batch.
predict_lab = [
    np.argmax(batch_logits, axis=1).flatten()
    for batch_logits in predictions
]

In [14]:
# Join the per-batch arrays into one flat array each, in dataset order.
predict_lab = np.concatenate(predict_lab)
true_labels = np.concatenate(true_labels)

In [15]:
# Per-class precision/recall/F1 of the BERT predictions against the true labels.
print(classification_report(true_labels, predict_lab))

              precision    recall  f1-score   support

           0       0.74      0.82      0.78       298
           1       0.69      0.58      0.63       209

    accuracy                           0.72       507
   macro avg       0.72      0.70      0.70       507
weighted avg       0.72      0.72      0.72       507

