In [None]:
!pip install transformers

In [None]:
# Mount Google Drive at /content/drive (Colab only). Later cells read the chat
# export and the model checkpoint from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from bs4 import BeautifulSoup
 
# Read the exported chat HTML. The encoding is passed explicitly so the result
# does not depend on the platform's default locale.
# NOTE(review): assumes the export file is UTF-8 encoded — confirm against the file.
with open("/content/drive/MyDrive/Diplom/ChatExport_2021-05-02/messages.html", "r", encoding="utf-8") as f:
    contents = f.read()

# Parsing works on the in-memory string, so it is done after the file is closed.
soup = BeautifulSoup(contents, 'lxml')
# Keep only the text content of the document; markup is discarded.
texts = soup.text

In [None]:
import numpy as np

# Lines of the export must be strictly longer than this many characters to be kept.
MIN_TEXT_LEN = 100
# Boilerplate line to exclude (presumably emitted by the exporter in place of
# media that was not downloaded — verify against the export).
mes = "Not included, change data exporting settings to download."

text_list = texts.split("\n")
result_text = []
for text in text_list:
    if len(text) <= MIN_TEXT_LEN or text == mes:
        continue
    # Cut the message at the first URL. str.find returns -1 when "http" is
    # absent, and slicing with -1 would silently drop the last character, so
    # the text is only truncated when a URL was actually found.
    ind = text.find("http")
    if ind != -1:
        text = text[:ind]
    result_text.append(text)
# Downstream code receives the messages as a 1-D array of strings.
result_text = np.array(result_text)

In [None]:
# Technique names. preprocessing() pairs every message with each of these
# entries, in this order, and stores the entry in the "technique" column.
labels = [
    "Reductio ad hitlerum",
    "Whataboutism",
    "Presenting Irrelevant Data (Red Herring)",
    "Doubt",
    "Slogans",
    "Appeal to fear/prejudice",
    "Obfuscation, Intentional vagueness, Confusion",
    "Misrepresentation of Someone's Position (Straw Man)",
    "Glittering generalities (Virtue)",
    "Appeal to authority",
    "Repetition",
    "Bandwagon",
    "Causal Oversimplification",
    "Name calling/Labeling",
    "Thought-terminating cliché",
    "Flag-waving",
    "Exaggeration/Minimisation",
    "Smears",
    "Loaded Language",
    "Black-and-white Fallacy/Dictatorship",
]

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer of the DeepPavlov/rubert-base-cased checkpoint. It is used
# as a module-level global by preprocessing() and check_text().
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

In [None]:
def preprocessing(texts, max_len=300):
    """Build fixed-length model inputs for every (text, technique label) pair.

    Each text is tokenized (truncated to ``max_len`` tokens) and concatenated
    with the tokens of every entry of the module-level ``labels`` list. The
    first token id of the label encoding (the leading special token added by
    the tokenizer) is dropped before concatenation. A pair whose combined
    length exceeds ``max_len`` is skipped; every kept pair is right-padded
    with zeros to exactly ``max_len`` entries.

    Relies on the module-level ``tokenizer`` and ``labels``.

    Args:
        texts: Indexable collection of strings (list or 1-D array).
        max_len: Length of every returned sequence, and the truncation limit
            used when tokenizing.

    Returns:
        A 5-tuple of parallel lists with one entry per kept pair:
        ``(input_ids, attention_mask, token_type_ids, technique, text)``.
        Token type ids are 0 for the text segment and the padding, and 1 for
        the label segment.
    """
    # The label encodings do not depend on the text, so they are computed once
    # instead of once per text. Offset mappings were requested before but
    # never used, so they are no longer requested.
    encoded_labels = [
        (
            label,
            list(tokenizer.encode_plus(label, max_length=max_len, truncation=True).input_ids[1:]),
        )
        for label in labels
    ]

    col_input_ids = []
    col_attention_mask = []
    col_token_type_ids = []
    technique = []
    res_texts = []

    # len() instead of .shape[0] so plain lists are accepted as well as arrays.
    for i in range(len(texts)):
        text = texts[i]
        text_ids = list(
            tokenizer.encode_plus(text, max_length=max_len, truncation=True).input_ids
        )

        for label, label_ids in encoded_labels:
            input_ids = text_ids + label_ids
            pad_count = max_len - len(input_ids)
            if pad_count < 0:
                # Only this pair is too long. Labels differ in length, so a
                # later, shorter label may still fit; stopping the loop here
                # would drop it without reason.
                continue

            padding = [0] * pad_count
            technique.append(label)
            res_texts.append(text)
            col_input_ids.append(input_ids + padding)
            col_attention_mask.append([1] * len(input_ids) + padding)
            col_token_type_ids.append(
                [0] * len(text_ids) + [1] * len(label_ids) + padding
            )

    return col_input_ids, col_attention_mask, col_token_type_ids, technique, res_texts

In [None]:
import pandas as pd

# Expand every message into (message, technique) pairs. The expanded texts get
# their own name instead of overwriting `result_text`, so re-running this cell
# does not feed the cell's own output back into preprocessing().
col_input_ids, col_attention_mask, col_token_type_ids, technique, pair_texts = preprocessing(result_text, max_len=300)

# One row per kept pair; the three col_* columns each hold a list of ints per row.
data_pd = pd.DataFrame({
    "text": pair_texts,
    "technique": technique,
    "col_input_ids": col_input_ids,
    "col_attention_mask": col_attention_mask,
    "col_token_type_ids": col_token_type_ids,
})


In [None]:
def check_text(model, data, i, dev, line):
    """Score row ``i`` of ``data`` with ``model`` and print the flagged tokens.

    The row's input ids, attention mask and token type ids are sent to ``dev``
    as a batch of one. Positions whose score is strictly greater than ``line``
    are selected; if there is at least one, the row's technique, its text and
    the decoded selected tokens are printed. Nothing is printed otherwise.

    Relies on the module-level ``tokenizer``, ``torch`` and ``np``.

    Args:
        model: Callable taking ``(ids, attention_mask, token_type_ids)``; its
            ``eval()`` method is called before scoring.
            NOTE(review): assumed to return one score per token with a leading
            batch dimension — confirm against the model in use.
        data: Table indexed as ``data[column][i]`` with the columns
            "col_input_ids", "col_attention_mask", "col_token_type_ids",
            "technique" and "text".
        i: Row key.
        dev: Device the input tensors are moved to.
        line: Score threshold.
    """
    def _as_batch(column):
        # The extra outer list adds the batch dimension of size one.
        return torch.tensor([list(data[column][i])]).to(dev)

    ids = _as_batch("col_input_ids")
    attention_mask = _as_batch("col_attention_mask")
    type_ids = _as_batch("col_token_type_ids")

    model.eval()
    with torch.no_grad():
        output = model(ids, attention_mask, type_ids)

    # Boolean mask over the positions of the single batch item.
    ans_mask = (torch.squeeze(output, dim=1)[0] > line).cpu()
    if not bool(ans_mask.any()):
        return

    print("technique:", data["technique"][i])
    print("---")
    print("text:", data["text"][i])
    print("---")
    # Keep only the token ids at the selected positions and decode them.
    ans_seq_tok = np.array(data["col_input_ids"][i])[ans_mask.numpy()]
    print("ans:", tokenizer.decode(ans_seq_tok))

In [None]:
import torch.nn as nn
import transformers

class Model(transformers.BertPreTrainedModel):
    """BERT encoder followed by a per-token sigmoid scoring head.

    Every token's final hidden state is projected to a single value and passed
    through a sigmoid, giving one score in (0, 1) per input position.

    Args:
        config: Configuration passed to ``BertPreTrainedModel.__init__``.
        PATH: Name or path given to ``BertModel.from_pretrained`` to load the
            encoder weights.
    """

    def __init__(self, config, PATH):
        super().__init__(config)
        self.bert = transformers.BertModel.from_pretrained(PATH)
        # Size the head from the encoder that was actually loaded rather than
        # hard-coding 768, so encoders with another hidden size work too.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.flatten = nn.Flatten()
        self.sigm = nn.Sigmoid()

    def forward(self, ids, mask, token_type_ids):
        """Return per-token scores.

        Args:
            ids: Token ids.
            mask: Attention mask passed to the encoder.
            token_type_ids: Segment ids passed to the encoder.

        Returns:
            Sigmoid scores with the trailing size-1 dimension flattened away,
            i.e. one row of per-position scores per batch item.
        """
        # Element 0 of the encoder output is the sequence of final hidden states.
        embedding = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )[0]
        logits = self.linear(embedding)
        # Flatten everything after the batch dimension: (batch, seq, 1) -> (batch, seq).
        logits = self.flatten(logits)
        result = self.sigm(logits)
        return result

In [None]:
PATH = "DeepPavlov/rubert-base-cased"

# Only the configuration is needed here. Model.__init__ loads the pretrained
# encoder from PATH itself, so loading a full BertModel just to read its
# config would fetch and hold the same weights twice.
config = transformers.BertConfig.from_pretrained(PATH)
model = Model(config, PATH)

In [None]:
import torch

PATH = "/content/drive/MyDrive/Diplom/model2.pth"
# map_location="cpu" lets the checkpoint load on CPU-only runtimes even if it
# was saved from a GPU; the model is moved to the selected device in a later cell.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
model.load_state_dict(torch.load(PATH, map_location="cpu"))

In [None]:
import torch

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"

# Show the selected device as the cell output.
dev

In [None]:
# Move the model's parameters to the selected device; check_text() sends its
# input tensors to the same `dev`.
model.to(dev)

In [None]:
# Display the table of (text, technique) pairs and their encoded model inputs.
data_pd

In [None]:
# Number of leading rows to inspect and the score threshold a token must
# exceed to be reported.
N_PREVIEW = 10
THRESHOLD = 0.5

# Clamp to the table size so a table with fewer rows does not raise on a
# missing row index.
for i in range(min(N_PREVIEW, len(data_pd))):
    check_text(model, data_pd, i, dev, THRESHOLD)