In [1]:
from allennlp.commands.elmo import ElmoEmbedder
from allennlp.modules.elmo import Elmo, batch_to_ids
import gensim
from gensim import models
from gensim.models import Word2Vec, KeyedVectors
import numpy as np
import os
import pandas as pd
from razdel import tokenize, sentenize
import re
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
import string
import torch
from torch import nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm

unable to import 'smart_open.gcs', disabling that module


In [15]:
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           u"\U0001F923"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' ', text)

def clean_text(text):
    
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    
    text = text.lower()
    text = re.sub('\[.*?\]', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', ' ', text)
    text = re.sub('<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\d', ' ', text)
    text = re.sub(' {2,}', ' ', text)
    return text

def get_tokens(texts):
    
    '''Divide text into sentences and tokenize'''
    
    texts_tokenized = []
    for text in texts:
        for sentence in sentenize(text):
            texts_tokenized.append([token.text.lower() for token in tokenize(sentence.text) if token.text not in string.punctuation])
    return texts_tokenized

def get_sentence_embedding(model, phrase, model_name='fasttext', device='cpu'):
    
    '''Get sentence embeddings using Word2Vec, Fasttext or ELMO'''
    
    assert model_name in ['w2v', 'fasttext','elmo']
    
    if model_name == 'w2v':
        embeddings = np.array([model.get_vector(word.text.lower()) if word.text.lower() in model.vocab else np.zeros((model.vector_size,))
                               for word in tokenize(phrase)])
        return np.mean(embeddings, axis=0)
    
    if model_name == 'fasttext':
        embeddings = np.array([model.get_vector(word.text.lower()) for word in tokenize(phrase)])  
        return np.mean(embeddings, axis=0)
    
    if model_name == 'elmo':
        phrase_tokenized = [word.text.lower() for word in tokenize(phrase)]
#         embeddings = model_elmo.embed_sentence(phrase_tokenized)[2][0].reshape(1, -1)
# #        embeddings = np.mean(embeddings, axis=1)
        inds = batch_to_ids([phrase_tokenized])
        inds = inds.to(device)
        embeddings = model(inds)['elmo_representations'][0].detach().cpu().numpy()
        embeddings = np.mean(embeddings, axis=1)[0]
        return embeddings


def get_text_embeddings(texts, device='cpu', batch_size=4):
    
    '''Get embeddings using ELMO'''
    
    loader = DataLoader(list(texts), batch_size=batch_size, shuffle=False)
    
    embeddings = []
    
    for batch in tqdm(iter(loader)):
        batch_tokenized = get_tokens(batch)
        inds = batch_to_ids(batch_tokenized).to(device)
        batch_embeddings = elmo(inds)['elmo_representations'][0].detach().cpu().numpy()
        batch_embeddings = np.mean(batch_embeddings, axis=1) 
        embeddings.append(batch_embeddings)
        
    return np.vstack(embeddings)

### Choose parameters, load and preprocess dataset

In [3]:
# Parameters
dir_data = "./"
col_text = "comment" # name of the column with texts
col_target = "toxic" #target name
test_size = 0.3 # share of Test sample
random_state = 2021
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 

In [25]:
# OK toxic classification challenge dataset 
with open(os.path.join(dir_data, "okcup_train.txt"), encoding='utf8') as f:
    data = f.read().split("\n")
    
texts = []
classes = []
for x in data:
    row = x.split("\t")
    classes.append(int(" ".join(row[1:-1])!='__label__NORMAL')) # At least one class is toxic
    texts.append(row[-1])
    
df_part1 = pd.DataFrame({'comment': texts[:-1], 'toxic':classes[:-1]})

# https://www.kaggle.com/blackmoon/russian-language-toxic-comments
df_part2 = pd.read_csv(os.path.join(dir_data, "toxic_labeled.csv"))
df_part2['toxic'] = df_part2['toxic'].astype('int64')

df = pd.concat([df_part1, df_part2]).reset_index(drop=True)

# Preprocess dataset
df[col_text] = df[col_text].apply(lambda x: remove_emoji(x))
df[col_text] = df[col_text].apply(lambda x: clean_text(x))

In [6]:
df_train, df_test = train_test_split(
                                    df,
                                    test_size=test_size,
                                    random_state=random_state,
                                    stratify=df[col_target]
                                    )

In [7]:
texts_tokenized_train = get_tokens(df_train[col_text])

assert len(texts_tokenized_train[0]) > 0
assert texts_tokenized_train[0][0].islower()

### Based on W2V

In [8]:
model_w2v = Word2Vec(sentences=texts_tokenized_train, size=128, window=5, min_count=1, workers=6).wv

In [9]:
y_train = df_train[col_target].to_numpy()
X_train = []
for phrase in tqdm(df_train[col_text]):
    X_train.append(get_sentence_embedding(model_w2v, phrase, 'w2v'))
X_train = np.vstack(X_train)

y_test = df_test[col_target].to_numpy()
X_test = []
for phrase in tqdm(df_test[col_text]):
    X_test.append(get_sentence_embedding(model_w2v, phrase, 'w2v'))
X_test = np.vstack(X_test)

HBox(children=(IntProgress(value=0, max=114230), HTML(value='')))




HBox(children=(IntProgress(value=0, max=48957), HTML(value='')))




In [11]:
# Train and apply MLP
clf_mlp_w2v = MLPClassifier()
clf_mlp_w2v.fit(X_train, y_train)

y_predicted = clf_mlp_w2v.predict(X_test)

print(f'Recall: {metrics.recall_score(y_test, y_predicted)}')
print(f'Precision: {metrics.precision_score(y_test, y_predicted)}')



Recall: 0.5537040967947358
Precision: 0.7489233419465977


### Based on Fasttext

In [12]:
model_fasttext = KeyedVectors.load("./tayga_none_fasttextcbow_300_10_2019/model.model")

In [16]:
y_train = df_train[col_target].to_numpy()
X_train = []
for phrase in tqdm(df_train[col_text]):
    X_train.append(get_sentence_embedding(model_fasttext, phrase, 'fasttext'))
X_train = np.vstack(X_train)

y_test = df_test[col_target].to_numpy()
X_test = []
for phrase in tqdm(df_test[col_text]):
    X_test.append(get_sentence_embedding(model_fasttext, phrase, 'fasttext'))
X_test = np.vstack(X_test)

HBox(children=(IntProgress(value=0, max=114230), HTML(value='')))




HBox(children=(IntProgress(value=0, max=48957), HTML(value='')))




In [17]:
# Train and apply MLP
clf_mlp_fasttext = MLPClassifier()
clf_mlp_fasttext.fit(X_train, y_train)

y_predicted = clf_mlp_fasttext.predict(X_test)
print(f'Recall: {metrics.recall_score(y_test, y_predicted)}')
print(f'Precision: {metrics.precision_score(y_test, y_predicted)}')

Recall: 0.7253237104648694
Precision: 0.823869801084991


### Based on ELMO

In [10]:
#model_elmo = ElmoEmbedder(options_file="./elmo/options.json", weight_file="./elmo/model.hdf5", cuda_device=0)
#https://github.com/allenai/allennlp/issues/2245
#(charcnn, lstm1, lstm2)

In [18]:
elmo = Elmo(options_file="./elmo/options.json",
            weight_file="./elmo/model.hdf5",
            num_output_representations=1,
            requires_grad=False,
            dropout=0).to(device)

In [19]:
# Example on a subsample to save time
df = df.sample(5000)

df_train, df_test = train_test_split(
                                    df,
                                    test_size=test_size,
                                    random_state=random_state,
                                    stratify=df[col_target]
                                    )

In [21]:
X_train = get_text_embeddings(df_train[col_text], device=device, batch_size=4)
y_train = df_train[col_target].to_numpy()

X_test = get_text_embeddings(df_test[col_text], device=device, batch_size=4)
y_test = df_test[col_target].to_numpy()

HBox(children=(IntProgress(value=0, max=875), HTML(value='')))




HBox(children=(IntProgress(value=0, max=375), HTML(value='')))




In [23]:
clf_mlp_elmo = MLPClassifier()
clf_mlp_elmo.fit(X_train, y_train)

y_predicted = clf_mlp_elmo.predict(X_test)
print(f'Recall: {metrics.recall_score(y_test, y_predicted)}')
print(f'Precision: {metrics.precision_score(y_test, y_predicted)}')

Recall: 0.7052631578947368
Precision: 0.767175572519084
