In [1]:
import os
import torch
import pandas as pd
from scipy import stats
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import OrderedDict, namedtuple
import torch.nn as nn
from torch.optim import lr_scheduler
import joblib

import logging
import transformers
import sys

In [2]:
class BERTBaseUncased(nn.Module):
    def __init__(self, bert_path):
        super(BERTBaseUncased, self).__init__()
        self.bert_path = bert_path
        self.bert = transformers.BertModel.from_pretrained(self.bert_path)
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768 * 2, 1)

    def forward(
            self,
            ids,
            mask,
            token_type_ids
    ):
        o1, o2 = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids)
        
        apool = torch.mean(o1, 1)
        mpool, _ = torch.max(o1, 1)
        cat = torch.cat((apool, mpool), 1)

        bo = self.bert_drop(cat)
        p2 = self.out(bo)
        return p2


class BERTDatasetTest:
    def __init__(self, comment_text, tokenizer, max_length):
        self.comment_text = comment_text
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, item):
        comment_text = str(self.comment_text[item])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        
        padding_length = self.max_length - len(ids)
        
        ids = ids + ([0] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long)
        }

In [3]:
df = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/test.csv")
tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-multilingual-uncased/", 
                                                       do_lower_case=True)

In [4]:
device = "cuda"
model = BERTBaseUncased(bert_path="../input/bert-base-multilingual-uncased/").to(device)
model.load_state_dict(torch.load("../input/fork-of-toxicity-classification-v1-2/model.bin"))
model.eval()

BERTBaseUncased(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

In [5]:
list(df['lang'].unique())

['tr', 'ru', 'it', 'fr', 'pt', 'es']

In [6]:
valid_dataset = BERTDatasetTest(
        comment_text=df.content.values,
        tokenizer=tokenizer,
        max_length=192
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=64,
    drop_last=False,
    num_workers=4,
    shuffle=False
)

In [7]:
with torch.no_grad():
    fin_outputs = []
    for bi, d in tqdm(enumerate(valid_data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_np = outputs.cpu().detach().numpy().tolist()
        fin_outputs.extend(outputs_np)

998it [05:56,  2.80it/s]


In [8]:
fin_outputs

[[-3.3914706707000732],
 [-5.55560302734375],
 [-0.8107063174247742],
 [-3.498920202255249],
 [-4.61940336227417],
 [-1.6866639852523804],
 [-4.059052467346191],
 [-1.4683949947357178],
 [-2.4803924560546875],
 [-1.711912989616394],
 [-2.9460153579711914],
 [-0.9554620385169983],
 [1.0012952089309692],
 [-4.222466468811035],
 [0.18489982187747955],
 [-3.565481662750244],
 [-4.097567081451416],
 [-4.347966194152832],
 [-5.379560947418213],
 [-2.9811761379241943],
 [-5.305912017822266],
 [-2.7661995887756348],
 [-3.045413017272949],
 [-2.624285936355591],
 [-4.634127616882324],
 [-4.33123254776001],
 [-4.617262840270996],
 [-3.5793216228485107],
 [-2.77303147315979],
 [-1.9442131519317627],
 [-0.8568552136421204],
 [-4.5941925048828125],
 [-0.8022053241729736],
 [-3.3016726970672607],
 [-5.271793842315674],
 [-4.852114677429199],
 [-4.512022972106934],
 [-4.603598594665527],
 [-2.2468721866607666],
 [-4.4427947998046875],
 [-4.272116184234619],
 [-5.42038631439209],
 [-2.4707581996917725

In [9]:
df_en = pd.read_csv("../input/modelv2/test_en.csv")

valid_dataset = BERTDatasetTest(
        comment_text=df_en.content_en.values,
        tokenizer=tokenizer,
        max_length=192
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=64,
    drop_last=False,
    num_workers=4,
    shuffle=False
)

with torch.no_grad():
    fin_outputs_en = []
    for bi, d in tqdm(enumerate(valid_data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_np = outputs.cpu().detach().numpy().tolist()
        fin_outputs_en.extend(outputs_np)

998it [05:55,  2.81it/s]


In [10]:
fin_outputs_en

[[-5.95382833480835],
 [-6.134910583496094],
 [-2.7671573162078857],
 [-6.572113513946533],
 [-6.291626453399658],
 [2.0949394702911377],
 [-6.578081130981445],
 [-3.2942469120025635],
 [-3.1992549896240234],
 [-3.8727521896362305],
 [-6.054502964019775],
 [-0.4639870822429657],
 [1.1872071027755737],
 [-1.3334778547286987],
 [0.13673196732997894],
 [-3.856968879699707],
 [-4.499971866607666],
 [-5.507228374481201],
 [2.378264904022217],
 [-3.451974630355835],
 [-6.543571949005127],
 [-3.440572738647461],
 [-4.148654460906982],
 [-2.816128969192505],
 [-6.107646465301514],
 [-0.20303723216056824],
 [-6.4245195388793945],
 [-5.136613845825195],
 [-3.947748899459839],
 [-3.7396628856658936],
 [-0.39461860060691833],
 [-6.657824993133545],
 [-2.1727294921875],
 [-4.291390895843506],
 [-6.242150783538818],
 [-6.452244281768799],
 [-6.36177396774292],
 [-6.358073711395264],
 [-1.6644490957260132],
 [-6.1265482902526855],
 [-5.835404872894287],
 [-6.462325096130371],
 [-1.6495015621185303],


In [11]:
df_en2 = pd.read_csv("../input/modelv2/jigsaw_miltilingual_test_translated.csv")

valid_dataset = BERTDatasetTest(
        comment_text=df_en2.translated.values,
        tokenizer=tokenizer,
        max_length=192
)

valid_data_loader = torch.utils.data.DataLoader(
    valid_dataset,
    batch_size=64,
    drop_last=False,
    num_workers=4,
    shuffle=False
)

with torch.no_grad():
    fin_outputs_en2 = []
    for bi, d in tqdm(enumerate(valid_data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        token_type_ids = d["token_type_ids"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)

        outputs = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        outputs_np = outputs.cpu().detach().numpy().tolist()
        fin_outputs_en2.extend(outputs_np)

998it [05:55,  2.81it/s]


In [12]:
fin_outputs_en = [item for sublist in fin_outputs_en for item in sublist]
fin_outputs_en2 = [item for sublist in fin_outputs_en2 for item in sublist]
fin_outputs = [item for sublist in fin_outputs for item in sublist]

In [13]:
sample = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv")
sample.loc[:, "toxic"] = (np.array(fin_outputs) + np.array(fin_outputs_en) + np.array(fin_outputs_en2)) / 3.0
sample.to_csv("submission.csv", index=False)

In [14]:
sample.head()

Unnamed: 0,id,toxic
0,0,-5.107969
1,1,-6.063661
2,2,-1.971631
3,3,-5.503793
4,4,-5.769696
