<a href="https://colab.research.google.com/github/TapasKumarDutta1/multilingial/blob/master/monolingual_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install SentencePiece (presumably needed by the tokenizers loaded later — confirm).
pip install sentencepiece

In [None]:
# Install Hugging Face transformers (provides AutoTokenizer / AutoModel used below).
pip install transformers

In [None]:
# Download the PyTorch/XLA environment setup script, then run it requesting the
# nightly build plus the libomp5 and libopenblas-dev apt packages.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from statistics import mean
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision.models.resnet import resnet50, resnet18, resnet34, resnet101
from torch.optim import *
from torch.nn.modules.loss import *
from torch.optim.lr_scheduler import *
from torch.utils.data.sampler import RandomSampler
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from transformers import *
import seaborn as sns
from matplotlib import pyplot as plt
from datetime import date
import albumentations as A
from tempfile import gettempdir
from sklearn.preprocessing import LabelEncoder
import scipy as sp
import cv2
import zipfile
from pathlib import Path
import random
import argparse
import sys
import yaml
import time
import torch_xla
import torch_xla.debug.metrics as met
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.parallel_loader as pl
import torch_xla.utils.utils as xu
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.test.test_utils as test_utils
from typing import Dict
import matplotlib.pyplot as plt
from torch import nn, optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
from torch import Tensor
from sklearn.metrics import *
import torch.nn.functional as F
import warnings
import math

In [None]:
def regular_encode(texts, tokenizer, maxlen=192):
    """Tokenize ``texts`` and return their input ids as a NumPy array.

    Args:
        texts: Sequence of raw strings handed to the tokenizer.
        tokenizer: Object exposing ``batch_encode_plus``; it is called with
            ``return_token_type_ids=False``, ``pad_to_max_length=True`` and
            ``max_length=maxlen``.
        maxlen: Value forwarded to the tokenizer as ``max_length``.

    Returns:
        ``np.array`` built from the ``"input_ids"`` entry of the encoding.
    """
    encode_kwargs = {
        "return_token_type_ids": False,
        "pad_to_max_length": True,
        "max_length": maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_kwargs)
    input_ids = encoded["input_ids"]
    return np.array(input_ids)


class Transformer(nn.Module):
    """Sigmoid classification head on top of a pretrained transformer encoder.

    The wrapped encoder must expose ``pooler.dense.out_features`` and, when
    called, return an indexable output whose element ``[1]`` is the pooled
    representation fed to the head.

    Args:
        transformer: The pretrained encoder module.
        num_classes: Number of sigmoid outputs produced per example.
        pad_token_id: Token id treated as padding when building the attention
            mask. Defaults to 0, which reproduces the previous ``tokens > 0``
            mask for non-negative ids. Pass the tokenizer's real pad id when it
            is not 0, otherwise padding positions are attended to.
    """

    def __init__(self, transformer, num_classes=1, pad_token_id=0):
        super().__init__()

        self.transformer = transformer
        # Plain attribute (not a parameter/buffer) so the state_dict layout is
        # unchanged and existing checkpoints still load.
        self.pad_token_id = pad_token_id

        self.nb_features = self.transformer.pooler.dense.out_features
        self.pooler = nn.Sequential(
            nn.Linear(self.nb_features, num_classes),
            nn.Sigmoid(),
        )

    def forward(self, tokens):
        """Return sigmoid scores for a batch of token-id tensors."""
        # 1 for real tokens, 0 for padding positions.
        attention_mask = (tokens != self.pad_token_id).long()
        pooled = self.transformer(tokens, attention_mask=attention_mask)[1]
        return self.pooler(pooled)


In [None]:
class bce(nn.Module):
    """Binary cross-entropy on probabilities, averaged over all elements.

    ``weight`` and ``size_average`` are accepted only to keep the constructor
    signature stable; they are not used.
    """

    def __init__(self, weight=None, size_average=True):
        super().__init__()

    def forward(self, inputs, targets, smooth=1, eps=1e-7):
        """Compute the mean binary cross-entropy.

        Args:
            inputs: Predicted probabilities (any shape; flattened internally).
            targets: Target values of the same total size (soft labels allowed).
            smooth: Unused; kept for signature compatibility.
            eps: Probabilities are clamped to ``[eps, 1 - eps]`` so that a
                saturated sigmoid (exactly 0 or 1) cannot produce ``log(0)``,
                which would turn the loss into ``nan``/``inf``.

        Returns:
            Scalar tensor holding the mean loss.
        """
        inputs = inputs.view(-1)
        targets = targets.view(-1)

        # Guard against log(0): 0 * -inf evaluates to nan and poisons the mean.
        inputs = torch.clamp(inputs, eps, 1 - eps)

        negative_term = (1 - targets) * torch.log(1 - inputs)
        positive_term = targets * torch.log(inputs)
        return -torch.mean(negative_term + positive_term)


class JigsawDataset:
    """Map-style dataset over pre-tokenized sentences and optional labels.

    Args:
        x: Indexable array of encoded sentences; must expose ``shape``.
        y: Indexable labels aligned with ``x`` (ignored when ``is_test`` is
            non-zero, so it may be ``None`` there).
        is_test: ``0`` to yield ``(tokens, label)`` pairs, anything else to
            yield only ``tokens``.
    """

    def __init__(self, x, y, is_test):
        super().__init__()
        self.y = y
        self.is_test = is_test
        self.sentences = x

    def __len__(self):
        return self.sentences.shape[0]

    def __getitem__(self, idx):
        """Return the item at ``idx``, wrapping indices past the end."""
        size = len(self)
        # ``>=`` (not ``>``): idx == size is already out of range and must wrap
        # to 0 instead of raising IndexError.
        if idx >= size:
            idx = idx % size
        tokens = torch.tensor(self.sentences[idx])
        if self.is_test == 0:
            return tokens, torch.tensor(self.y[idx]).float()
        return tokens


In [None]:
def train_all(train_loader, model, device, optimizer):
    """Run one training pass over ``train_loader`` and return the mean loss.

    Args:
        train_loader: Iterable yielding ``(tokens, labels)`` batches.
        model: Module mapping a token batch to probabilities.
        device: Device that inputs and labels are moved to before the forward
            pass.
        optimizer: Optimizer stepped through ``xm.optimizer_step``.

    Returns:
        Mean of the per-batch ``bce`` losses (``statistics.mean``; raises
        ``StatisticsError`` if the loader yields no batches).
    """
    model.train()
    criterion = bce()
    batch_losses = []
    for x, y_batch in train_loader:
        # Move inputs as well as labels: this is a no-op when the loader has
        # already placed the batch on ``device``, and keeps the function
        # usable with a plain DataLoader.
        x = x.to(device)
        y_batch = y_batch.to(device)
        y_pred = model(x)

        loss = criterion(y_pred.view(-1).float(), y_batch.float())
        loss.backward()
        batch_losses.append(loss.item())
        xm.optimizer_step(optimizer)

        # Clear gradients so they do not accumulate into the next batch.
        model.zero_grad()
    return mean(batch_losses)


def valid_all(train_loader, model, device):
    """Evaluate ``model`` on ``train_loader`` and return the mean ``bce`` loss.

    The model is switched to eval mode and autograd is disabled for the pass,
    so layers such as dropout behave deterministically and no graph is kept.
    The model is left in eval mode afterwards.

    Args:
        train_loader: Iterable yielding ``(tokens, labels)`` batches.
        model: Module mapping a token batch to probabilities.
        device: Device that inputs and labels are moved to.

    Returns:
        Mean of the per-batch losses (``statistics.mean``; raises
        ``StatisticsError`` if the loader yields no batches).
    """
    model.eval()
    criterion = bce()
    batch_losses = []
    with torch.no_grad():
        for x, y_batch in train_loader:
            x = x.to(device)
            y_batch = y_batch.to(device)
            y_pred = model(x)

            loss = criterion(y_pred.view(-1).float(), y_batch.float())
            batch_losses.append(loss.item())

    return mean(batch_losses)


def predict_all(train_loader, model, device):
    """Run inference over ``train_loader`` and collect the raw model outputs.

    Autograd is disabled and the model is put in eval mode, so the returned
    tensors do not keep a computation graph alive.

    Args:
        train_loader: Iterable yielding token batches (no labels).
        model: Module mapping a token batch to probabilities.
        device: Device each batch is moved to before the forward pass.

    Returns:
        List with one output tensor per batch, in loader order.
    """
    model.eval()
    predict = []
    with torch.no_grad():
        for x in tqdm(train_loader):
            predict.append(model(x.to(device)))

    return predict


def load_data(lang):
    """Load every labelled comment available for ``lang`` into one frame.

    Three sources are concatenated and then filtered to ``lang``:
    the translated training file for ``lang``, the competition test set
    labelled with the ``toxic`` column of a previous submission, and the
    competition validation set.

    Args:
        lang: Language code used both in the training file name and as the
            value to keep in the ``lang`` column.

    Returns:
        DataFrame with the ``lang`` column dropped and a fresh 0..n-1 index.
    """
    trn = pd.read_csv(
        "../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-"
        + lang
        + "-cleaned.csv",
        usecols=["toxic", "comment_text"],
    )
    trn["lang"] = lang
    tst = pd.read_csv(
        "../input/jigsaw-multilingual-toxic-comment-classification/test.csv",
        usecols=["lang", "content"],
    )
    sub = pd.read_csv("../input/multilingual1/submission.csv")
    val = pd.read_csv(
        "../input/jigsaw-multilingual-toxic-comment-classification/validation.csv",
        usecols=["lang", "comment_text", "toxic"],
    )
    # Rename by name rather than by position: read_csv keeps the file's column
    # order regardless of the order given in ``usecols``.
    tst = tst.rename(columns={"content": "comment_text"})
    # Row-aligned assignment — assumes the submission rows are in the same
    # order as test.csv (TODO confirm).
    tst["toxic"] = sub["toxic"]
    # Keyword ``axis``/``columns``: positional forms are rejected by pandas 2.x.
    df = pd.concat([trn, tst, val], axis=0)
    return df.loc[df["lang"] == lang].reset_index(drop=True).drop(columns=["lang"])


def get_lang(val, tst, lang):
    """Stack ``val`` on top of ``tst`` and keep only the rows for ``lang``.

    Args:
        val: DataFrame containing at least ``id`` and ``lang`` columns.
        tst: DataFrame containing at least ``id`` and ``lang`` columns.
        lang: Value of the ``lang`` column to keep.

    Returns:
        The matching rows with ``id`` and ``lang`` dropped and the index reset.
    """
    # Keyword ``axis``/``columns``: positional forms are rejected by pandas 2.x.
    df = pd.concat([val, tst], axis=0)
    selected = df.loc[df["lang"] == lang].reset_index(drop=True)
    return selected.drop(columns=["id", "lang"])


def main():
    """Train one model per K-fold split on TPU and save a checkpoint per fold.

    The data for language ``l1`` is loaded and tokenized once. For each of the
    five folds, eight processes are spawned with ``xmp.spawn``; each builds the
    model, trains it for ``epochs`` passes over the fold's training split and
    saves the model and optimizer state to ``lang + str(number)``.
    """
    l1 = "fr"
    lang = "french"
    link_dk = {
        "fr": "camembert-base",
        "pt": "neuralmind/bert-base-portuguese-cased",
        "ru": "DeepPavlov/rubert-base-cased",
        "tr": "dbmdz/bert-base-turkish-cased",
        "es": "dccuchile/bert-base-spanish-wwm-cased",
        "it": "dbmdz/bert-base-italian-xxl-cased",
    }
    # The loop previously hard-coded 3 passes while ``epochs`` sat unused at 1;
    # the constant now drives the loop with the same effective value.
    epochs = 3
    batch_size = 16
    learning_rate = 1e-5
    seed = 42

    # Setting seed
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    df = load_data(l1)
    tokenizer = AutoTokenizer.from_pretrained(link_dk[l1])
    x_train = regular_encode(list(df.comment_text.values), tokenizer, maxlen=192)
    y_train = df.toxic.values

    def run():
        """Train on the current fold (``trn_x``/``trn_y``) and save the result."""
        torch.manual_seed(seed)

        device = xm.xla_device()
        model = AutoModel.from_pretrained(link_dk[l1])
        model = Transformer(model).to(device)

        # Training
        train_dataset = JigsawDataset(trn_x, trn_y, 0)

        # Each replica reads its own shard of the fold.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False,
        )
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            sampler=train_sampler,
            drop_last=False,
            num_workers=2,
        )

        # Learning rate is scaled by the number of replicas.
        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=learning_rate * xm.xrt_world_size(),
            weight_decay=1e-3,
        )

        xm.master_print("Training is Starting ...... ")
        total_loss = []
        for _ in tqdm(range(epochs)):
            # A fresh ParallelLoader is built for every pass over the data.
            para_loader = pl.ParallelLoader(train_loader, [device])
            total_loss.append(
                train_all(
                    para_loader.per_device_loader(device), model, device, optimizer
                )
            )

        state = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
        xm.save(state, lang + str(number))

    def _mp_fn(rank, flags):
        torch.set_default_tensor_type("torch.FloatTensor")
        run()

    # ``random_state`` is omitted: with ``shuffle=False`` it has no effect and
    # current scikit-learn raises ValueError when it is supplied.
    kf = KFold(n_splits=5, shuffle=False)
    number = 0
    for train_index, test_index in kf.split(range(df.shape[0])):
        # ``run`` reads these through its closure; processes are forked, so
        # each child sees the values assigned for the current fold.
        trn_x = x_train[train_index]
        trn_y = y_train[train_index]
        number += 1
        FLAGS = {}
        xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method="fork")


In [None]:
# Run the K-fold training entry point defined in the previous cell.
main()

In [None]:
def predict_all(train_loader, model, device, df, batch_size):
    """Run inference over ``train_loader`` and return per-batch NumPy outputs.

    Args:
        train_loader: Iterable yielding token batches (no labels).
        model: Module mapping a token batch to probabilities.
        device: Device each batch is moved to before the forward pass.
        df: DataFrame whose row count is used only to size the progress bar.
        batch_size: Batch size used only to size the progress bar.

    Returns:
        List with one ``numpy`` array per batch, in loader order.
    """
    # Ceil division: a trailing partial batch still counts as one step, and
    # tqdm gets an integer total instead of a fractional one.
    total_batches = (df.shape[0] + batch_size - 1) // batch_size

    model.eval()
    predict = []
    # No gradients are needed for inference; skip building the autograd graph.
    with torch.no_grad():
        for x in tqdm(train_loader, total=total_batches):
            y_pred = model(x.to(device))
            predict.append(y_pred.cpu().detach().numpy())

    return predict


def load_data(lang):
    """Load the competition test comments written in ``lang``.

    Args:
        lang: Value of the ``lang`` column to keep.

    Returns:
        DataFrame of the matching rows with the ``lang`` column dropped and
        the index reset. Its shape is printed as a side effect.
    """
    tst = pd.read_csv(
        "../input/jigsaw-multilingual-toxic-comment-classification/test.csv",
        usecols=["lang", "content"],
    )
    # ``columns=`` keyword: the positional axis form is rejected by pandas 2.x.
    tst = tst.loc[tst["lang"] == lang].reset_index(drop=True).drop(columns=["lang"])
    print(tst.shape)
    return tst


def get_lang(val, tst, lang):
    """Concatenate ``val`` and ``tst`` row-wise and keep the rows for ``lang``.

    Args:
        val: DataFrame containing at least ``id`` and ``lang`` columns.
        tst: DataFrame containing at least ``id`` and ``lang`` columns.
        lang: Value of the ``lang`` column to keep.

    Returns:
        The matching rows with ``id`` and ``lang`` dropped and the index reset.
    """
    # Keyword ``axis``/``columns``: positional forms are rejected by pandas 2.x.
    combined = pd.concat([val, tst], axis=0)
    matching = combined.loc[combined["lang"] == lang].reset_index(drop=True)
    return matching.drop(columns=["id", "lang"])


def main():
    """Predict on the test comments of one language with each fold checkpoint.

    The test rows for ``l1`` are loaded and tokenized once. For fold numbers
    1 through 5, the checkpoint ``lang + str(number)`` is loaded, inference is
    run over the whole test set, and the result is written to
    ``lang + "_predictions_" + str(number) + ".npy"``.
    """
    seed = 42
    batch_size = 16
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    l1 = "fr"
    lang = "french"
    link_dk = {
        "fr": "camembert-base",
        "pt": "neuralmind/bert-base-portuguese-cased",
        "ru": "DeepPavlov/rubert-base-cased",
        "tr": "dbmdz/bert-base-turkish-cased",
        "es": "dccuchile/bert-base-spanish-wwm-cased",
        "it": "dbmdz/bert-base-italian-xxl-cased",
    }
    df = load_data(l1)
    tokenizer = AutoTokenizer.from_pretrained(link_dk[l1])
    x_train = regular_encode(list(df.content.values), tokenizer, maxlen=192)

    def run():
        """Load the checkpoint for the current ``number`` and save predictions."""
        torch.manual_seed(seed)

        device = xm.xla_device()
        model = AutoModel.from_pretrained(link_dk[l1])
        model = Transformer(model).to(device)
        model.load_state_dict(torch.load(lang + str(number))["state_dict"])
        # Make inference mode explicit instead of relying on how the encoder
        # happened to be constructed.
        model.eval()

        train_dataset = JigsawDataset(x_train, None, 1)

        # shuffle=False so the prediction order follows the dataset order.
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=xm.get_ordinal(),
            shuffle=False,
        )
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=batch_size,
            sampler=train_sampler,
            drop_last=False,
            num_workers=2,
        )

        predictions = []
        para_loader = pl.ParallelLoader(train_loader, [device])
        # Gradients are never used here; disabling autograd avoids keeping a
        # graph for every batch.
        with torch.no_grad():
            predictions.append(
                predict_all(
                    para_loader.per_device_loader(device), model, device, df, batch_size
                )
            )

        # NOTE(review): ``predictions`` is a list holding one list of per-batch
        # arrays; when the final batch is smaller the saved array is ragged.
        # Confirm what downstream loaders expect before changing this layout
        # (newer NumPy versions may refuse to build ragged arrays implicitly).
        np.save(lang + "_predictions_" + str(number) + ".npy", predictions)

    # ``run`` reads ``number`` through its closure.
    for number in range(1, 6):
        run()


# Run the per-fold test-set prediction entry point defined just above.
main()

(10920, 1)


683it [01:15,  9.03it/s]                           
683it [00:59, 11.55it/s]
683it [00:59, 11.53it/s]
683it [00:59, 11.51it/s]
683it [00:59, 11.53it/s]
