## Base Ridge Ensemble

In [None]:
import gc
import nltk
import numpy as np
import pandas as pd
from scipy import sparse
from pprint import pprint
from nltk.corpus import stopwords
from IPython.display import display
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings; warnings.filterwarnings("ignore")

# Load the toxic-comment training set and preview ten random positive rows
# for each of the six label columns.
df = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
for col in ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']:
    display(
        df.loc[df[col] == 1, ['comment_text', col]].sample(10)
    )

# Collapse the six labels into one score: severe_toxic is doubled before the
# row-wise sum, and the sum is divided by its maximum so y tops out at 1.
df['severe_toxic'] = df['severe_toxic'] * 2
df['y'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis = 1).astype(int)
df['y'] = df['y'] / df['y'].max()
df = df[['comment_text', 'y']].rename(columns = {'comment_text': 'text'})

# Write n_folds resampled training subsets of the raw text: a frac_1 share of
# the rows with y > 0 plus frac_1_factor times as many rows with y == 0,
# shuffled together. Each fold uses its own seed, 10 * (fold + 1).
n_folds = 7
frac_1 = 0.4
frac_1_factor = 1.5
for fld in range(n_folds):
    seed = 10 * (fld + 1)
    tmp_df = pd.concat(
        [
            df[df.y > 0].sample(frac = frac_1, random_state = seed),
            df[df.y == 0].sample(
                n = int(len(df[df.y > 0]) * frac_1 * frac_1_factor),
                random_state = seed,
            ),
        ],
        axis = 0,
    ).sample(frac = 1, random_state = seed)
    tmp_df.to_csv(f'/kaggle/working/df_fld{fld}.csv', index = False)

# English stop-word list from NLTK; clean() drops tokens found in it.
stop = stopwords.words('english')
# WordNet lemmatizer instance used by lemmatize_text().
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Return a list with every element of ``text`` passed through the lemmatizer.

    ``text`` is iterated element by element, so it is presumably meant to be
    a sequence of tokens rather than a raw string (a string would be
    processed one character at a time) -- TODO confirm with callers.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

def clean(data, col):
    """Normalise the text in ``data[col]`` in place and return ``data``.

    Steps, in order: expand common English contractions, pad newlines and
    in-word punctuation with spaces, cap runs of repeated punctuation and
    repeated letters, collapse multiple spaces, and finally drop every
    whitespace-separated token that appears in the module-level ``stop`` list.

    Args:
        data: DataFrame holding the text column; it is modified in place.
        col: Name of the string column to clean.

    Returns:
        The same ``data`` object, with ``data[col]`` replaced by cleaned text.
    """
    # Ordered (pattern, replacement) rules. Order matters: for example
    # "can't" must be rewritten before the generic "n't" rule runs.
    regex_rules = [
        (r"what's", "what is "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        ('\n', ' \n '),
        (r'([a-zA-Z]+)([/!?.])([a-zA-Z]+)', r'\1 \2 \3'),
        (r'([*!?\'])\1\1{2,}', r'\1\1\1'),
        (r'([*!?\']+)', r' \1 '),
        (r'([a-zA-Z])\1{2,}\b', r'\1\1'),
        (r'([a-zA-Z])\1\1{2,}\B', r'\1\1\1'),
        (r'[ ]{2,}', ' '),
    ]

    text = data[col]
    for pattern, replacement in regex_rules:
        # regex=True is passed explicitly: pandas 2.0 changed the default of
        # Series.str.replace to literal matching, which would silently turn
        # every pattern above into a plain substring search.
        text = text.str.replace(pattern, replacement, regex=True)
    # One collapse-and-strip pass is enough; repeating it changes nothing.
    text = text.str.strip()

    # Set membership keeps the per-token stop-word test O(1).
    stop_words = set(stop)
    data[col] = text.apply(
        lambda x: ' '.join(word for word in x.split() if word not in stop_words)
    )
    return data

# Small hand-written sample (newlines, slashes, repeated punctuation and
# repeated letters) to eyeball what clean() does before running it on df.
test_clean_df = pd.DataFrame({"text": ["heyy\n\nkkdsfj", "hi   how/are/you ???", "hey?????", "noooo!!!!!!!!!   comeone !! ", "cooooooooool     brooooooooooo  coool brooo", "naaaahhhhhhh"]})
display(test_clean_df)
# clean() modifies the frame in place; the return value is not captured here.
clean(test_clean_df, 'text')
# Clean the full training frame.
df = clean(df, 'text')

# Same resampling scheme as the raw folds, now on the cleaned text and with a
# smaller positive share (frac_1) and a smaller negative multiplier.
n_folds = 7
frac_1 = 0.3
frac_1_factor = 1.2
for fld in range(n_folds):
    seed = 10 * (fld + 1)
    tmp_df = pd.concat(
        [
            df[df.y > 0].sample(frac = frac_1, random_state = seed),
            df[df.y == 0].sample(
                n = int(len(df[df.y > 0]) * frac_1 * frac_1_factor),
                random_state = seed,
            ),
        ],
        axis = 0,
    ).sample(frac = 1, random_state = seed)
    tmp_df.to_csv(f'/kaggle/working/df_clean_fld{fld}.csv', index = False)

# The training frame is no longer needed once the fold files are on disk.
del df, tmp_df
gc.collect()

# Ruddit data: keep the text and offensiveness score, renamed to text / y,
# then min-max scale y into [0, 1].
df_ = pd.read_csv("../input/ruddit-jigsaw-dataset/Dataset/ruddit_with_text.csv")
df_ = df_[['txt', 'offensiveness_score']]
df_ = df_.rename(columns = {'txt': 'text', 'offensiveness_score': 'y'})
df_['y'] = (df_['y'] - df_['y'].min()) / (df_['y'].max() - df_['y'].min())

# Each fold is a random frac_1 sample of the rows, seeded per fold.
n_folds = 7
frac_1 = 0.7
for fld in range(n_folds):
    tmp_df = df_.sample(frac = frac_1, random_state = 10 * (fld + 1))
    tmp_df.to_csv(f'/kaggle/working/df2_fld{fld}.csv', index = False)

del tmp_df, df_
gc.collect()

# Validation pairs; the code below reads the less_toxic / more_toxic columns.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")
# Comments to score for the submission; the code below reads the text column.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

class LengthTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer emitting one shifted and scaled length feature.

    The offset 360 and divisor 550 are fixed constants; presumably rough
    centring and scaling values for comment length -- TODO confirm.
    """

    def fit(self, X, y = None):
        # Nothing to learn.
        return self

    def transform(self, X):
        # One single-column row per input element.
        rows = [[(len(x) - 360) / 550] for x in X]
        return sparse.csr_matrix(rows)

    def get_feature_names(self):
        return ["lngth"]

class LengthUpperTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer emitting the share of upper-case characters.

    For every input string the single output feature is
    ``(number of upper-case characters) / len(string)``. An empty string
    yields 0.0 rather than raising ``ZeroDivisionError``.
    """

    @staticmethod
    def _upper_ratio(text):
        """Return the fraction of characters in ``text`` that are upper-case."""
        if len(text) == 0:
            # Empty comment: define the ratio as 0 instead of dividing by zero.
            return 0.0
        return sum(1 for ch in text if ch.isupper()) / len(text)

    def fit(self, X, y = None):
        # Nothing to learn.
        return self

    def transform(self, X):
        # One single-column row per input element.
        return sparse.csr_matrix([[self._upper_ratio(x)] for x in X])

    def get_feature_names(self):
        return ["lngth_uppercase"]

# Upper-case character ratio for both sides of every validation pair, followed
# by a 100-bin histogram of each column for visual inspection.
df_val['upper_1'] = np.array(LengthUpperTransformer().transform(df_val['less_toxic']).todense()).reshape(-1, 1)
df_val['upper_2'] = np.array(LengthUpperTransformer().transform(df_val['more_toxic']).todense()).reshape(-1, 1)
df_val['upper_1'].hist(bins = 100)
df_val['upper_2'].hist(bins = 100)
# Prediction buffers with one column per fold: less-toxic side, more-toxic
# side and submission comments. They are filled by the fold loop that follows.
val_preds_arr1 = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2 = np.zeros((df_val.shape[0], n_folds))
test_preds_arr = np.zeros((df_sub.shape[0], n_folds))

# Train one char-n-gram TF-IDF + Ridge pipeline per raw-text fold file and
# store its predictions for both validation sides and the submission comments.
for fld in range(n_folds):
    df = pd.read_csv(f'/kaggle/working/df_fld{fld}.csv')
    features = FeatureUnion([("vect3", TfidfVectorizer(min_df = 3, max_df = 0.5, analyzer = 'char_wb', ngram_range = (3, 5)))])
    pipeline = Pipeline([("features", features), ("clf", Ridge())])
    pipeline.fit(df['text'], df['y'])
    # scikit-learn 1.2 removed get_feature_names(); use get_feature_names_out()
    # when the installed version has it and fall back to the old name otherwise.
    feature_wts = sorted(
        zip(
            (getattr(pipeline['features'], 'get_feature_names_out', None)
             or pipeline['features'].get_feature_names)(),
            np.round(pipeline['clf'].coef_, 2),
        ),
        key = lambda x: x[1],
        reverse = True,
    )
    # Show the 30 n-grams with the largest positive coefficients.
    pprint(feature_wts[:30])
    val_preds_arr1[:, fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2[:, fld] = pipeline.predict(df_val['more_toxic'])
    test_preds_arr[:, fld] = pipeline.predict(df_sub['text'])

# Prediction buffers (one column per fold) for the models trained on cleaned text.
val_preds_arr1c = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2c = np.zeros((df_val.shape[0], n_folds))
test_preds_arrc = np.zeros((df_sub.shape[0], n_folds))

# Train one char-n-gram TF-IDF + Ridge pipeline per cleaned-text fold file.
for fld in range(n_folds):
    df = pd.read_csv(f'/kaggle/working/df_clean_fld{fld}.csv')
    features = FeatureUnion([("vect3", TfidfVectorizer(min_df = 3, max_df = 0.5, analyzer = 'char_wb', ngram_range = (3, 5)))])
    pipeline = Pipeline([("features", features), ("clf", Ridge())])
    pipeline.fit(df['text'], df['y'])
    # scikit-learn 1.2 removed get_feature_names(); use get_feature_names_out()
    # when the installed version has it and fall back to the old name otherwise.
    feature_wts = sorted(
        zip(
            (getattr(pipeline['features'], 'get_feature_names_out', None)
             or pipeline['features'].get_feature_names)(),
            np.round(pipeline['clf'].coef_, 2),
        ),
        key = lambda x: x[1],
        reverse = True,
    )
    # Show the 30 n-grams with the largest positive coefficients.
    pprint(feature_wts[:30])
    val_preds_arr1c[:, fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2c[:, fld] = pipeline.predict(df_val['more_toxic'])
    test_preds_arrc[:, fld] = pipeline.predict(df_sub['text'])

# Prediction buffers (one column per fold) for the models trained on Ruddit.
val_preds_arr1_ = np.zeros((df_val.shape[0], n_folds))
val_preds_arr2_ = np.zeros((df_val.shape[0], n_folds))
test_preds_arr_ = np.zeros((df_sub.shape[0], n_folds))
# Train one char-n-gram TF-IDF + Ridge pipeline per Ruddit fold file.
for fld in range(n_folds):
    df = pd.read_csv(f'/kaggle/working/df2_fld{fld}.csv')
    features = FeatureUnion([("vect3", TfidfVectorizer(min_df = 3, max_df = 0.5, analyzer = 'char_wb', ngram_range = (3, 5)))])
    pipeline = Pipeline([("features", features), ("clf", Ridge())])
    pipeline.fit(df['text'], df['y'])
    # scikit-learn 1.2 removed get_feature_names(); use get_feature_names_out()
    # when the installed version has it and fall back to the old name otherwise.
    feature_wts = sorted(
        zip(
            (getattr(pipeline['features'], 'get_feature_names_out', None)
             or pipeline['features'].get_feature_names)(),
            np.round(pipeline['clf'].coef_, 2),
        ),
        key = lambda x: x[1],
        reverse = True,
    )
    # Show the 30 n-grams with the largest positive coefficients.
    pprint(feature_wts[:30])
    val_preds_arr1_[:, fld] = pipeline.predict(df_val['less_toxic'])
    val_preds_arr2_[:, fld] = pipeline.predict(df_val['more_toxic'])
    test_preds_arr_[:, fld] = pipeline.predict(df_sub['text'])

# Release the last fold's frame, pipeline and weight listing.
del df, pipeline, feature_wts
gc.collect()

# Fold-averaged validation predictions (less-toxic side, more-toxic side):
#   p1, p2 - models trained on the raw-text folds
#   p3, p4 - models trained on the Ruddit folds
#   p5, p6 - models trained on the cleaned-text folds
p1, p2 = val_preds_arr1.mean(axis = 1), val_preds_arr2.mean(axis = 1)
p3, p4 = val_preds_arr1_.mean(axis = 1), val_preds_arr2_.mean(axis = 1)
p5, p6 = val_preds_arr1c.mean(axis = 1), val_preds_arr2c.mean(axis = 1)

# Grid-search the three blend weights. w1 runs over 0.30..0.69, w3 over
# 0.00..0.19, and w2 takes the remainder so the weights sum to 1. Each entry
# of wts_acc is (w1, w2, w3, accuracy), where accuracy is the percentage of
# validation pairs whose less-toxic side gets the lower blended score.
wts_acc = []
for i in range(30, 70, 1):
    for j in range(0, 20, 1):
        w1 = i / 100
        w2 = (100 - i - j) / 100
        w3 = (1 - w1 - w2)
        p1_wt = w1 * p1 + w2 * p3 + w3 * p5
        p2_wt = w1 * p2 + w2 * p4 + w3 * p6
        wts_acc.append((w1, w2, w3, np.round((p1_wt < p2_wt).mean() * 100, 2)))

# Select on accuracy (tuple index 3). Ranking on index 2 would pick the entry
# with the largest w3, regardless of how well it scores. max() returns the
# first best entry, the same tie-break a stable descending sort gives.
w1, w2, w3, _ = max(wts_acc, key = lambda x: x[3])
# Recompute the blended validation scores with the selected weights.
p1_wt = w1 * p1 + w2 * p3 + w3 * p5
p2_wt = w1 * p2 + w2 * p4 + w3 * p6
# Per-pair diagnostics: both blended scores, their absolute gap, and whether
# the less-toxic side received the lower score.
df_val['p1'] = p1_wt
df_val['p2'] = p2_wt
df_val['diff'] = np.abs(p2_wt - p1_wt)
df_val['correct'] = (p1_wt < p2_wt).astype('int')
# Mis-ordered pairs with the smallest and the largest gaps. These are bare
# expressions, so they only show output when run as notebook cell results.
df_val[df_val.correct == 0].sort_values('diff', ascending = True).head(20)
df_val[df_val.correct == 0].sort_values('diff', ascending = False).head(20)
# Submission score: the same weighted blend applied to fold-averaged test predictions.
df_sub['score'] = w1 * test_preds_arr.mean(axis = 1) + w2 * test_preds_arr_.mean(axis = 1) + w3 * test_preds_arrc.mean(axis = 1)
# Number of rows whose score duplicates another row's score.
df_sub['score'].count() - df_sub['score'].nunique()

# The ten most frequent score values. The index is named explicitly so the
# frame has 'index' (score value) and 'count' columns on every pandas version;
# on pandas >= 2.0 a bare value_counts().reset_index() has no 'index' column.
same_score = df_sub['score'].value_counts().rename_axis('index').reset_index(name = 'count')[:10]
# Rows of the submission frame that carry one of those scores.
df_sub[df_sub['score'].isin(same_score['index'].tolist())]

## RoBERTa Ensemble

In [None]:
import os
import gc
import cv2
import copy
import time
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For Transformer Models
from transformers import AutoTokenizer, AutoModel,AutoConfig

# Utils
from tqdm import tqdm

# For descriptive error messages
# NOTE(review): presumably this makes CUDA kernel launches synchronous so an
# error is reported at the call that caused it -- confirm against CUDA docs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


class Config:
    """Static settings shared by the first RoBERTa inference section."""

    # Checkpoint and tokenisation
    model_name = '../input/roberta-base'
    max_length = 128

    # Model head
    hidden_size = 768
    # NOTE(review): not read anywhere in the visible code.
    num_hidden_layers = 24
    dropout = 0.2
    num_classes = 1

    # Batch sizes
    train_bs = 32
    valid_bs = 64
    test_bs = 128

    # Training-time hyper-parameters; the visible code only runs inference.
    learning_rate = 1e-4
    min_lr = 1e-7
    epochs = 1
    scheduler = 'CosineAnnealingLR'
    T_max = 500
    weight_decay = 1e-6
    max_grad_norm = 1.0
    margin = 0.5
    n_fold = 5
    # The attribute name is kept as spelled so existing references keep working.
    n_accululate = 1

    # Reproducibility and placement
    seed = 2021
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


# Tokenizer loaded from the same local path that JModel uses as its default checkpoint.
tokenizer = AutoTokenizer.from_pretrained(Config.model_name)

# Fine-tuned weight files, one per fold (0-4), in fold order.
MODEL_PATHS = [
    f'../input/robertabase5fold2-linear-256/Loss-Fold-{fold}.bin'
    for fold in range(5)
]

def set_seed(seed = 42):
    """Seed Python, NumPy and PyTorch so repeated runs give the same results.

    Also switches cuDNN to deterministic mode with benchmarking disabled and
    writes the seed into the PYTHONHASHSEED environment variable.
    """
    # Fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Random number generators: stdlib, NumPy, PyTorch (CPU and CUDA)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # The CuDNN backend needs these two options as well
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all RNGs before the loader and the models are built.
set_seed(Config.seed)


# Comments to score; the dataset below reads the text column.
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df.head()

class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        df: Frame with a ``text`` column.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Length every encoded sequence is truncated or padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Encode a single comment at a fixed length.
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        # Token ids and attention mask as int64 tensors.
        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }


# Dataset and loader over the comments to score. shuffle=False keeps the
# prediction order aligned with the rows of df.
test_dataset = JigsawDataset(df, tokenizer, max_length=Config.max_length)
test_loader = DataLoader(test_dataset, batch_size=Config.test_bs, num_workers=2, shuffle=False, pin_memory=True)

class JModel(nn.Module):
    """Transformer backbone followed by LayerNorm, dropout and a two-layer head.

    Args:
        checkpoint: Path or name handed to ``AutoModel.from_pretrained``.
        Config: Settings object supplying ``hidden_size`` and ``dropout``.
    """

    def __init__(self, checkpoint=Config.model_name, Config=Config):
        super().__init__()
        self.checkpoint = checkpoint
        # return_dict=False makes the backbone return a plain tuple.
        self.bert = AutoModel.from_pretrained(checkpoint, return_dict=False)
        self.layer_norm = nn.LayerNorm(Config.hidden_size)
        self.dropout = nn.Dropout(Config.dropout)
        # hidden_size -> 256 -> 1 regression head
        head_layers = [
            nn.Linear(Config.hidden_size, 256),
            nn.LeakyReLU(negative_slope=0.01),
            nn.Dropout(Config.dropout),
            nn.Linear(256, 1),
        ]
        self.dense = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        # Only the second element of the backbone output (pooled) is used.
        _sequence_output, pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.dropout(self.layer_norm(pooled))
        return self.dense(pooled)


@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and return all predictions.

    Each batch must provide ``'ids'`` and ``'mask'`` tensors. The outputs are
    flattened per batch and concatenated into one 1-D NumPy array, in loader order.
    """
    model.eval()

    collected = []
    for _, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        input_ids = batch['ids'].to(device, dtype = torch.long)
        attention_mask = batch['mask'].to(device, dtype = torch.long)

        batch_out = model(input_ids, attention_mask)
        collected.append(batch_out.view(-1).cpu().detach().numpy())

    all_preds = np.concatenate(collected)
    gc.collect()
    return all_preds

def inference(model_paths, dataloader, device):
    """Average the predictions of every checkpoint in ``model_paths``.

    Args:
        model_paths: Iterable of state-dict files, one per fold.
        dataloader: Loader yielding batches accepted by ``valid_fn``.
        device: Device on which each model and the batches are placed.

    Returns:
        1-D NumPy array: element-wise mean of the per-checkpoint predictions.
    """
    final_preds = []
    for i, path in enumerate(model_paths):
        model = JModel(Config.model_name)
        # map_location lets checkpoints saved on a GPU load on a CPU-only host.
        model.load_state_dict(torch.load(path, map_location=device))
        # Use the caller's device so the model and the batches that valid_fn
        # moves to `device` always end up on the same device.
        model.to(device)

        print(f"Getting predictions for model {i+1}")
        preds = valid_fn(model, dataloader, device)
        final_preds.append(preds)

        # Release this fold's weights before the next checkpoint is loaded.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    final_preds = np.array(final_preds)
    return np.mean(final_preds, axis=0)

# Fold-averaged predictions for the comments to score.
preds = inference(MODEL_PATHS, test_loader, Config.device)    
df['score'] = preds
# Replace raw outputs by their rank; method='first' breaks ties by row order,
# so every comment gets a distinct value.
df['score'] = df['score'].rank(method='first')
# Drop the text column before writing the remaining columns to disk.
df.drop('text', axis=1, inplace=True)
df.to_csv("submission_bert.csv", index=False)

## RoBERTa Ensemble 2

In [None]:
import os
import gc
import cv2
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# For Transformer Models
from transformers import AutoTokenizer, AutoModel

# Utils
from tqdm import tqdm

# For descriptive error messages
# NOTE(review): presumably this makes CUDA kernel launches synchronous so an
# error is reported at the call that caused it -- confirm against CUDA docs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Settings for the second RoBERTa inference section.
CONFIG = dict(
    seed = 42,
    # Local path of the backbone; also used to load the tokenizer below.
    model_name = '../input/roberta-base',
    test_batch_size = 64,
    # Fixed length every comment is truncated or padded to.
    max_length = 128,
    # Output width of JigsawModel's final linear layer.
    num_classes = 1,
    # First GPU when CUDA is available, otherwise the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
)

# The tokenizer is stored in the same dict so it travels with the settings.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])

# Fine-tuned weight files, one per fold (0-4), in fold order.
MODEL_PATHS = [
    f'../input/pytorch-w-b-jigsaw-starter/Loss-Fold-{fold}.bin'
    for fold in range(5)
]

def set_seed(seed = 42):
    """Seed Python, NumPy and PyTorch so repeated runs give the same results.

    Also switches cuDNN to deterministic mode with benchmarking disabled and
    writes the seed into the PYTHONHASHSEED environment variable.
    """
    # Fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Random number generators: stdlib, NumPy, PyTorch (CPU and CUDA)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # The CuDNN backend needs these two options as well
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


class JigsawDataset(Dataset):
    """Map-style dataset that tokenizes one comment per item.

    Args:
        df: Frame with a ``text`` column.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Length every encoded sequence is truncated or padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_length
        self.text = df['text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Encode a single comment at a fixed length.
        encoded = self.tokenizer.encode_plus(
            self.text[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        # Token ids and attention mask as int64 tensors.
        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }


class JigsawModel(nn.Module):
    """Transformer backbone with dropout and one linear output layer.

    Args:
        model_name: Path or name handed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        # 768 input features; the output width comes from CONFIG['num_classes'].
        self.fc = nn.Linear(768, CONFIG['num_classes'])

    def forward(self, ids, mask):
        backbone_out = self.model(
            input_ids=ids,
            attention_mask=mask,
            output_hidden_states=False,
        )
        # Element 1 of the backbone output feeds the head.
        pooled = self.drop(backbone_out[1])
        return self.fc(pooled)

@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` in eval mode and return all predictions.

    Each batch must provide ``'ids'`` and ``'mask'`` tensors. The outputs are
    flattened per batch and concatenated into one 1-D NumPy array, in loader order.
    """
    model.eval()

    collected = []
    for _, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
        input_ids = batch['ids'].to(device, dtype = torch.long)
        attention_mask = batch['mask'].to(device, dtype = torch.long)

        batch_out = model(input_ids, attention_mask)
        collected.append(batch_out.view(-1).cpu().detach().numpy())

    all_preds = np.concatenate(collected)
    gc.collect()
    return all_preds


def inference(model_paths, dataloader, device):
    """Average the predictions of every checkpoint in ``model_paths``.

    Args:
        model_paths: Iterable of state-dict files, one per fold.
        dataloader: Loader yielding batches accepted by ``valid_fn``.
        device: Device on which each model and the batches are placed.

    Returns:
        1-D NumPy array: element-wise mean of the per-checkpoint predictions.
    """
    final_preds = []
    for i, path in enumerate(model_paths):
        model = JigsawModel(CONFIG['model_name'])
        # map_location lets checkpoints saved on a GPU load on a CPU-only host.
        model.load_state_dict(torch.load(path, map_location=device))
        # Use the caller's device so the model and the batches that valid_fn
        # moves to `device` always end up on the same device.
        model.to(device)

        print(f"Getting predictions for model {i+1}")
        preds = valid_fn(model, dataloader, device)
        final_preds.append(preds)

        # Release this fold's weights before the next checkpoint is loaded.
        del model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    final_preds = np.array(final_preds)
    return np.mean(final_preds, axis=0)


# Seed all RNGs, then reload the comments to score.
set_seed(CONFIG['seed'])
df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df.head()

# shuffle=False keeps the prediction order aligned with the rows of df.
test_dataset = JigsawDataset(df, CONFIG['tokenizer'], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['test_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

# Fold-averaged predictions of the second RoBERTa ensemble.
preds1 = inference(MODEL_PATHS, test_loader, CONFIG['device'])
# Rank-valued scores written to disk by the first RoBERTa ensemble.
preds = pd.read_csv('submission_bert.csv')['score'].values
# Min-max scale both prediction vectors into [0, 1].
preds = (preds-preds.min())/(preds.max()-preds.min())
preds2 = (preds1-preds1.min())/(preds1.max()-preds1.min())

## Ensembling with (TFIDF Ridge & FastText Ensemble)

In [None]:
import re
import numpy as np
import pandas as pd
from scipy import sparse
from bs4 import BeautifulSoup
from sklearn.linear_model import Ridge
from gensim.models import KeyedVectors, FastText
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings; warnings.filterwarnings("ignore")

# Number of Ridge models in the ensemble (same value as RidgeEnsemble's default).
N_MODELS = 4
# Width added to the TF-IDF vocabulary size when the feature matrices are
# reshaped; presumably the FastText embedding dimension -- TODO confirm.
EXTRA_DIM = 256
# Spacing between consecutive Ridge alphas (same value as RidgeEnsemble's default).
ALPHA_STEP_SIZE = 0.5

def text_cleaning(text):
    """Strip links, markup, emoji ranges and non-alphanumerics from ``text``.

    Order of operations: remove http(s)/www links, extract the plain text
    with BeautifulSoup, delete characters in the listed emoji code-point
    ranges, turn every character outside ``a-z``, ``A-Z`` and digits into a
    space, collapse runs of spaces, and trim both ends.
    """
    # Links
    text = re.sub(r'https?://\S+|www\.\S+', r'', text)

    # Markup
    text = BeautifulSoup(text, 'lxml').get_text()

    # Emoji and symbol ranges
    emoji_ranges = ("["
                    u"\U0001F600-\U0001F64F"
                    u"\U0001F300-\U0001F5FF"
                    u"\U0001F680-\U0001F6FF"
                    u"\U0001F1E0-\U0001F1FF"
                    u"\U00002702-\U000027B0"
                    u"\U000024C2-\U0001F251"
                    "]+")
    text = re.sub(emoji_ranges, r'', text, flags = re.UNICODE)

    # Everything that is not a letter or digit becomes a space
    text = re.sub(r"[^a-zA-Z\d]", " ", text)
    text = re.sub(' +', ' ', text)
    return text.strip()

# Regression training data; rows containing any missing value are dropped.
df = pd.read_csv('../input/jigsaw-regression-based-data/train_data_version2.csv')
df = df.dropna(axis = 0)
# Character n-gram (3-5, within word boundaries) TF-IDF, capped at 46000
# features, fitted on the training text.
vec = TfidfVectorizer(min_df = 3, max_df = 0.5, analyzer = 'char_wb', ngram_range = (3, 5), max_features = 46000)
vec.fit(df['text'])
# Pre-trained FastText model; vectorizer() looks word vectors up through fmodel.wv.
fmodel = FastText.load('../input/jigsaw-regression-based-data/FastText-jigsaw-256D/Jigsaw-Fasttext-Word-Embeddings-256D.bin')

def splitter(text):
    """Split ``text`` on single spaces; consecutive spaces yield empty tokens."""
    return text.split(' ')
def vectorizer(text):
    """Return one float16 feature row for ``text``.

    The row is the dense TF-IDF vector from ``vec`` followed by the mean of
    the FastText vectors of the space-separated tokens; its shape is
    ``(1, n_tfidf_features + embedding_width)``.
    """
    tokens = splitter(text)
    tfidf_part = vec.transform([text]).toarray()
    # Average the token vectors and make the result a single row.
    embedding_part = np.mean(fmodel.wv[tokens], axis = 0).reshape(1, -1)
    return np.concatenate([tfidf_part, embedding_part], axis = -1).astype(np.float16)

# Stack one feature row per training text, flatten to two dimensions, convert
# to CSR, and drop the dense copy.
X_np = np.array([vectorizer(text) for text in df.text]).reshape(-1, (len(vec.vocabulary_) + EXTRA_DIM))
X = sparse.csr_matrix(X_np)
del X_np

class RidgeEnsemble():
    """Average of several Ridge regressors with evenly spaced alpha values.

    The alphas are ``alpha_step_size * 1, ..., alpha_step_size * n_models``.
    """

    def __init__(self, n_models = 4, alpha_step_size = 0.5):
        alphas = [alpha_step_size * i for i in range(1, n_models + 1)]
        self.models = [Ridge(alpha = alpha) for alpha in alphas]

    def fit(self, X, y):
        # Every member is fitted on the same data.
        fitted = []
        for model in self.models:
            fitted.append(model.fit(X, y))
        self.models = fitted

    def predict(self, X):
        # One row of predictions per member, then the column-wise mean.
        per_model = [np.expand_dims(model.predict(X), axis = 0) for model in self.models]
        return np.concatenate(per_model, axis = 0).mean(axis = 0)

# Pass the declared constants explicitly so the ensemble size and alpha
# spacing are controlled from one place. Their values currently equal the
# constructor defaults, so the fitted models are unchanged.
model = RidgeEnsemble(n_models = N_MODELS, alpha_step_size = ALPHA_STEP_SIZE)
model.fit(X, df['y'])

df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

# Feature matrix for the less-toxic side of every validation pair.
X_less_toxic = sparse.csr_matrix(
    np.array([vectorizer(text) for text in df_val.less_toxic]).reshape(-1, (len(vec.vocabulary_) + EXTRA_DIM))
)

# Feature matrix for the more-toxic side of every validation pair.
X_more_toxic = sparse.csr_matrix(
    np.array([vectorizer(text) for text in df_val.more_toxic]).reshape(-1, (len(vec.vocabulary_) + EXTRA_DIM))
)

# Clean the comments to score and build their feature matrix.
df_sub2 = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
df_sub2['text'] = df_sub2['text'].apply(text_cleaning)
X_sub_temp = []
for text in df_sub2.text: X_sub_temp.append(vectorizer(text))
# EXTRA_DIM replaces a literal 256 so this reshape agrees with the training
# and validation matrices.
X_sub_temp = np.array(X_sub_temp).reshape(-1, (len(vec.vocabulary_) + EXTRA_DIM))
X_test = sparse.csr_matrix(X_sub_temp)
del X_sub_temp

# Ensemble prediction for every comment, written as an intermediate
# submission file (a later cell writes submission.csv again).
df_sub2['score'] = model.predict(X_test)
df_sub2[['comment_id', 'score']].to_csv("submission.csv", index = False)

In [None]:
import os; os.environ['TOKENIZERS_PARALLELISM'] = 'false'
import torch
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoModel, AutoTokenizer

In [None]:
class Dataset:
    """
    For comments_to_score.csv (the submission), get only one comment per row
    """

    def __init__(self, text, tokenizer, max_len):
        self.text = text
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        # Entries are coerced to str before tokenisation.
        comment = str(self.text[item])
        encoded = self.tokenizer(
            comment,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        # The keys match the keyword arguments the model call expects.
        return {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ("input_ids", "attention_mask")
        }
    

def generate_predictions(model_path, max_len, is_multioutput):
    """Score every comment in comments_to_score.csv with one classifier.

    Args:
        model_path: Directory holding a sequence-classification model and its tokenizer.
        max_len: Length every comment is truncated or padded to.
        is_multioutput: When true the score is the sum of all logits;
            otherwise it is the logit at column index 1.

    Returns:
        1-D NumPy array with one score per comment, in file order.
    """
    # Use the GPU when there is one, otherwise run on the CPU instead of
    # failing on a hard-coded "cuda" device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.to(device)
    model.eval()

    df = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

    dataset = Dataset(text=df.text.values, tokenizer=tokenizer, max_len=max_len)
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=32,
        num_workers=2,
        # Pinned host memory only helps when batches are copied to a GPU.
        pin_memory=(device.type == "cuda"),
        shuffle=False,
    )

    final_output = []

    with torch.no_grad():
        for data in data_loader:
            batch = {key: value.to(device) for key, value in data.items()}
            logits = model(**batch).logits

            if is_multioutput:
                # Sum the logits for all the toxic labels
                # One strategy out of various possible
                output = logits.sum(dim=1)
            else:
                # Classifier. Get logits for "toxic"
                output = logits[:, 1]

            final_output.extend(output.detach().cpu().numpy().tolist())

    # Drop the model reference first so its memory can actually be returned.
    del model
    if device.type == "cuda":
        torch.cuda.empty_cache()
    return np.array(final_output)

In [None]:
# Scores from three Hugging Face classifiers. The first two sum all logits;
# the third takes the logit at column index 1.
# NOTE(review): preds1 and preds2 are also assigned in the second RoBERTa
# section, so these assignments replace those earlier values.
preds1 = generate_predictions("../input/toxic-bert", max_len=192, is_multioutput=True)
preds2 = generate_predictions("../input/hugging-face-models/toxic-detector-distilroberta", max_len=192, is_multioutput=True)
preds3 = generate_predictions("../input/hugging-face-models/BERT-Jigsaw", max_len=192, is_multioutput=False)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Attach the three classifier scores to the comments to score.
hf_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")
hf_sub["score_bert"] = preds1
hf_sub["score_distilrob"] = preds2
hf_sub["score_bertjig"] = preds3

# Min-max scale each score column independently to [0, 1].
sc = MinMaxScaler()
hf_sub[["score_bert", "score_distilrob", "score_bertjig"]] = sc.fit_transform(hf_sub[["score_bert", "score_distilrob", "score_bertjig"]])

# Unweighted mean of the three scaled scores.
hf_sub["score"] = hf_sub[["score_bert", "score_distilrob", "score_bertjig"]].mean(axis=1)

# Count rows whose score repeats an earlier row's score.
print(hf_sub.duplicated('score').value_counts())

hf_sub.head()

In [None]:
# Final blend: 0.4 * (0.94 * TF-IDF Ridge blend + 0.06 * scaled RoBERTa ranks)
#            + 0.4 * TF-IDF/FastText Ridge score + 0.2 * Hugging Face mean score.
# NOTE(review): this assumes df_sub, df_sub2, hf_sub and preds all follow the
# row order of comments_to_score.csv -- confirm no frame was re-sorted.
df_sub['score'] = (0.4 * ((df_sub['score'] * 0.94) + (preds * 0.06)) ) +  (0.4 * df_sub2['score']) + (0.2 * hf_sub['score'])

# Count rows whose blended score repeats an earlier row's score.
print(df_sub.duplicated('score').value_counts())

# Convert to ranks; method='first' breaks ties by row order so every comment
# gets a distinct value. Then write the final submission file.
df_sub['score'] = df_sub['score'].rank(method='first')
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)

In [None]:
# if not is_private():
#     df_sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
#     df_sub['score'] = 0.0
#     df_sub.to_csv('submission.csv', index = False)