This code originated from Josh Ludan at https://www.kaggle.com/jmrludan/jigsaw-competition. I am simply organizing and annotating his work.

# Import libraries

In [None]:
import pandas as pd # dataframes
import numpy as np # math
from tqdm.auto import tqdm # tells you how much longer you need to wait for for-loops to end
tqdm.pandas()
from scipy import sparse # deals with sparse matricies
from scipy.sparse import coo_matrix
import torch # AKA pytorch
import torch.nn as nn #AKA pytorch neuroal net
import torch.nn.functional as F # more pytorch
import optuna # finds best parameters
from sklearn.model_selection import StratifiedKFold # used to cross

# libraries used to clean the text
from bs4 import BeautifulSoup # used to decode html
import re # regular expression library

# text embedding libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel

# modeling libraries
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#garbage collection
import gc

# memory variables
# Number of comments handled per step in the batched BERT embedding loop and
# in the chunked feature-stacking loop further down.
batch_size=100

In [None]:
import os # operating system library

# Debug switch: when enabled, list every file found under /kaggle/input
printInputFilePaths = False
if printInputFilePaths:
    for folder, _subdirs, names in os.walk('/kaggle/input'):
        for name in names:
            print(os.path.join(folder, name))

In [None]:
# set seed for randomness
# (passed as random_state to DataFrame.sample and StratifiedKFold below)
rseed=201

# Model without optimization

## Import Training Data
First let's import the training data into pandas data frames

In [None]:
# Smoke-test switch: when True only the first 1000 rows of the first corpus
# are loaded, and later cells trim their inputs as well.
testing = False
if testing:
    train = pd.read_csv(
        '../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv',
        nrows=1000,
    )

In [None]:
if not testing:
    # Labelled corpora to merge. The cells below read the columns
    # id, comment_text and the six label columns from the combined frame.
    train_paths = [
        '../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv',
        '../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv',
    ]

    # pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
    # and removed in pandas 2.0.
    train = pd.concat([pd.read_csv(path) for path in train_paths], ignore_index=True)

    # Optional third corpus, currently disabled:
    # train3 = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv')
    # train3['severe_toxic'] = train3['severe_toxicity']
    # train3['identity_hate'] = train3['identity_attack']
    # train3 = train3[['id','comment_text','toxic','severe_toxic','obscene','threat','insult','identity_hate']]
    # train = pd.concat([train, train3], ignore_index=True)
    # del train3

    # The same comment id can appear in both files; keep the first occurrence.
    train.drop_duplicates(subset=['id'], inplace=True)

## Handle Target Values

Before transforming the text, let's work on the target values. Below we assign a weight to each target column and combine the weighted columns into the single target value we want.

In [None]:
# Weight of each label column in the combined toxicity score
FEATURE_WTS = {
    'severe_toxic': 1.5, 'identity_hate': 1.5, 'threat': 1.5,
    'insult': 0.64, 'toxic': 0.32, 'obscene': 0.16,
}

# Accumulate the weighted labels left to right in this fixed order
label_order = ('severe_toxic', 'toxic', 'obscene', 'threat', 'insult', 'identity_hate')
weighted = train[label_order[0]] * FEATURE_WTS[label_order[0]]
for label in label_order[1:]:
    weighted = weighted + train[label] * FEATURE_WTS[label]

# Assign positionally (plain list), then drop the id and the raw label columns
train['toxicity'] = weighted.tolist()
train = train.drop(columns=['id', *label_order])

In [None]:
# Display 10 random rows of the training frame as a sanity check
train.sample(10)

### Imbalanced dataset
Below we run code to see if the comments have a relatively equal amount of toxic and non-toxic comments.

In [None]:
# Class balance report: a comment counts as toxic when its weighted score is positive
print("The training dataset has %i rows." % len(train))
print("The first training dataset has %i toxic comments." % train['toxicity'].gt(0).sum())
print("The first training dataset has %i non-toxic comments." % train['toxicity'].eq(0).sum())

The dataset is very unbalanced. Below is code to undersample the majority class.

In [None]:
# Balance the classes: keep every toxic comment and draw an equally sized
# random sample (seeded with rseed) from the non-toxic comments.
is_toxic = train['toxicity'] > 0
undersample_n = is_toxic.sum()

train_undersample = train[train['toxicity'] == 0].sample(
    n=undersample_n, random_state=rseed)

# toxic rows first, sampled non-toxic rows after them
train = pd.concat([train[is_toxic], train_undersample])

In [None]:
# Same class balance report, now on the undersampled frame
print("The training dataset has %i rows." % len(train))
print("The first training dataset has %i toxic comments." % train['toxicity'].gt(0).sum())
print("The first training dataset has %i non-toxic comments." % train['toxicity'].eq(0).sum())

## Clean Training Data
Now that we have the training data, we can clean the comments.

In [None]:
# function to clean raw text
# Patterns are compiled once here because text_cleaning runs on every comment.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations, in the order applied:-
    1. Removes embedded URL links
    2. Removes HTML tags (BeautifulSoup with the lxml parser)
    3. Removes emojis
    4. Replaces every remaining character that is not a letter or digit
       (&, #, punctuation, ...) with a space
    5. Collapses repeated spaces and trims both ends

    text - Text piece to be cleaned. None/NaN is treated as an empty
           comment; any other non-string value is converted with str().

    Returns the cleaned string.
    '''
    # pandas represents a missing comment as None or float NaN (NaN != NaN);
    # without this guard the regex calls below raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _URL_PATTERN.sub(r'', text)  # Removes website links

    text = BeautifulSoup(text, 'lxml').get_text()  # Removes HTML tags

    # Emojis are deleted outright (not replaced by a space), so the words on
    # either side of an emoji are joined together.
    text = _EMOJI_PATTERN.sub(r'', text)

    text = _NON_ALNUM_PATTERN.sub(" ", text)  # Remove special characters
    text = _MULTI_SPACE_PATTERN.sub(' ', text)  # Remove extra spaces
    return text.strip()  # remove spaces at the beginning and at the end

In [None]:
# Clean every training comment in place; progress_apply is the tqdm-enabled
# variant of Series.apply registered by tqdm.pandas() in the import cell.
train['comment_text'] = train['comment_text'].progress_apply(text_cleaning)

## Transform comments using a pretrained BERT model

Next, we want to apply a [pretrained sentence transformer](https://www.sbert.net/docs/pretrained_models.html) model to the text.

The goal of sentence embedding models is to project similar sentences closer together or alternatively further apart if the sentences are completely different. Specifically the model chosen here offers pretty good quality and is relatively fast compared to the model that is supposed to be the best. 

Running the pretrained model requires two steps. In the first step (tokenization) you split each sentence into a list of numbers, where each number represents a token (a word or word piece) of the sentence. The numbers come from the vocabulary that is loaded when you run the command "AutoTokenizer.from_pretrained()". The second step is to feed the results of the first step into the model to project the sentence.



In [None]:
# Directory of the pretrained paraphrase-MiniLM-L6-v2 transformer
MINILM_DIR = '../input/sentence-transformer-pretrained-models/pre_trained_models/paraphrase-MiniLM-L6-v2/0_Transformer'

# Tokenizer that belongs to the model; sequences are capped at 512 tokens
tokenizer = AutoTokenizer.from_pretrained(MINILM_DIR, model_max_length=512)
# The transformer itself
model = AutoModel.from_pretrained(MINILM_DIR)

# Place the model on the compute device. This notebook pins it to the CPU;
# no GPU is used for the embeddings.
device = torch.device("cpu")
model.to(device)


To run the BERT model on the comments we need to run the comments in batches. These are the steps:

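1. Tokenize each of the comments using the tokenizer of the pretrained BERT model. The output contains matrices of size: NUMBER_OF_COMMENTS X SEQUENCE_LENGTH, where SEQUENCE_LENGTH is the length of the longest comment in the batch (at most model_max_length).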

2. Run the BERT model to project the sentences. The `model` will return `last_hidden_state` and `pooler_output`. The `last_hidden_state` is a matrix containing the sequence of hidden states at the output of the last layer of the model. It is of size: NUMBER_OF_COMMENTS X SEQUENCE_LENGTH X NUM_HIDDEN_STATES. The `pooler_output` is the last-layer hidden state of the first token of the sequence after further processing through the layers used for the auxiliary pretraining task. It is a matrix of size: NUMBER_OF_COMMENTS X NUM_HIDDEN_STATES. Only `last_hidden_state` is used below.

3. Perform mean pooling (multiply the results of the model by the attention mask and then average the columns). Note that an attention mask is a binary tensor indicating the position of the padded indices so that the model does not attend to them

4. L_p normalize the embeddings

In [None]:
def bertmodel(comment_series, tokenizer, model):
    """Embed a batch of comments as L2-normalised, mean-pooled vectors.

    comment_series - iterable of strings (one batch of comments).
    tokenizer      - callable that accepts a list of strings plus the
                     padding/truncation/return_tensors keywords and returns a
                     mapping with an 'attention_mask' entry and a .to(device)
                     method.
    model          - torch module whose output exposes 'last_hidden_state'.

    Returns a list with one embedding (list of floats) per comment.
    """
    # Run on whatever device the model lives on, instead of relying on a
    # module-level `device` variable that must be kept in sync by the caller.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # STEP 1: Tokenization
    # * Either pads or truncates the sentences to equal token lengths
    # * Converts the tokens to numbers
    encoded_input = tokenizer(list(comment_series),
                              padding=True,
                              truncation=True,
                              return_tensors='pt').to(run_device)

    # STEP 2: run BERT model (inference only, so no gradients are tracked)
    with torch.no_grad():
        token_embeddings = model(**encoded_input)['last_hidden_state']

    # STEP 3: MEAN POOLING
    # Broadcast the attention mask over the hidden dimension. It is cast to
    # float so that the 1e-9 floor in clamp() below is not subject to integer
    # dtype handling.
    input_mask_expanded = (encoded_input['attention_mask']
                           .unsqueeze(-1)
                           .expand(token_embeddings.size())
                           .float())

    # save memory
    del encoded_input

    # Sum the embeddings of the real (non-padding) tokens and divide by how
    # many there are; clamp guards against division by zero.
    token_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
    token_count = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    sentence_embeddings = token_sum / token_count

    # save memory
    del token_embeddings
    del input_mask_expanded

    # STEP 4: L2-normalize each embedding and convert to plain Python lists
    return F.normalize(sentence_embeddings, p=2, dim=1).tolist()

In [None]:
# Embed the training comments batch_size rows at a time and collect the
# per-comment vectors in one flat list.
x2 = []
for start in tqdm(list(range(0, len(train), batch_size))):
    # the final batch may be shorter than batch_size
    stop = min(start + batch_size, len(train))
    batch = train[start:stop]
    x2.extend(bertmodel(batch["comment_text"], tokenizer, model))

convert list of embeddings into a sparse matrix

In [None]:
# Stored as a scipy CSR matrix so it can be stacked with the TF-IDF matrix below
x2 = sparse.csr_matrix(np.array(x2))

## Transform comments into a sparse word frequency matrix

In parallel to modeling the comments with a pretrained BERT model, we generate a sparse matrix from the text of each comment using term frequency–inverse document frequency (TFIDF). The TFIDF vectorizer creates a model which converts a list of comments into a sparse matrix where the columns are the "relevant" character n-grams (3 to 5 characters long, taken inside word boundaries) found across all of the comments, and the rows are the comments. A value is 0 if the n-gram does not occur in the respective comment. Otherwise, the value is equal to the importance of the n-gram in the comment ([the TFIDF value](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)).

First we fit the model using the training comments.

In [None]:
# Character n-gram TF-IDF: 3-5 character grams inside word boundaries that
# occur in at least 10 comments and in at most 5% of them, capped at 1000 features.
tfidf_settings = dict(
    min_df=10,
    max_df=0.05,
    analyzer='char_wb',
    ngram_range=(3, 5),
    max_features=1000,
)
vec = TfidfVectorizer(**tfidf_settings)
vec.fit(train['comment_text'])

Then we transform the comments into a sparse matrix (NUMBER_OF_COMMENTS X VOCAB_SIZE) with the TFIDF model

In [None]:
# One row per training comment, one column per n-gram kept by the vectorizer
x1 = vec.transform(train['comment_text'])
gc.collect()

## Stack the features

Now that we have the two ways of transforming the text we can combine them as two sets of features for training the model. We stack these features horizontally into a pytorch tensor called `X`

In [None]:
gc.collect()

# x1 (TF-IDF) and x2 (BERT) hold one row per comment in the same order, so a
# single horizontal stack builds the full feature matrix. Stacking chunk by
# chunk with vstack re-copied the growing X on every batch (quadratic work)
# and produced the same rows. The sparse storage format of X may differ from
# the chunked version, but its contents are the same.
X = sparse.hstack([x1, x2])

# save memory
del x1
del x2

## Custom Function for Embedding

Let's put all the embedding steps into a single function so that we don't have to do all those steps again...

In [None]:
def sentencesFeatureMatrix(comment_series, vec, tokenizer, model):
    """Build the model input for a series of raw comments.

    comment_series - pandas Series of raw comment strings.
    vec            - fitted TF-IDF vectorizer.
    tokenizer      - tokenizer passed through to bertmodel.
    model          - transformer passed through to bertmodel.

    Returns a scipy sparse matrix with one row per comment: the TF-IDF
    columns followed by the sentence-embedding columns.
    """
    # same cleaning as applied to the training comments
    cleaned = comment_series.progress_apply(text_cleaning)

    # TF-IDF part
    tfidf_part = vec.transform(cleaned)

    # Embedding part, computed 100 comments at a time
    chunk = 100
    total = len(cleaned)
    embeddings = []
    for start in tqdm(list(range(0, total, chunk))):
        stop = min(start + chunk, total)
        embeddings.extend(bertmodel(cleaned[start:stop], tokenizer, model))
    bert_part = sparse.csr_matrix(np.array(embeddings))

    # side-by-side: [TF-IDF | embedding]
    return sparse.hstack([tfidf_part, bert_part])

## Fit XGBoost Model

Cool, done with those shenanigans. Time to fit a model that takes the transformed text matrix and returns toxicity values.

In [None]:
# Regression target: the weighted toxicity score of each training comment
Y = train['toxicity'].tolist()

# Gradient-boosted regressor mapping the stacked features to the toxicity score
xgb_model = xgb.XGBRegressor(
    max_depth=7,
    n_estimators=100,
    learning_rate=0.44,
    subsample=0.25,
    tree_method="hist",
)
xgb_model.fit(X, Y)

## Validation of the Model

import validation data

In [None]:
# Annotated comment pairs; the cells below read the less_toxic and more_toxic columns
val_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")
val_df.sample(n=5)

In [None]:
# In smoke-test mode keep only the first 10 pairs so validation runs quickly
if testing:
    val_df=val_df.head(n=10)

clean the validation text data

In [None]:
# Score the "less toxic" side of every pair
lessToxic_embeddings_list = sentencesFeatureMatrix(val_df['less_toxic'], vec, tokenizer, model)
gc.collect()
val_df['less_toxic_score'] = xgb_model.predict(lessToxic_embeddings_list)

# Score the "more toxic" side of every pair
moreToxic_embeddings_list = sentencesFeatureMatrix(val_df['more_toxic'], vec, tokenizer, model)
gc.collect()
val_df['more_toxic_score'] = xgb_model.predict(moreToxic_embeddings_list)

# Pairwise accuracy: share of pairs where the "more toxic" comment received
# the strictly higher score
model_acc = val_df['less_toxic_score'].lt(val_df['more_toxic_score']).mean()

In [None]:
# Fraction of validation pairs ranked in the annotated order
print(model_acc)

## Run the model on submission data

Now that we trained the model let's transform the submission text and run the model

In [None]:
# Comments that have to be scored for the submission
to_score = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
if testing:
    # smoke-test mode: only the first 10 comments
    to_score = to_score.head(10)

# same feature pipeline as used for training
to_score_embeddings_list = sentencesFeatureMatrix(to_score['text'], vec, tokenizer, model)

gc.collect()

# predicted toxicity per comment
to_score['score'] = xgb_model.predict(to_score_embeddings_list)

# show a few random results
to_score.sample(10)

Now format the submission

In [None]:
# Submission file: comment id and predicted score, without the index column
results = to_score[['comment_id', 'score']].copy()
results.to_csv('submission.csv', index=False)

# Optimize parameters

To optimize the parameters we can use the validation dataset for training by transforming validation data to give each row a comment and a score.

In [None]:
# Load the annotated pairs unless an earlier cell already did
if 'val_df' not in globals():
    val_df = pd.read_csv("/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv")

# Turn every pair into two labelled rows: the less toxic comment gets 0,
# the more toxic comment gets 1.
less_toxic_score_df = (
    val_df[["less_toxic"]]
    .rename(columns={"less_toxic": "comment_text"})
    .assign(toxicity=0)
)
more_toxic_score_df = (
    val_df[["more_toxic"]]
    .rename(columns={"more_toxic": "comment_text"})
    .assign(toxicity=1)
)

toxic_score_df = pd.concat([less_toxic_score_df, more_toxic_score_df], ignore_index=True)

# A comment can occur in many pairs: its score is the mean of its labels,
# i.e. the share of its pairs in which it was the more toxic one.
val_score_df = toxic_score_df.groupby('comment_text')['toxicity'].mean().reset_index()
# Binary flag (score above zero) used to stratify the cross-validation folds
val_score_df['tox_bin'] = val_score_df['toxicity'].gt(0).astype(int)

In [None]:
# function to normalize values between 0 and 1
def normFrac(nonNorm_series):
    '''
    Min-max scales the values so the smallest becomes 0 and the largest 1.

    nonNorm_series - pandas Series (or numpy array) of numbers.

    Returns the scaled values. When every value is identical there is no
    range to scale by, so all zeros are returned instead of the NaNs that
    0/0 would produce.
    '''
    lowest = nonNorm_series.min()
    span = nonNorm_series.max() - lowest
    if span == 0:
        return (nonNorm_series - lowest).astype(float)
    return (nonNorm_series - lowest) / span

We also need to set a loss function to compare comments

In [None]:
# get a dataframe where each comment has a predicted
# and true score, we split the comments in half
# evenly and then predict which score is more toxic
def marginRankLossF(df, predScoreCol, trueScoreCol):
    '''
    Margin ranking loss between the first and the second half of df.

    Row i of the first half is paired with row i of the second half. The
    pair's target is +1 when the first row has the strictly higher true
    score and -1 otherwise (ties included). With an odd number of rows the
    last row is ignored.

    df           - DataFrame holding both score columns (any index).
    predScoreCol - name of the column with the predicted scores.
    trueScoreCol - name of the column with the true scores.

    Returns the mean loss as a 0-d numpy array.
    '''
    half_rows = len(df) // 2

    # Split by POSITION: df is often a subset whose index labels are not
    # 0..n-1, and label-based .loc[range(...)] raises KeyError on those.
    first_half = df.iloc[:half_rows]
    second_half = df.iloc[half_rows:2 * half_rows]

    input1 = torch.tensor(first_half[predScoreCol].to_numpy(), requires_grad=False)
    input2 = torch.tensor(second_half[predScoreCol].to_numpy(), requires_grad=False)

    # True -> +1, False -> -1
    first_is_higher = first_half[trueScoreCol].to_numpy() > second_half[trueScoreCol].to_numpy()
    target = torch.tensor(first_is_higher.astype(int) * 2 - 1).to(input1.dtype)

    loss = nn.MarginRankingLoss()
    output = loss(input1, input2, target)
    return output.numpy()
                        

In [None]:
# Display the first rows of the per-comment validation scores
val_score_df.head()

In [None]:
# Drop the objects of the un-tuned run; the optimisation below builds its own
del tokenizer, model, vec, xgb_model, to_score, results

In [None]:
def objective(trial):
    """Optuna objective: cross-validated margin-ranking loss for one parameter draw.

    Every trial samples the six label weights, two TF-IDF settings and three
    XGBoost settings, rebuilds the weighted ``toxicity`` target on the
    labelled corpora and runs a stratified 5-fold split over ``val_score_df``:
    each fold trains on the corpora plus the in-fold validation comments and
    is scored on the held-out comments with ``marginRankLossF``.

    Uses the module-level names ``val_score_df``, ``rseed``, ``device``,
    ``text_cleaning``, ``sentencesFeatureMatrix``, ``normFrac`` and
    ``marginRankLossF``.

    trial - optuna trial used to sample the parameters.

    Returns the mean loss over the five folds as a float.
    """
    # The label columns are consumed (weighted, summed, dropped) below, so the
    # raw corpora are read again for every trial.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train = pd.concat(
        [
            pd.read_csv('../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv'),
            pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv'),
        ],
        ignore_index=True)
    train.drop_duplicates(subset=['id'], inplace=True)

    # optimize params
    # target subcategories
    obscene_w1 = trial.suggest_float('obscene_w1', 0, .5)
    toxic_w1 = trial.suggest_float('toxic_w1', 0, .5)
    threat_w1 = trial.suggest_float('threat_w1', .5, 1)
    insult_w1 = trial.suggest_float('insult_w1', 0, .5)
    severe_toxic_w1 = trial.suggest_float('severe_toxic_w1', .5, 1)
    identity_hate_w1 = trial.suggest_float('identity_hate_w1', .5, 1)
    # TF-IDF params (the parameter names are kept because later cells read
    # them from study.best_params). `step` is passed by keyword: it is
    # keyword-only in Optuna 3.
    min_df_param = trial.suggest_int('min_df_param', 5, 20, step=5)
    max_bert_feat = trial.suggest_int('max_bert_feat', 100, 900, step=200)
    # xgboost params
    max_depth_param = trial.suggest_int('max_depth_param', 5, 20)
    nest_param = trial.suggest_int('nest_param', 50, 350, step=50)
    # NOTE(review): the lower bound 0 allows a learning rate at which the
    # booster cannot learn - consider a small positive lower bound.
    learningRate_param = trial.suggest_float('learningRate_param', 0, .5)

    # get target values for training data
    toxicity_weights = {'obscene': obscene_w1, 'toxic': toxic_w1, 'threat': threat_w1,
                        'insult': insult_w1, 'severe_toxic': severe_toxic_w1,
                        'identity_hate': identity_hate_w1}
    for cat in toxicity_weights:
        # scale each label by its sampled weight (it used to be multiplied by
        # itself, so the sampled weights had no effect on the target)
        train[cat] = train[cat] * toxicity_weights[cat]
    train['toxicity'] = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
    train = train.drop(columns=['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

    # The pretrained tokenizer and transformer are not changed by the folds,
    # so they are loaded once per trial instead of once per fold.
    tokenizer = AutoTokenizer.from_pretrained(
        '../input/sentence-transformer-pretrained-models/pre_trained_models/paraphrase-MiniLM-L6-v2/0_Transformer',
        model_max_length=512)
    model = AutoModel.from_pretrained(
        '../input/sentence-transformer-pretrained-models/pre_trained_models/paraphrase-MiniLM-L6-v2/0_Transformer')
    # place the model on the module-level compute device
    model.to(device)

    error_results = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=rseed)
    for train3_index, val_df2_index in skf.split(val_score_df['comment_text'], val_score_df['tox_bin']):

        train_df = pd.concat([train, val_score_df.loc[train3_index, :]])
        # Independent copy with a fresh 0..n-1 index: the frame is modified
        # below and then split by row position inside marginRankLossF.
        val_df2 = val_score_df.loc[val_df2_index, :].copy().reset_index(drop=True)

        # undersample to the number of toxic comments (undersample_n)
        undersample_n = (train_df['toxicity'] > 0).sum()

        # perform undersample
        train_undersample = train_df.loc[train_df['toxicity'] == 0, :].sample(
            n=undersample_n, random_state=rseed)

        # generate new training dataframe given undersampled comments
        train_df = pd.concat([train_df.loc[train_df['toxicity'] > 0, :], train_undersample])

        del train_undersample

        # clean training data
        train_df['comment_text'] = train_df['comment_text'].progress_apply(text_cleaning)

        # fit TFIDF sparse matrix
        vec = TfidfVectorizer(min_df=min_df_param,
                              max_df=0.05,
                              analyzer='char_wb',
                              ngram_range=(3, 5),
                              max_features=max_bert_feat)
        vec.fit(train_df['comment_text'])

        # get embedding matrix for training data
        X = sentencesFeatureMatrix(train_df['comment_text'], vec, tokenizer, model)

        # Training regressor to predict toxicity. "hist" matches the other
        # fits in this notebook and does not require a GPU.
        xgb_model = xgb.XGBRegressor(
            max_depth=max_depth_param,
            n_estimators=nest_param,
            learning_rate=learningRate_param,
            subsample=0.25, tree_method="hist").fit(
            X, list(train_df['toxicity']))

        # run validation on the held-out comments
        to_score_embeddings_list = sentencesFeatureMatrix(
            val_df2['comment_text'], vec, tokenizer, model)

        gc.collect()

        # predict the toxicity values
        val_df2['pred_tox'] = xgb_model.predict(to_score_embeddings_list)
        # normalize predictions and targets to a common 0..1 scale
        val_df2['toxicity'] = normFrac(val_df2['toxicity'])
        val_df2['pred_tox'] = normFrac(val_df2['pred_tox'])
        error_results.append(marginRankLossF(val_df2, 'pred_tox', 'toxicity'))

        # free the per-fold objects before the next fold
        del train_df
        del val_df2
        del undersample_n
        del X
        del vec
        del to_score_embeddings_list
        del xgb_model

    del tokenizer
    del model

    print(error_results)
    return float(sum(error_results) / len(error_results))

# create_study() with no arguments minimises the objective, which fits the
# loss that objective() returns.
study = optuna.create_study()
study.optimize(objective, n_trials=10)
gc.collect()
# Best parameters found, keyed by the names given to trial.suggest_* in objective()
best_param_dict = study.best_params
best_param_dict

Now I will reset the weights with the optimized values and generate a new submission.

In [None]:
# import training data
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat(
    [
        pd.read_csv('../input/d/julian3833/jigsaw-toxic-comment-classification-challenge/train.csv'),
        pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv'),
    ],
    ignore_index=True)
train.drop_duplicates(subset=['id'], inplace=True)

# parameters selected by the optuna study
best = study.best_params

# handle target values: weight every label with its optimised weight
train_df = train.copy()
toxicity_weights = {'obscene': best['obscene_w1'],
                    'toxic': best['toxic_w1'],
                    'threat': best['threat_w1'],
                    'insult': best['insult_w1'],
                    'severe_toxic': best['severe_toxic_w1'],
                    'identity_hate': best['identity_hate_w1']}
for cat in toxicity_weights:
    train_df[cat] = train_df[cat] * toxicity_weights[cat]

train_df['toxicity'] = train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1)
train_df = train_df.drop(columns=['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

# Add the scored validation comments. Only the text and score columns are
# taken, so the tox_bin helper column does not end up in the training frame
# (the column selection used to be a stand-alone expression with no effect).
train_df = pd.concat([train_df, val_score_df.loc[:, ["comment_text", "toxicity"]]])

# undersample to the number of toxic comments (undersample_n)
undersample_n = (train_df['toxicity'] > 0).sum()

# perform undersample
train_undersample = train_df.loc[train_df['toxicity'] == 0, :].sample(
    n=undersample_n, random_state=rseed)

# generate new training dataframe given undersampled comments
train_df = pd.concat([train_df.loc[train_df['toxicity'] > 0, :], train_undersample])

# clean training data
train_df['comment_text'] = train_df['comment_text'].progress_apply(text_cleaning)

# Load the tokenizer associated to the BERT model
tokenizer = AutoTokenizer.from_pretrained(
    '../input/sentence-transformer-pretrained-models/pre_trained_models/paraphrase-MiniLM-L6-v2/0_Transformer',
    model_max_length=512)
# Load model associated to the BERT model
model = AutoModel.from_pretrained(
    '../input/sentence-transformer-pretrained-models/pre_trained_models/paraphrase-MiniLM-L6-v2/0_Transformer')

# Place the model on the compute device (pinned to the CPU in this notebook)
device = torch.device("cpu")
model.to(device)

# fit TFIDF sparse matrix
vec = TfidfVectorizer(min_df=best['min_df_param'],
                      max_df=0.05,
                      analyzer='char_wb',
                      ngram_range=(3, 5),
                      max_features=best['max_bert_feat'])
vec.fit(train_df['comment_text'])

# get embedding matrix for training data
X = sentencesFeatureMatrix(train_df['comment_text'], vec, tokenizer, model)

# Training regressor to predict toxicity
xgb_model = xgb.XGBRegressor(
    max_depth=best['max_depth_param'],
    n_estimators=best['nest_param'],
    learning_rate=best['learningRate_param'],
    subsample=0.25, tree_method="hist").fit(
    X, list(train_df['toxicity']))

In [None]:
# Comments that have to be scored for the submission
to_score = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')

# features from the re-fitted vectorizer and the transformer
to_score_embeddings_list = sentencesFeatureMatrix(
    to_score['text'], vec, tokenizer, model)

gc.collect()

# toxicity predicted by the tuned regressor
to_score['score'] = xgb_model.predict(to_score_embeddings_list)

# show a few random results
to_score.sample(10)

In [None]:
# Overwrite submission.csv with the scores of the tuned model
results = to_score[['comment_id', 'score']].copy()
results.to_csv('submission.csv', index=False)