In [3]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [None]:
train = pd.read_csv('../input/train.csv.zip').fillna(' ')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat',
    'insult', 'identity_hate']
train = pd.read_csv('train.csv.zip').fillna(' ')
test = pd.read_csv('test.csv.zip').fillna(' ')
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

In [6]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000
)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [7]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000
)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)



In [8]:
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])

In [None]:
scores = []
submission = pd.DataFrame.from_dict({'id': test['id']})
for class_name in class_names:
    train_target = train[class_name]
    classifier = LogisticRegression(C=0.1, solver='sag')
    
    cv_score = np.mean(cross_val_score(classifier, train_features,
        train_target, cv=3, scoring='roc_auc'))
    scores.append(cv_score)

    print('CV score for class {} is {}'.format(class_name, cv_score))
    classifier.fit(train_features, train_target)
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]

In [12]:
print('Total CV score is {}'.format(np.mean(scores)))

Total CV score is 0.9802260391883925


In [13]:
submission.to_csv('submission.csv', index=False)

## Text preprocessing and cleanup

In [15]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [16]:
import os, math, operator, csv, random, pickle,re
import gc
from nltk.tokenize import TweetTokenizer
#from spacy.symbols import nsubj, VERB, dobj
import spacy
import en_core_web_sm
from unidecode import unidecode
from sklearn.model_selection import KFold, train_test_split
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
sys.setrecursionlimit(1500)

In [None]:
with open(hyphens_filepath, mode='rb') as file:
    hyphens_dict = pickle.load(file)
with open(misspellings_filepath, mode='rb') as file:
    misspellings_dict = pickle.load(file)
with open(merged_filepath, mode='rb') as file:
    merged_dict = pickle.load(file)
with open(toxic_words_filepath, mode='rb') as file:
    toxic_words = pickle.load(file)
with open(asterisk_words_filepath, mode='rb') as file:
    asterisk_words = pickle.load(file)
with open(fasttext_filepath, mode='rb') as file:
    fasttext_misspelings = pickle.load(file)
    
print(len(hyphens_dict))
print(len(misspellings_dict))
print(len(merged_dict))
print(len(toxic_words))
print(len(asterisk_words))
print(len(fasttext_misspelings))

In [None]:
TEXT_COLUMN = 'comment_text'
list_classes = ["toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate"]
CHARS_TO_REMOVE = """!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n""'∞θ÷α•à−β∅³π‘₹´°£€\×™√²—"""

submission = pd.read_csv("../input/sample_submission.csv.zip")

categories = ["toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate"]

data_folder = "../input/"
pretrained_folder = "../input/"
train_filepath = data_folder + "train.csv.zip"
test_filepath = data_folder + "test.csv.zip"

submission_path = data_folder + "submission.csv"

hyphens_filepath = "../input/cleaning-dictionaries/hyphens_dictionary.bin"
misspellings_filepath = (
"../input/cleaning-dictionaries/misspellings_all_dictionary.bin")
merged_filepath = "../input/cleaning-dictionaries/merged_all_dictionary.bin"
toxic_words_filepath = "../input/cleaning-dictionaries/toxic_words.bin"
asterisk_words_filepath = (
"../input/cleaning-dictionaries/asterisk_words.bin")
fasttext_filepath = "../input/cleaning-dictionaries/merged_all_dictionary.bin"

In [46]:
training_samples_count = 149571
validation_samples_count = 10000
length_threshold = 20000 # Truncate comments longer than this character length
word_count_threshold = 900 # Truncate comments with more than this many words
words_limit = 310000
valid_characters = (
    " " + "@$" + "'!?-" + "abcdefghijklmnopqrstuvwxyz"
    + "abcdefghijklmnopqrstuvwxyz".upper()
)
valid_characters_ext = valid_characters + "abcdefghijklmnopqrstuvwxyz".upper()
valid_set = set(x for x in valid_characters)
valid_set_ext = set(x for x in valid_characters_ext)

In [47]:
cont_patterns = [
    (r'(W|w)on\'t', r'will not'),
    (r'(C|c)an\'t', r'can not'),
    (r'(I|i)\'m', r'i am'),
    (r'(A|a)in\'t', r'is not'),
    (r'(\w+)\'ll', r'\g<1> will'),
    (r'(\w+)n\'t', r'\g<1> not'),
    (r'(\w+)\'ve', r'\g<1> have'),
    (r'(\w+)\'s', r'\g<1> is'),
    (r'(\w+)\'re', r'\g<1> are'),
    (r'(\w+)\'d', r'\g<1> would'),
]
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]

In [49]:
def split_word(word, toxic_words):
    if word == "":
        return ""
        
    lower = word.lower()
    for toxic_word in toxic_words:
        start = lower.find(toxic_word)
        if start >= 0:
            end = start + len(toxic_word)
            result = " ".join([word[0:start], word[start:end],
                split_word(word[end:], toxic_words)])
            return result.replace(" ", " ").strip()
    return word

In [50]:
tknzr = TweetTokenizer(strip_handles=False, reduce_len=True)

def word_tokenize(sentence):
    sentence = sentence.replace("$", "s")
    sentence = sentence.replace("@", "a")
    sentence = sentence.replace("!", " ! ")
    sentence = sentence.replace("?", " ? ")
    return tknzr.tokenize(sentence)

In [51]:
def replace_url(word):
    if ("http://" in word or "www." in word or "https://" in word
        or "wikipedia.org" in word):
        return ""
    return word

In [52]:
def normalize_by_dictionary(normalized_word, dictionary):
    result = []
    for word in normalized_word.split():
        if word == word.upper():
            if word.lower() in dictionary:
                result.append(dictionary[word.lower()].upper())
            else:
                result.append(word)
        else:
            if word.lower() in dictionary:
                result.append(dictionary[word.lower()])
            else:
                result.append(word)
    return " ".join(result)

In [53]:
nlp = en_core_web_sm.load()

In [None]:
def normalize_comment(comment):
    comment = unidecode(comment)
    comment = comment[:length_threshold]
    
    # Replace known asterisk patterns
    for w in asterisk_words:
        if w[0] in comment:
            comment = comment.replace(w[0], w[1])
        if w[0].upper() in comment:
            comment = comment.replace(w[0].upper(), w[1].upper())

    normalized_words = []
    for word in word_tokenize(comment):
        word = replace_url(word)
        if word.count(".") == 1:
            word = word.replace(".", " ")
        filtered_word = "".join([x for x in word if x in valid_set])

        # Split toxic words inside larger tokens
        normalized_word = split_word(filtered_word, toxic_words)

        # Apply multiple dictionary normalizations
        normalized_word = normalize_by_dictionary(
        normalized_word, hyphens_dict)
        normalized_word = normalize_by_dictionary(
        normalized_word, merged_dict)
        normalized_word = normalize_by_dictionary(
        normalized_word, misspellings_dict)
        normalized_word = normalize_by_dictionary(
        normalized_word, fasttext_misspelings)
        normalized_words.append(normalized_word)
    
    # Convert words to lowercase unless fully uppercase
    normalized_comment = " ".join(normalized_words)
    result = []
    for word in normalized_comment.split():
        if word.upper() == word:
            result.append(word)
        else:
            result.append(word.lower())
    
    result = " ".join(result)
    # Merge certain specific words
    if "sock puppet" in result:
    result = result.replace("sock puppet", "sockpuppet")
    if "SOCK PUPPET" in result:
    result = result.replace("SOCK PUPPET", "SOCKPUPPET")
        
    return result

In [None]:
def read_data_files(train_filepath, test_filepath):
    # read train data
    train = pd.read_csv(train_filepath)
    labels = train[categories].values

    # read test data
    test = pd.read_csv(test_filepath)
    test_comments = test["comment_text"].fillna("_na_").values

    # normalize comments
    np_normalize = np.vectorize(normalize_comment)
    comments = train["comment_text"].fillna("_na_").values
    normalized_comments = np_normalize(comments)
    del comments
    gc.collect()

    comments = test["comment_text"].fillna("_na_").values
    normalized_test_comments = np_normalize(test_comments)
    del comments
    gc.collect()

    print('Shape of data tensor:', normalized_comments.shape)
    print('Shape of label tensor:', labels.shape)
    print('Shape of test data tensor:', normalized_test_comments.shape)

    return (labels, normalized_comments, normalized_test_comments)

labels, x_train, x_test = read_data_files(train_filepath, test_filepath)

NameError: name 'normalize_comment' is not defined

In [None]:
np.save("../cleaned_data/lables", labels)
np.save("../cleaned_data/x_train", x_train)
np.save("../cleaned_data/x_test", x_test)

## Text classification with RNNs

In [None]:
import os, math, operator, csv, random, pickle, re
import pandas as pd
import numpy as np
import gc
import tensorflow as tf

print(tf.__version__)
tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)

from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    MaxPooling1D, BatchNormalization, Permute, Lambda, Activation, Conv1D,
    GlobalAveragePooling1D, GlobalMaxPooling1D, Dense, Embedding, Dropout,
    Input, Flatten, TimeDistributed, concatenate, SpatialDropout1D,
    Bidirectional, LSTM, GRU, add
)
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend as K
from nltk.tokenize import TweetTokenizer
from unidecode import unidecode
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [None]:
# Load all the preprocessed data as numpy text arrays.
labels = np.load('../input/labels.npy')
x_train = np.load('../input/x_train.npy')
x_test = np.load('../input/x_test.npy')
fileObject = open('../dictionaries/tokenizer','rb')
tokenizer = pickle.load(fileObject)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [None]:
# Load the dual embeddings matrix:
embedding_matrix = np.load('../embeddings/embedding_matrix_big.npy')

In [None]:
# split the train data into the train and validation sets
x_train, x_valid, y_train, y_valid = train_test_split(
x_train, labels, test_size = 0.1)

In [None]:
# Define the Keras model
def build_model(embedding_matrix):
    words = Input(shape=(None,))
    
    # Embedding Layer
    x = Embedding(*embedding_matrix.shape,
                  weights=[embedding_matrix],
                  trainable=False)(words)
    x = SpatialDropout1D(0.2)(x)
    
    # Recurrent Layers
    x = Bidirectional(GRU(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
    
    # Pooling & Concatenation
    hidden = concatenate([
        GlobalMaxPooling1D()(x),
        GlobalAveragePooling1D()(x),
    ])
    
    # Residual Connections (Skip Connections)
    hidden = add([
        hidden,
        Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)
    ])
    hidden = add([
        hidden,
        Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)
    ])
    
    # Output Layer
    result = Dense(6, activation='sigmoid')(hidden)
    
    model = Model(inputs=words, outputs=result)
    model.compile(loss='binary_crossentropy', optimizer='adam')
    
    return model

In [None]:
# Train the model and make predictions on the test set.
# In order to improve performance we use a 10 seed average.
EPOCHS = 5
SEEDS = 10
pred = 0
for ii in range(SEEDS):
    model = build_model(embedding_matrix)
    for global_epoch in range(EPOCHS):
        print(global_epoch)
        model.fit(
            x_train,
            y_train,
            validation_data = (x_valid, y_valid),
            batch_size=128,
            epochs=1,
            verbose=2,
            callbacks=[LearningRateScheduler(
                lambda _: 1e-3 * (0.55 ** global_epoch))
            ]
        )
        val_preds = model.predict(x_valid)
        AUC = 0
        for i in range(6):
            AUC += roc_auc_score(y_valid[:,i], val_preds[:,i])/6.
        print(AUC)

pred += model.predict(x_test, batch_size = 1024, verbose = 1)/SEEDS

In [None]:
# We create the submission file
list_classes = ["toxic", "severe_toxic", "obscene",
"threat", "insult", "identity_hate"]
submission = pd.read_csv('../input/sample_submission.csv')
submission[list_classes] = pred
submission.to_csv('../submissions/submission.csv', index=False)
submission.head()
# This model scores 0.98644 on the Private Leaderboard, and 0.98653 on the public leaderboard

## Text classification with DistilBERT

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
MAX_LEN = 320
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 2
LEARNING_RATE = 1e-05
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

In [None]:
train_data = pd.read_csv('../input/train.csv.zip')
label_columns = ["toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate"]

train_data['labels'] = train_data[label_columns].apply(lambda x: list(x), axis=1)
train_data.drop(['id'], inplace=True, axis=1)
train_data.drop(label_columns, inplace=True, axis=1)
print(train_data.head())

In [None]:
class MultiLabelDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, new_data=False):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.comment_text
        self.new_data = new_data

        if not new_data:
            self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        out = {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids,
            dtype=torch.long),
        }
        if not self.new_data:
            out['targets'] = torch.tensor(self.targets[index],
                dtype=torch.float)
        return out

In [None]:
train_size = 1.0

train_df = train_data.sample(frac=train_size, random_state=123)
val_df = train_data.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)

print("Orig Dataset: {}".format(train_data.shape))
print("Training Dataset: {}".format(train_df.shape))
print("Validation Dataset: {}".format(val_df.shape))

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased', truncation=True, do_lower_case=True)

training_set = MultiLabelDataset(train_df, tokenizer, MAX_LEN)
val_set = MultiLabelDataset(val_df, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
    'shuffle': True,
    'num_workers': 8}

val_params = {'batch_size': VALID_BATCH_SIZE,
    'shuffle': False,
    'num_workers': 8}

training_loader = DataLoader(training_set, **train_params)
#val_loader = DataLoader(val_set, **val_params)

In [None]:
class DistilBERTClass(torch.nn.Module):

    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(768, 6)
        )

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.bert(input_ids=input_ids,
            attention_mask=attention_mask)
        hidden_state = output_1[0]
        out = hidden_state[:, 0] # [CLS] token representation
        out = self.classifier(out)
        return out

In [None]:
model = DistilBERTClass()
model.to(DEVICE)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    model.train()

    for _, data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(DEVICE, dtype=torch.long)
        mask = data['mask'].to(DEVICE, dtype=torch.long)
        token_type_ids = data[
            'token_type_ids'].to(DEVICE, dtype=torch.long)
        targets = data['targets'].to(DEVICE, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = torch.nn.functional.binary_cross_entropy_with_logits(
            outputs, targets)
        
        if _ % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')

        loss.backward()
        optimizer.step()

for epoch in range(EPOCHS):
    train(epoch)

In [None]:
test_data = pd.read_csv('../input/test.csv.zip')
print(test_data.head())

test_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN, new_data=True)
test_loader = DataLoader(test_set, **val_params)

all_test_pred = []

In [None]:
def test(epoch):
    model.eval()

    with torch.inference_mode():
        for _, data in tqdm(enumerate(test_loader, 0)):
        ids = data['ids'].to(DEVICE, dtype=torch.long)
        mask = data['mask'].to(DEVICE, dtype=torch.long)
        token_type_ids = data[
            'token_type_ids'].to(DEVICE, dtype=torch.long)
        outputs = model(ids, mask, token_type_ids)
        probas = torch.sigmoid(outputs)

        all_test_pred.append(probas)

    return probas

probas = test(model)

In [None]:
all_test_pred = torch.cat(all_test_pred)

submit_df = test_data.copy()
submit_df.drop("comment_text", inplace=True, axis=1)

label_columns = ["toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate"]

for i,name in enumerate(label_columns):
    submit_df[name] = all_test_pred[:, i].cpu()

submit_df.to_csv('../submissions/distilbert_0.csv', index=False)

submit_df.head()

## Text classification with AutoTrain

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
HF_USERNAME = "tunguz"
HF_TOKEN = user_secrets.get_secret("HUGGING_FACE_HUB_TOKEN")

from autotrain.params import TextClassificationParams
from autotrain.project import AutoTrainProject

import torch
from sklearn import model_selection, metrics
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer, TrainingArguments, Trainer
)

In [None]:
train = pd.read_csv(‘../input/toxic-train/train.csv’)
test = pd.read_csv('../input/toxic-train/test.csv')
print(train.head())

In [None]:
params = TextClassificationParams(
    model="google-bert/bert-base-uncased",
    data_path="../input/toxic-train/",
    text_column="comment_text",
    target_column="toxic",
    train_split="train",
    valid_split=None,
    epochs=3,
    batch_size=8,
    max_seq_length=512,
    lr=1e-5,
    optimizer="adamw_torch",
    scheduler="linear",
    gradient_accumulation=1,
    mixed_precision="fp16",
    project_name="autotrain-model",
    log="tensorboard",
    push_to_hub=False,
    username=HF_USERNAME,
    token=HF_TOKEN,
)

In [None]:
project = AutoTrainProject(params=params, backend="local", process=True)
project.create()

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "../input/toxic-autotrain-toxic/autotrain-model", use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(
    "../input/toxic-autotrain-toxic/autotrain-model")

In [None]:
test.loc[:, "toxic"] = 0
test.loc[:, "severe_toxic"] = 0
test.loc[:, "obscene"] = 0
test.loc[:, "threat"] = 0
test.loc[:, "insult"] = 0
test.loc[:, "identity_hate"] = 0
print(test.head())

In [None]:
class ClassificationDataset:

    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, item):
        text = str(self.data["comment_text"].values[item])
        target = int(self.data["toxic"].values[item])
        inputs = self.tokenizer(
            text,
            max_length=512,
            padding="max_length",
            truncation=True
        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        return {
            "input_ids": torch.tensor(ids, dtype=torch.long),
            "attention_mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(target, dtype=torch.long),
        }

In [None]:
dataset = ClassificationDataset(test, tokenizer)
trainer = Trainer(model)
preds = trainer.predict(dataset).predictions
# Preds will be in the form of logits,
# and need to be converted into probabilities before submission.

## OpenAI embeddings

In [None]:
import pandas as pd
import numpy as np
import os
import openai
from openai import OpenAI

client = OpenAI()

In [None]:
def get_embedding(text, model="text-embedding-3-large"):
    text = text.replace("\n", " ")
    return client.embeddings.create(
    input = [text], model=model
    ).data[0].embedding

In [None]:
train = pd.read_csv('../input/train.csv.zip').fillna(' ')[['comment_text']]
test = pd.read_csv('../input/test.csv.zip').fillna(' ')[['comment_text']]

In [None]:
test.at[9932, 'comment_text'] = '*'
test.at[55331, 'comment_text'] = '*'
test.at[97708, 'comment_text'] = '*'

In [None]:
train['embedding_3_large'] = train.comment_text.apply(
    lambda x: get_embedding(x, model='text-embedding-3-large'))

test['embedding_3_large'] = test.comment_text.apply(
    lambda x: get_embedding(x, model='text-embedding-3-large'))

In [None]:
train_embeds = np.array(
    [np.array(i) for i in train.embedding_3_large.values])

test_embeds = np.array(
    [np.array(i) for i in test.embedding_3_large.values])

In [None]:
np.save('../input/test_embs_3_large', test_embeds)
np.save('../input/train_embs_3_large', train_embeds)

## NVIDIA embeddings

In [None]:
from transformers import AutoModel
import pandas as pd
import numpy as np
model = AutoModel.from_pretrained('nvidia/NV-Embed-v2', trust_remote_code=True)

In [None]:
def get_embedding(text):
    text = text.replace("\n", " ")
    return model.encode([text], instruction=passage_prefix,
        max_length=max_length)[0]

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

class_names = ['toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate']
target = pd.read_csv('../input/train.csv.zip').fillna(' ')[class_names].values
train_features = np.load('../input/train_embs_NV_2.npy')
test_features = np.load('../input/test_embs_NV_2.npy')
sample_submission = pd.read_csv('../input/sample_submission.csv.zip')

In [None]:
preds = np.zeros((test_features.shape[0], target.shape[1]))

In [None]:
Cs = [4, 1, 4, 3, 2, 2]

In [None]:
errors = []
train_oof = np.zeros(target.shape)
kf = KFold(n_splits=5, random_state=137, shuffle=True)

In [None]:
for ii in range(6):
    print("Fitting target", ii+1)
    for jj, (
        train_index, val_index) in enumerate(kf.split(train_features)
        ):
        print("Fitting fold", jj+1)
        train_x = train_features[train_index]
        val_x = train_features[val_index]
        train_target = target[train_index, ii]
        classifier = LogisticRegression(C=Cs[ii], solver='sag', max_iter=10)
        classifier.fit(train_x, train_target)

        train_oof[val_index, ii] = classifier.predict_proba(val_x)[:,1]
        preds[:, ii] += classifier.predict_proba(test_features)[:,1]/5
        train_target = target[train_index, ii]

        print(roc_auc_score(target[:,ii], train_oof[:,ii]))
        errors.append(roc_auc_score(target[:,ii], train_oof[:,ii]))

In [None]:
sample_submission[class_names] = preds
sample_submission.to_csv('../input/NV_2_LR.csv', index=False)