# Preface

# Imports & Utility functions

In [None]:
import numpy as np
import pandas as pd
import os
import time
import gc
import random
from tqdm._tqdm_notebook import tqdm_notebook as tqdm
from keras.preprocessing import text, sequence
import torch
from torch import nn
from torch.utils import data
from torch.nn import functional as F

Using TensorFlow backend.


In [None]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [None]:
CRAWL_EMBEDDING_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
GLOVE_EMBEDDING_PATH = '../input/glove840b300dtxt/glove.840B.300d.txt'
NUM_MODELS = 2
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 4 * LSTM_UNITS
MAX_LEN = 220

In [None]:
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path) as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in tqdm(f))

def build_matrix(word_index, path):
    embedding_index = load_embeddings(path)
    embedding_matrix = np.zeros((len(word_index) + 1, 300))
    unknown_words = []

    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding_index[word]
        except KeyError:
            unknown_words.append(word)
    return embedding_matrix, unknown_words

In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def train_model(model, train, test, loss_fn, output_dim, lr=0.001,
                batch_size=512, n_epochs=4,
                enable_checkpoint_ensemble=True):
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    all_test_preds = []
    checkpoint_weights = [2 ** epoch for epoch in range(n_epochs)]

    for epoch in range(n_epochs):
        start_time = time.time()

        scheduler.step()

        model.train()
        avg_loss = 0.

        for data in tqdm(train_loader, disable=False):
            x_batch = data[:-1]
            y_batch = data[-1]

            y_pred = model(*x_batch)
            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        model.eval()
        test_preds = np.zeros((len(test), output_dim))

        for i, x_batch in enumerate(test_loader):
            y_pred = sigmoid(model(*x_batch).detach().cpu().numpy())

            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred

        all_test_preds.append(test_preds)
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t time={:.2f}s'.format(
              epoch + 1, n_epochs, avg_loss, elapsed_time))

    if enable_checkpoint_ensemble:
        test_preds = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
    else:
        test_preds = all_test_preds[-1]

    return test_preds

In [None]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)    # (N, T, 1, K)
        x = x.permute(0, 3, 2, 1)  # (N, K, 1, T)
        x = super(SpatialDropout, self).forward(x)  # (N, K, 1, T), some features are masked
        x = x.permute(0, 3, 2, 1)  # (N, T, 1, K)
        x = x.squeeze(2)  # (N, T, K)
        return x

class NeuralNet(nn.Module):
    def __init__(self, embedding_matrix, num_aux_targets):
        super(NeuralNet, self).__init__()
        embed_size = embedding_matrix.shape[1]

        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.embedding_dropout = SpatialDropout(0.3)

        self.lstm1 = nn.LSTM(embed_size, LSTM_UNITS, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(LSTM_UNITS * 2, LSTM_UNITS, bidirectional=True, batch_first=True)

        self.linear1 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)
        self.linear2 = nn.Linear(DENSE_HIDDEN_UNITS, DENSE_HIDDEN_UNITS)

        self.linear_out = nn.Linear(DENSE_HIDDEN_UNITS, 1)
        self.linear_aux_out = nn.Linear(DENSE_HIDDEN_UNITS, num_aux_targets)

    def forward(self, x):
        h_embedding = self.embedding(x)
        h_embedding = self.embedding_dropout(h_embedding)

        h_lstm1, _ = self.lstm1(h_embedding)
        h_lstm2, _ = self.lstm2(h_lstm1)

        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1)
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1)

        h_conc = torch.cat((max_pool, avg_pool), 1)
        h_conc_linear1  = F.relu(self.linear1(h_conc))
        h_conc_linear2  = F.relu(self.linear2(h_conc))

        hidden = h_conc + h_conc_linear1 + h_conc_linear2

        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 1)

        return out

In [None]:
def preprocess(data):
    '''
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
    '''
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    def clean_special_chars(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text

    data = data.astype(str).apply(lambda x: clean_special_chars(x, punct))
    return data

# Preprocessing

In [None]:
import pandas as pd
train = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test = pd.read_csv('../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

x_train = preprocess(train['comment_text'])
y_train = np.where(train['target'] >= 0.5, 1, 0)
y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
x_test = preprocess(test['comment_text'])

In [None]:
max_features = None

In [None]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(list(x_train) + list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

In [None]:
max_features = max_features or len(tokenizer.word_index) + 1
max_features

327009

In [None]:
crawl_matrix, unknown_words_crawl = build_matrix(tokenizer.word_index, CRAWL_EMBEDDING_PATH)
print('n unknown words (crawl): ', len(unknown_words_crawl))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


n unknown words (crawl):  173678


In [None]:
glove_matrix, unknown_words_glove = build_matrix(tokenizer.word_index, GLOVE_EMBEDDING_PATH)
print('n unknown words (glove): ', len(unknown_words_glove))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


n unknown words (glove):  170383


In [None]:
embedding_matrix = np.concatenate([crawl_matrix, glove_matrix], axis=-1)
embedding_matrix.shape

del crawl_matrix
del glove_matrix
gc.collect()

0

In [None]:
x_train_torch = torch.tensor(x_train, dtype=torch.long).cuda()
x_test_torch = torch.tensor(x_test, dtype=torch.long).cuda()
y_train_torch = torch.tensor(np.hstack([y_train[:, np.newaxis], y_aux_train]), dtype=torch.float32).cuda()

# Training

In [None]:
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
test_dataset = data.TensorDataset(x_test_torch)

all_test_preds = []

for model_idx in range(NUM_MODELS):
    print('Model ', model_idx)
    seed_everything(1234 + model_idx)

    model = NeuralNet(embedding_matrix, y_aux_train.shape[-1])
    model.cuda()

    test_preds = train_model(model, train_dataset, test_dataset, output_dim=y_train_torch.shape[-1],
                             loss_fn=nn.BCEWithLogitsLoss(reduction='mean'))
    all_test_preds.append(test_preds)
    print()

Model  0


HBox(children=(IntProgress(value=0, max=3526), HTML(value='')))


Epoch 1/4 	 loss=0.1094 	 time=1032.40s


HBox(children=(IntProgress(value=0, max=3526), HTML(value='')))


Epoch 2/4 	 loss=0.1034 	 time=1037.33s


HBox(children=(IntProgress(value=0, max=3526), HTML(value='')))


Epoch 3/4 	 loss=0.1019 	 time=1037.52s


HBox(children=(IntProgress(value=0, max=3526), HTML(value='')))


Epoch 4/4 	 loss=0.1010 	 time=1037.62s

Model  1


HBox(children=(IntProgress(value=0, max=3526), HTML(value='')))


Epoch 1/4 	 loss=0.1096 	 time=1035.39s


HBox(children=(IntProgress(value=0, max=3526), HTML(value='')))


Epoch 2/4 	 loss=0.1035 	 time=1036.52s


HBox(children=(IntProgress(value=0, max=3526), HTML(value='')))

KeyboardInterrupt: 

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, data_loader, tokenizer, loss_fn=None):

    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            x_batch = batch[:-1]  # Extract inputs
            y_batch = batch[-1]  # Extract labels

            # Forward pass to get predictions
            y_pred = model(*x_batch)
            y_pred = sigmoid(y_pred.detach().cpu().numpy())  # Apply sigmoid to get probabilities
            y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions

            all_preds.append(y_pred_binary[:, 0])  # Append predictions
            all_labels.append(y_batch[:, 0].cpu().numpy())  # Append true labels

    # Concatenate all predictions and labels
    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    # Compute evaluation metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    metrics = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }

    # Print metrics
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    return metrics


test_loader = torch.utils.data.DataLoader(
    data.TensorDataset(x_test_torch, y_train_torch[:, 0]),
    batch_size=512,
    shuffle=False
)

metrics = evaluate_model(model, test_loader, tokenizer)


Evaluation Metrics:
Accuracy: 0.89
Precision: 0.86
Recall: 0.84
F1-Score: 0.85


In [None]:
Ndef save_model(model, file_name='/kaggle/working/model.pth'):
    torch.save(model.state_dict(), file_name)
save_model(model)

In [None]:
# After training each model, save it to /kaggle/working/
for model_idx in range(NUM_MODELS):`
    model_path = f"/kaggle/working/model_{model_idx}.pth"
    torch.save(model.state_dict(), model_path)
    print(f"Model {model_idx} saved to {model_path}")


In [None]:
def load_model(model_path, embedding_matrix, num_aux_targets):
    model = NeuralNet(embedding_matrix, num_aux_targets)
    model.load_state_dict(torch.load(model_path))
    model.eval()  # Set the model to evaluation mode
    return model

# Load a specific model
model_idx = 0  # Load model_0 as an example
model_path = f"/kaggle/input/modelb/pytorch/default/1/model.pth"
model = load_model(model_path, embedding_matrix, y_aux_train.shape[-1])
model.cuda()  # Move model to GPU if available

# Function to preprocess and predict a sentence
from keras.preprocessing.sequence import pad_sequences  # Ensure this import is present

def predict_sentence(sentence, model, tokenizer, max_len=MAX_LEN):
    # Preprocess the sentence
    sentence = preprocess(pd.Series([sentence]))
    sequence = tokenizer.texts_to_sequences(sentence)

    # Pad the sequence correctly using keras' pad_sequences
    sequence = pad_sequences(sequence, maxlen=max_len)  # Correct usage here
    sequence = torch.tensor(sequence, dtype=torch.long).cuda()  # Convert to tensor and move to GPU

    # Run the model to get the predictions
    with torch.no_grad():
        preds = model(sequence)
        preds = sigmoid(preds.cpu().numpy())

    return preds



# Example sentence for prediction
def label_predictions(prediction):
    # Extract individual values for readability
    toxicity = prediction[0][0]
    severe_toxicity = prediction[0][1]
    obscene = prediction[0][2]
    identity_attack = prediction[0][3]
    insult = prediction[0][4]
    threat = prediction[0][5]

    # Print each prediction with labels
    print(f"Toxicity: {toxicity}")
    print(f"Severe Toxicity: {severe_toxicity}")
    print(f"Obscene: {obscene}")
    print(f"Identity Attack: {identity_attack}")
    print(f"Insult: {insult}")
    print(f"Threat: {threat}")

# Example sentence for prediction
sentence = "Vishav is a piece of shit."
prediction = predict_sentence(sentence, model, tokenizer)
label_predictions(prediction)

Toxicity: 0.9995326995849609
Severe Toxicity: 0.9278066754341125
Obscene: 0.152074933052063
Identity Attack: 0.8808927536010742
Insult: 0.024855386465787888
Threat: 0.7616588473320007


In [None]:
import pickle

# Save the tokenizer to a file
with open('/kaggle/working/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("Tokenizer saved as 'tokenizer.pkl'")


Tokenizer saved as 'tokenizer.pkl'


In [None]:
# Save the embedding matrix to a file (e.g., as a .npy file)
np.save('/kaggle/working/embedding_matrix.npy', embedding_matrix)

print("Embedding matrix saved successfully!")


Embedding matrix saved successfully!


In [None]:
# Load the embedding matrix from the saved .npy file
import numpy as np
embedding_matrix = np.load('/kaggle/input/emd/other/default/1/embedding_matrix.npy')

print("Embedding matrix loaded successfully!")


Embedding matrix loaded successfully!
