In [1]:
import torch
import torch.nn as nn

class ToxicWordClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, dropout_rate=0.2):
        super(ToxicWordClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)  # Apply dropout to the embedded input
        output = self.fc(embedded)
        output = self.sigmoid(output)
        return output

In [2]:
# Load toxic and non-toxic words from external text files
toxic_words = [line.strip() for line in open('/Users/damirabdulaev/Downloads/toxic_words.txt', 'r', encoding='utf-8')]
non_toxic_words = [line.strip() for line in open('/Users/damirabdulaev/Downloads/positive-words.txt', 'r', encoding='utf-8')]
all_words = toxic_words + non_toxic_words
labels = [1] * len(toxic_words) + [0] * len(non_toxic_words)

In [3]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize and convert your sentences to model input
sentences = all_words  # Replace with your list of sentences

# Tokenize and convert sentences to input indices
input_ids = []
attention_masks = []

for sentence in sentences:
    # Tokenize the sentence and add special tokens
    encoded_dict = tokenizer(
        sentence,
        add_special_tokens=False,
        truncation=True,
        max_length=1,
        padding='max_length',
        return_tensors='pt'
    )

    # Extract the input IDs and attention mask
    input_ids.append(encoded_dict['input_ids'])

# Convert the lists of tensors to a single tensor
word_indices = torch.cat(input_ids, dim=0)

In [4]:
print(word_indices)

tensor([[ 1018],
        [ 1019],
        [ 1019],
        ...,
        [28672],
        [27838],
        [14101]])


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Create a PyTorch model
vocab_size = len(tokenizer.vocab)  # Assuming you've defined 'vocabulary'
embedding_dim = 100  # Adjust as needed
output_dim = 1  # Assuming binary classification

model = ToxicWordClassifier(vocab_size, embedding_dim, output_dim)

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert labels to tensors
labels = torch.tensor(labels, dtype=torch.float, requires_grad=True)

# Training loop
num_epochs = 10  # Specify the number of training epochs

for epoch in range(num_epochs):
    total_loss = 0.0
    total_correct = 0
    total_samples = len(labels)

    # Wrap your training data with tqdm for the progress bar
    for indices, label in tqdm(zip(word_indices, labels), total=len(labels), desc=f'Epoch {epoch + 1}'):
        optimizer.zero_grad()
        inputs = torch.tensor(indices, dtype=torch.long)

        # Forward pass
        outputs = model(inputs)[0][0]

        # Calculate the loss
        loss = criterion(outputs, label)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        predicted = (outputs > 0.5).float()

        # Compute accuracy
        correct = (predicted == label).float()
        total_correct += correct.sum().item()
        total_loss += loss.item()

    # Calculate average loss and accuracy for the epoch
    avg_loss = total_loss / total_samples
    accuracy = (total_correct / total_samples) * 100.0

    print(f"Epoch [{epoch + 1}/{num_epochs}] - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

print("Training complete")

  inputs = torch.tensor(indices, dtype=torch.long)
Epoch 1:   1%|          | 83/7240 [00:01<01:43, 69.04it/s]


KeyboardInterrupt: 

In [7]:
torch.save(model.state_dict(), 'twc.pth')

In [7]:
# Initialize the model
model = ToxicWordClassifier(vocab_size, embedding_dim, output_dim)

# Load the saved model state_dict
model.load_state_dict(torch.load('twc.pth'))

# Put the model in evaluation mode
model.eval()

ToxicWordClassifier(
  (embedding): Embedding(30522, 100)
  (fc): Linear(in_features=100, out_features=1, bias=True)
  (sigmoid): Sigmoid()
  (dropout): Dropout(p=0.2, inplace=False)
)

In [8]:
import pandas as pd

dataset_path = '/Users/damirabdulaev/Downloads/filtered.tsv'
df = pd.read_csv(dataset_path, sep='\t')
df.head(5)

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [9]:
df = df[(df['similarity'] > 0.8) & (df['ref_tox'] > df['trn_tox'])]
df.head(5)

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
22,22,"Real life starts the first time you fuck, kid.","boy, real life starts up first.",0.866697,0.319149,0.998222,0.000114
29,29,"Hey, leave the poor bastard alone!",leave the poor man alone!,0.857554,0.257143,0.999382,0.000578
41,41,It told you this was a waste of my fucking time.,I told you this was a waste of my time.,0.904062,0.183673,0.995877,0.000479
43,43,"I swear to God, the best thing I ever did in m...","I swear to God, the best thing I've ever done ...",0.932305,0.022472,0.999071,0.0009
48,48,The guy is totally irresponsible.,he's completely irresponsible.,0.862266,0.088235,0.64368,0.019941


In [10]:
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/damirabdulaev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
import string
# Extract and process the first 5 sentences
sentences = df['reference'].tolist()

# Define a threshold for toxic word prediction (adjust as needed)
toxic_threshold = 0.7
temp = 0

# Define a set of punctuation marks
punctuation_set = set(string.punctuation)

model.eval()

# Process and replace toxic words
for sentence in tqdm(sentences):
    encoded_dict = tokenizer(sentence)
    tokens = encoded_dict['input_ids']
    masked_sentence = []

    # Initialize a flag to exclude special tokens
    exclude_special_tokens = True

    for token in tokens:
        # Check if the token is a special token (CLS or SEP)
        is_special_token = token in (tokenizer.cls_token_id, tokenizer.sep_token_id)

        if exclude_special_tokens and is_special_token:
            continue  # Skip special tokens
        else:
            # Convert the token to an index using your vocabulary mapping
            inputs = torch.tensor([token], dtype=torch.long)
            with torch.no_grad():
                outputs = model(inputs)
                predicted_prob = outputs.item()
                # Check if the token is a punctuation mark
                is_punctuation = tokenizer.convert_ids_to_tokens(token) in punctuation_set
                if predicted_prob > toxic_threshold and not is_punctuation:
                    masked_sentence.append(103) # mask token
                else:
                    masked_sentence.append(token)

    # Use tokenizer.decode to reconstruct the sentence
    reconstructed_sentence = tokenizer.decode(masked_sentence)

    # Print the original sentence and the reconstructed sentence
    print("Original Sentence:")
    print(sentence)
    print("Reconstructed Sentence:")
    print(reconstructed_sentence)

    temp += 1
    if temp > 10:
        break

  0%|          | 10/108569 [00:00<00:55, 1971.93it/s]

Original Sentence:
Real life starts the first time you fuck, kid.
Reconstructed Sentence:
real life starts the first time you [MASK], kid.
Original Sentence:
Hey, leave the poor bastard alone!
Reconstructed Sentence:
hey, [MASK] the [MASK] [MASK] alone!
Original Sentence:
It told you this was a waste of my fucking time.
Reconstructed Sentence:
[MASK] told you this [MASK] a [MASK] [MASK] [MASK] [MASK] time.
Original Sentence:
I swear to God, the best thing I ever did in my life was save that little son of a bitch
Reconstructed Sentence:
i [MASK] to god, the best thing i ever [MASK] in [MASK] life [MASK] save that [MASK] [MASK] [MASK] a [MASK]
Original Sentence:
The guy is totally irresponsible.
Reconstructed Sentence:
the [MASK] is totally irrespon [MASK].
Original Sentence:
"Do you want to die?" he said.
Reconstructed Sentence:
" [MASK] you [MASK] to [MASK]? " [MASK] said.
Original Sentence:
Does anal...
Reconstructed Sentence:
does [MASK]...
Original Sentence:
Your girlfriends are dea




In [12]:
def get_non_toxic_synonym(word):
    synonyms = wordnet.synsets(word)
    if synonyms:
        valid_synonyms = [synonym.lemmas()[0].name() for synonym in synonyms if synonym.lemmas()[0].name() != word and synonym.lemmas()[0].name() != '[UNK]']
        if valid_synonyms:
            return valid_synonyms[0]
    return word  # If no valid synonyms found, return the original word

In [13]:
import string
import nltk
from nltk.corpus import wordnet

# Extract and process the first 5 sentences
sentences = df['reference'].tolist()
recon = []

# Define a threshold for toxic word prediction (adjust as needed)
toxic_threshold = 0.7
temp = 0

# Define a set of punctuation marks
punctuation_set = set(string.punctuation)

model.eval()

# Process and replace toxic words with synonyms
for sentence in tqdm(sentences):
    encoded_dict = tokenizer(sentence)
    tokens = encoded_dict['input_ids']
    replaced_sentence = []

    # Initialize a flag to exclude special tokens
    exclude_special_tokens = True

    for token in tokens:
        # Check if the token is a special token (CLS or SEP)
        is_special_token = token in (tokenizer.cls_token_id, tokenizer.sep_token_id)

        if exclude_special_tokens and is_special_token:
            continue  # Skip special tokens
        else:
            # Convert the token to an index using your vocabulary mapping
            inputs = torch.tensor([token], dtype=torch.long)
            with torch.no_grad():
                outputs = model(inputs)
                predicted_prob = outputs.item()
                # Check if the token is a punctuation mark
                is_punctuation = tokenizer.convert_ids_to_tokens(token) in punctuation_set
                if predicted_prob > toxic_threshold and not is_punctuation:
                    # Replace toxic word with a non-toxic synonym
                    word = tokenizer.convert_ids_to_tokens(token)
                    non_toxic_synonym = get_non_toxic_synonym(word)
                    if non_toxic_synonym in tokenizer.vocab:
                        for element in tokenizer(non_toxic_synonym)['input_ids'][1:-1]:
                            replaced_sentence.append(element)
                    else:
                        replaced_sentence.append(token)
                else:
                    replaced_sentence.append(token)

    # Use tokenizer.decode to reconstruct the sentence
    reconstructed_sentence = tokenizer.decode(replaced_sentence)

    # Print the original sentence and the reconstructed sentenc
    recon.append((sentence, reconstructed_sentence))

100%|██████████| 108569/108569 [01:04<00:00, 1684.42it/s]


In [14]:
import pickle
# Specify the filename for saving the list
file_name = 'recon.pkl'

# Save the list to a file using pickle
with open(file_name, 'wb') as file:
    pickle.dump(recon, file)

In [15]:
# Specify the filename from which to load the list
file_name = 'recon.pkl'

recon = []

# Load the list from the file using pickle
with open(file_name, 'rb') as file:
    recon = pickle.load(file)

In [16]:
for i, rec in enumerate(recon):
    print("Original sentence:", rec[0])
    print("Non-toxic sentence:", rec[1])
    if i == 10:
        break

Original sentence: Real life starts the first time you fuck, kid.
Non-toxic sentence: real life starts the first time you fuck, kid.
Original sentence: Hey, leave the poor bastard alone!
Non-toxic sentence: hey, farewell the poor asshole alone!
Original sentence: It told you this was a waste of my fucking time.
Non-toxic sentence: it told you this was a waste of my fuck time.
Original sentence: I swear to God, the best thing I ever did in my life was save that little son of a bitch
Non-toxic sentence: i curse to god, the best thing i ever make in my life was save that small son of a bitch
Original sentence: The guy is totally irresponsible.
Non-toxic sentence: the guy is totally irrespon # # sible.
Original sentence: "Do you want to die?" he said.
Non-toxic sentence: " bash you want to fail? " helium said.
Original sentence: Does anal...
Non-toxic sentence: does anal...
Original sentence: Your girlfriends are dead.
Non-toxic sentence: your girlfriends are dead.
Original sentence: Mikae

In [17]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/damirabdulaev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/damirabdulaev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Extract the pairs of sentences from the tuples
original = [sentence[0] for sentence in recon[:1000]]
detox = [sentence[1] for sentence in recon[:1000]]

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine the sentences for each array
original_sentences = [' '.join(sentence.split()) for sentence in original]
detox_sentences = [' '.join(sentence.split()) for sentence in detox]

# Fit and transform the sentences to TF-IDF vectors
original_tfidf = tfidf_vectorizer.fit_transform(original_sentences)
detox_tfidf = tfidf_vectorizer.transform(detox_sentences)

# Calculate cosine similarity for the corresponding sentences
cosine_similarities = cosine_similarity(original_tfidf, detox_tfidf)

cosine_similarity_sum = 0
for i, sim in enumerate(cosine_similarities):
    cosine_similarity_sum += sim[i]

print("Mean cosine similarity of the original and detox sentences:", cosine_similarity_sum / len(cosine_similarities))

Mean cosine similarity of the original and detox sentences: 0.8296287564356902


In [19]:
from J_metric import J

result = J(original, detox)  # Call the J function with appropriate arguments

Calculating style of predictions


Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1000/1000 [00:29<00:00, 33.99it/s]


0.34436827139987874
Calculate the semantic similarity
0.8177248357832432
Calculating CoLA acceptability stats


  0%|          | 0/1000 [00:00<?, ?it/s]

0.9271976281367242
| ACC | SIM | FL | J |

|0.3444|0.8177|0.9272|0.2611|

0.2610974503268504
