In [1]:
import torch
import torch.nn as nn

class ToxicWordClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, dropout_rate=0.2):
        super(ToxicWordClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc = nn.Linear(embedding_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)  # Apply dropout to the embedded input
        output = self.fc(embedded)
        output = self.sigmoid(output)
        return output

In [2]:
# Load toxic and non-toxic words from external text files
toxic_words = [line.strip() for line in open('/Users/damirabdulaev/Downloads/toxic_words.txt', 'r', encoding='utf-8')]
non_toxic_words = [line.strip() for line in open('/Users/damirabdulaev/Downloads/positive-words.txt', 'r', encoding='utf-8')]
all_words = toxic_words + non_toxic_words
labels = [1] * len(toxic_words) + [0] * len(non_toxic_words)

In [3]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize and convert your sentences to model input
sentences = all_words  # Replace with your list of sentences

# Tokenize and convert sentences to input indices
input_ids = []
attention_masks = []

for sentence in sentences:
    # Tokenize the sentence and add special tokens
    encoded_dict = tokenizer(
        sentence,
        add_special_tokens=False,
        truncation=True,
        max_length=1,
        padding='max_length',
        return_tensors='pt'
    )

    # Extract the input IDs and attention mask
    input_ids.append(encoded_dict['input_ids'])

# Convert the lists of tensors to a single tensor
word_indices = torch.cat(input_ids, dim=0)

In [4]:
print(word_indices)

tensor([[ 1018],
        [ 1019],
        [ 1019],
        ...,
        [28672],
        [27838],
        [14101]])


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

# Create a PyTorch model
vocab_size = len(tokenizer.vocab)  # Assuming you've defined 'vocabulary'
embedding_dim = 100  # Adjust as needed
output_dim = 1  # Assuming binary classification

model = ToxicWordClassifier(vocab_size, embedding_dim, output_dim)

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Convert labels to tensors
labels = torch.tensor(labels, dtype=torch.float, requires_grad=True)

# Training loop
num_epochs = 8  # Specify the number of training epochs

for epoch in range(num_epochs):
    total_loss = 0.0
    total_correct = 0
    total_samples = len(labels)

    # Wrap your training data with tqdm for the progress bar
    for indices, label in tqdm(zip(word_indices, labels), total=len(labels), desc=f'Epoch {epoch + 1}'):
        optimizer.zero_grad()
        inputs = torch.tensor(indices, dtype=torch.long)

        # Forward pass
        outputs = model(inputs)[0][0]

        # Calculate the loss
        loss = criterion(outputs, label)

        # Backpropagation and optimization
        loss.backward()
        optimizer.step()

        predicted = (outputs > 0.5).float()

        # Compute accuracy
        correct = (predicted == label).float()
        total_correct += correct.sum().item()
        total_loss += loss.item()

    # Calculate average loss and accuracy for the epoch
    avg_loss = total_loss / total_samples
    accuracy = (total_correct / total_samples) * 100.0

    print(f"Epoch [{epoch + 1}/{num_epochs}] - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%")

print("Training complete")

  inputs = torch.tensor(indices, dtype=torch.long)
Epoch 1: 100%|██████████| 7240/7240 [01:41<00:00, 71.65it/s]


Epoch [1/8] - Loss: 0.6094, Accuracy: 72.57%


Epoch 2: 100%|██████████| 7240/7240 [01:41<00:00, 71.57it/s]


Epoch [2/8] - Loss: 0.5424, Accuracy: 76.71%


Epoch 3: 100%|██████████| 7240/7240 [01:41<00:00, 71.45it/s]


Epoch [3/8] - Loss: 0.4896, Accuracy: 78.91%


Epoch 4: 100%|██████████| 7240/7240 [01:40<00:00, 72.24it/s]


Epoch [4/8] - Loss: 0.4300, Accuracy: 81.51%


Epoch 5: 100%|██████████| 7240/7240 [01:41<00:00, 71.57it/s]


Epoch [5/8] - Loss: 0.3828, Accuracy: 83.96%


Epoch 6: 100%|██████████| 7240/7240 [01:39<00:00, 72.82it/s]


Epoch [6/8] - Loss: 0.3378, Accuracy: 86.33%


Epoch 7: 100%|██████████| 7240/7240 [01:39<00:00, 72.91it/s]


Epoch [7/8] - Loss: 0.2987, Accuracy: 88.08%


Epoch 8: 100%|██████████| 7240/7240 [01:40<00:00, 72.07it/s]

Epoch [8/8] - Loss: 0.2710, Accuracy: 89.49%
Training complete





In [6]:
torch.save(model.state_dict(), 'twc.pth')

In [7]:
import pandas as pd

dataset_path = '/Users/damirabdulaev/Downloads/filtered.tsv'
df = pd.read_csv(dataset_path, sep='\t')
df.head(5)

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [8]:
import nltk
from nltk.corpus import wordnet

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/damirabdulaev/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
import string
# Extract and process the first 5 sentences
sentences = df['reference'].tolist()

# Define a threshold for toxic word prediction (adjust as needed)
toxic_threshold = 0.7
temp = 0

# Define a set of punctuation marks
punctuation_set = set(string.punctuation)

model.eval()

# Process and replace toxic words
for sentence in tqdm(sentences):
    encoded_dict = tokenizer(sentence)
    tokens = encoded_dict['input_ids']
    masked_sentence = []

    # Initialize a flag to exclude special tokens
    exclude_special_tokens = True

    for token in tokens:
        # Check if the token is a special token (CLS or SEP)
        is_special_token = token in (tokenizer.cls_token_id, tokenizer.sep_token_id)

        if exclude_special_tokens and is_special_token:
            continue  # Skip special tokens
        else:
            # Convert the token to an index using your vocabulary mapping
            inputs = torch.tensor([token], dtype=torch.long)
            with torch.no_grad():
                outputs = model(inputs)
                predicted_prob = outputs.item()
                # Check if the token is a punctuation mark
                is_punctuation = tokenizer.convert_ids_to_tokens(token) in punctuation_set
                if predicted_prob > toxic_threshold and not is_punctuation:
                    masked_sentence.append(103) # mask token
                else:
                    masked_sentence.append(token)

    # Use tokenizer.decode to reconstruct the sentence
    reconstructed_sentence = tokenizer.decode(masked_sentence)

    # Print the original sentence and the reconstructed sentence
    print("Original Sentence:")
    print(sentence)
    print("Reconstructed Sentence:")
    print(reconstructed_sentence)

    temp += 1
    if temp > 10:
        break

  0%|          | 10/577777 [00:00<11:40, 824.22it/s]

Original Sentence:
If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.
Reconstructed Sentence:
if al [MASK] is flooding her with psychic [MASK], [MASK] [MASK] the high level of [MASK] [MASK]transmitters.
Original Sentence:
Now you're getting nasty.
Reconstructed Sentence:
now you're [MASK] [MASK].
Original Sentence:
Well, we could spare your life, for one.
Reconstructed Sentence:
well, [MASK] could spare your life, [MASK] [MASK].
Original Sentence:
Ah! Monkey, you've got to snap out of it.
Reconstructed Sentence:
ah! [MASK], you've got [MASK] snap out of [MASK].
Original Sentence:
I've got orders to put her down.
Reconstructed Sentence:
i've got orders [MASK] put her [MASK].
Original Sentence:
I'm not gonna have a child... ...with the same genetic disorder as me who's gonna die. L...
Reconstructed Sentence:
i'[MASK] [MASK] gonna have a child...... with the same genetic [MASK] as me who'[MASK] gonna die. [MASK]...
Original Sentence:
They're al




In [19]:
def get_non_toxic_synonym(word):
    synonyms = wordnet.synsets(word)
    if synonyms:
        valid_synonyms = [synonym.lemmas()[0].name() for synonym in synonyms if synonym.lemmas()[0].name() != word and synonym.lemmas()[0].name() != '[UNK]']
        if valid_synonyms:
            return valid_synonyms[0]
    return word  # If no valid synonyms found, return the original word

In [22]:
import string
import nltk
from nltk.corpus import wordnet

# Extract and process the first 5 sentences
sentences = df['reference'].tolist()
recon = []

# Define a threshold for toxic word prediction (adjust as needed)
toxic_threshold = 0.7
temp = 0

# Define a set of punctuation marks
punctuation_set = set(string.punctuation)

model.eval()

# Process and replace toxic words with synonyms
for sentence in tqdm(sentences):
    encoded_dict = tokenizer(sentence)
    tokens = encoded_dict['input_ids']
    replaced_sentence = []

    # Initialize a flag to exclude special tokens
    exclude_special_tokens = True

    for token in tokens:
        # Check if the token is a special token (CLS or SEP)
        is_special_token = token in (tokenizer.cls_token_id, tokenizer.sep_token_id)

        if exclude_special_tokens and is_special_token:
            continue  # Skip special tokens
        else:
            # Convert the token to an index using your vocabulary mapping
            inputs = torch.tensor([token], dtype=torch.long)
            with torch.no_grad():
                outputs = model(inputs)
                predicted_prob = outputs.item()
                # Check if the token is a punctuation mark
                is_punctuation = tokenizer.convert_ids_to_tokens(token) in punctuation_set
                if predicted_prob > toxic_threshold and not is_punctuation:
                    # Replace toxic word with a non-toxic synonym
                    word = tokenizer.convert_ids_to_tokens(token)
                    non_toxic_synonym = get_non_toxic_synonym(word)
                    if non_toxic_synonym in tokenizer.vocab:
                        replaced_sentence.append(tokenizer.convert_tokens_to_ids(non_toxic_synonym))
                    else:
                        replaced_sentence.append(token)
                else:
                    replaced_sentence.append(token)

    # Use tokenizer.decode to reconstruct the sentence
    reconstructed_sentence = tokenizer.decode(replaced_sentence)

    # Print the original sentence and the reconstructed sentenc
    recon.append((sentence, reconstructed_sentence))

100%|██████████| 577777/577777 [04:10<00:00, 2309.40it/s]


In [23]:
for i, rec in enumerate(recon):
    print("Original sentence:", rec[0])
    print("Non-toxic sentence:", rec[1])
    if i == 10:
        break

Original sentence: If Alkar is flooding her with psychic waste, that explains the high level of neurotransmitters.
Non-toxic sentence: if alkar is flooding her with psychic waste, that explain the high level of neonurotransmitters.
Original sentence: Now you're getting nasty.
Non-toxic sentence: now you're acquiring nasty.
Original sentence: Well, we could spare your life, for one.
Non-toxic sentence: well, we could spare your life, for one.
Original sentence: Ah! Monkey, you've got to snap out of it.
Non-toxic sentence: ah! imp, you've got to snap out of it.
Original sentence: I've got orders to put her down.
Non-toxic sentence: i've got orders to put her down.
Original sentence: I'm not gonna have a child... ...with the same genetic disorder as me who's gonna die. L...
Non-toxic sentence: i'meter not gonna have a child...... with the same genetic disorder as me who'second gonna die. liter...
Original sentence: They're all laughing at us, so we'll kick your ass.
Non-toxic sentence: th

In [None]:
nltk.download('stopwords')
nltk.download('punkt')

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Extract the pairs of sentences from the tuples
original = [sentence[0] for sentence in recon[:10000]]
detox = [sentence[1] for sentence in recon[:10000]]

# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Combine the sentences for each array
original_sentences = [' '.join(sentence.split()) for sentence in original]
detox_sentences = [' '.join(sentence.split()) for sentence in detox]

# Fit and transform the sentences to TF-IDF vectors
original_tfidf = tfidf_vectorizer.fit_transform(original_sentences)
detox_tfidf = tfidf_vectorizer.transform(detox_sentences)

# Calculate cosine similarity for the corresponding sentences
cosine_similarities = cosine_similarity(original_tfidf, detox_tfidf)

cosine_similarity_sum = 0
for i, sim in enumerate(cosine_similarities):
    cosine_similarity_sum += sim[i]

print("Mean cosine similarity of the original and detox sentences:", cosine_similarity_sum / len(cosine_similarities))

Mean cosine similarity of the original and detox sentences: 0.821828220463988
