# Initial Steps
- Install Dependencies: 
- Make sure to install the necessary Python libraries:

In [1]:
pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install pandas scikit-learn nltk joblib matplotlib wordcloud numpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install tensorflow==2.18.0

Note: you may need to restart the kernel to use updated packages.


# Read CSV files

In [4]:
import pandas as pd

# Load the Jigsaw Toxic Comment Classification Dataset (Kaggle)
jigsaw_df = pd.read_csv('jigsaw_toxic_comment_dataset_train.csv')

# Load the Hate Speech and Offensive Language Dataset
hate_speech_df = pd.read_csv('hate_speech_and_offensive_language_labeled_data.csv')

print(jigsaw_df.head())
print(hate_speech_df.head())


                 id                                       comment_text  toxic  \
0  0000997932d777bf  Explanation\nWhy the edits made under my usern...      0   
1  000103f0d9cfb60f  D'aww! He matches this background colour I'm s...      0   
2  000113f07ec002fd  Hey man, I'm really not trying to edit war. It...      0   
3  0001b41b1c6bb37e  "\nMore\nI can't make any real suggestions on ...      0   
4  0001d958c54c6e35  You, sir, are my hero. Any chance you remember...      0   

   severe_toxic  obscene  threat  insult  identity_hate  
0             0        0       0       0              0  
1             0        0       0       0              0  
2             0        0       0       0              0  
3             0        0       0       0              0  
4             0        0       0       0              0  
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      

# Combine the datasets
- Hate Speech Dataset: Label "hate speech" and "offensive language" as inappropriate
- Jigsaw Dataset: Create "Inappropriate" label based on toxic, severe_toxic, obscene, threat, insult, identity_hate


In [5]:
# Combine the datasets
# Jigsaw Dataset: Create "Inappropriate" label based on toxic, severe_toxic, obscene, threat, insult, identity_hate
jigsaw_df['inappropriate'] = jigsaw_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].max(axis=1)
jigsaw_df['inappropriate'] = jigsaw_df['inappropriate'].apply(lambda x: 1 if x > 0 else 0)
jigsaw_df = jigsaw_df[['comment_text', 'inappropriate']]

# Hate Speech Dataset: Label "hate speech" and "offensive language" as inappropriate
hate_speech_df['inappropriate'] = hate_speech_df['class'].apply(lambda x: 1 if x == 0 or x == 1 else 0)
hate_speech_df = hate_speech_df[['tweet', 'inappropriate']].rename(columns={'tweet': 'comment_text'})

# Combine both datasets
combined_df = pd.concat([jigsaw_df, hate_speech_df], ignore_index=True)


In [6]:
print(combined_df.head())
# Assuming your DataFrame is called 'df' and the class label column is 'label'
inappropriate_rows = combined_df[combined_df['inappropriate'] == 1]

# Print the filtered rows
print(inappropriate_rows)


                                        comment_text  inappropriate
0  Explanation\nWhy the edits made under my usern...              0
1  D'aww! He matches this background colour I'm s...              0
2  Hey man, I'm really not trying to edit war. It...              0
3  "\nMore\nI can't make any real suggestions on ...              0
4  You, sir, are my hero. Any chance you remember...              0
                                             comment_text  inappropriate
6            COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK              1
12      Hey... what is it..\n@ | talk .\nWhat is it......              1
16      Bye! \n\nDon't look, come or think of comming ...              1
42      You are gay or antisemmitian? \n\nArchangel WH...              1
43               FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!              1
...                                                   ...            ...
184347                                 you're all niggers              1
184348  

# Text Cleaning

Text cleaning is crucial to remove unwanted characters, symbols, and patterns that may not contribute to the model's performance. This step also reduces noise in the data.

1. Lowercasing: Convert all text to lowercase to avoid treating the same words differently based on capitalization.
2. Remove Special Characters: Remove punctuation, special symbols, and numbers as they may not add value for the model.
3. Remove URLs and Emails: Links and emails are often not useful and should be removed.
4. Remove Stopwords: Stopwords (e.g., "the", "is", "in") are common words that can be removed as they don't carry significant meaning.
5. Expand Contractions: Convert contractions like "don't" to "do not" to standardize the language.

In [7]:
import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Dictionary of common contractions
contractions = {
    "aren't": "are not", "can't": "cannot", "couldn't": "could not", "didn't": "did not", 
    "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", 
    "haven't": "have not", "he'd": "he would", "he'll": "he will", "he's": "he is", 
    "i'd": "I would", "i'll": "I will", "i'm": "I am", "i've": "I have", "isn't": "is not", 
    "it's": "it is", "let's": "let us", "mightn't": "might not", "mustn't": "must not", 
    "shan't": "shall not", "she'd": "she would", "she'll": "she will", "she's": "she is", 
    "shouldn't": "should not", "that's": "that is", "there's": "there is", "they'd": "they would", 
    "they'll": "they will", "they're": "they are", "they've": "they have", "we'd": "we would", 
    "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", 
    "what're": "what are", "what's": "what is", "what've": "what have", "where's": "where is", 
    "who'd": "who would", "who'll": "who will", "who're": "who are", "who's": "who is", 
    "who've": "who have", "won't": "will not", "wouldn't": "would not", "you'd": "you would", 
    "you'll": "you will", "you're": "you are", "you've": "you have"
}

# Function to expand contractions
def expand_contractions(text, contractions_dict):
    pattern = re.compile(r'\b(' + '|'.join(contractions_dict.keys()) + r')\b')
    return pattern.sub(lambda x: contractions_dict[x.group()], text)

def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Expand contractions
    text = expand_contractions(text, contractions)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove email addresses
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', '', text)
    
    # Remove emojis
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"  # other symbols
                           u"\U000024C2-\U0001F251" 
                           "]+", flags=re.UNICODE)
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

# Apply the cleaning function to the 'comment_text' column
combined_df['cleaned_comment'] = combined_df['comment_text'].apply(clean_text)

# View the cleaned data
print(combined_df[['comment_text', 'cleaned_comment']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\srinivasu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                        comment_text  \
0  Explanation\nWhy the edits made under my usern...   
1  D'aww! He matches this background colour I'm s...   
2  Hey man, I'm really not trying to edit war. It...   
3  "\nMore\nI can't make any real suggestions on ...   
4  You, sir, are my hero. Any chance you remember...   

                                     cleaned_comment  
0  explanation edits made username hardcore metal...  
1  daww matches background colour I seemingly stu...  
2  hey man I really trying edit war guy constantl...  
3  cannot make real suggestions improvement wonde...  
4                      sir hero chance remember page  


# Text Normalization and Lemmatization
- Normalization standardizes the text data by reducing words to their base forms. 
- Lemmatization ensures that words are reduced to their dictionary form (e.g., "running" becomes "run").
- This helps improve the model’s generalization.

In [8]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# Apply lemmatization
combined_df['lemmatized_comment'] = combined_df['cleaned_comment'].apply(lemmatize_text)

# View the lemmatized data
print(combined_df[['cleaned_comment', 'lemmatized_comment', 'inappropriate']].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\srinivasu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                     cleaned_comment  \
0  explanation edits made username hardcore metal...   
1  daww matches background colour I seemingly stu...   
2  hey man I really trying edit war guy constantl...   
3  cannot make real suggestions improvement wonde...   
4                      sir hero chance remember page   

                                  lemmatized_comment  inappropriate  
0  explanation edits made username hardcore metal...              0  
1  daww match background colour I seemingly stuck...              0  
2  hey man I really trying edit war guy constantl...              0  
3  cannot make real suggestion improvement wonder...              0  
4                      sir hero chance remember page              0  


Using advanced embeddings and deep learning models are related but distinct approaches to improving text classification tasks. Here’s a breakdown of the differences and how they complement each other:

### Advanced Embeddings

**1. Word2Vec, GloVe, FastText:**
   - **Word2Vec**: Learns word representations by training on large text corpora to predict words in context. Provides dense, continuous vector representations of words.
   - **GloVe (Global Vectors for Word Representation)**: Learns word vectors by aggregating global word-word co-occurrence statistics from a corpus. Creates dense word vectors with semantic meaning.
   - **FastText**: An extension of Word2Vec that also considers sub-word information, making it more robust for morphologically rich languages.

**Purpose:**
- These embeddings transform words into dense vectors, capturing semantic relationships and context. They provide a richer representation of words compared to simple bag-of-words or TF-IDF models.

**Advantages:**
- **Contextual Similarity**: Words with similar meanings have similar vectors.
- **Dimensionality Reduction**: Dense embeddings are often lower-dimensional compared to TF-IDF matrices, which can help with model efficiency.

**How to Use:**
- You can replace TF-IDF features with pre-trained embeddings or train embeddings from scratch. For example, use pre-trained Word2Vec or GloVe embeddings to transform your text data into vectors.

### Deep Learning Models

**1. LSTM (Long Short-Term Memory):**
   - A type of Recurrent Neural Network (RNN) that can capture long-term dependencies and context in sequences, such as sentences.

**2. BiLSTM (Bidirectional LSTM):**
   - Extends LSTM by processing sequences in both forward and backward directions, capturing context from both past and future.

**3. Transformers (e.g., BERT, GPT):**
   - Transformers use self-attention mechanisms to capture dependencies between words in a sentence, regardless of their position.
   - **BERT (Bidirectional Encoder Representations from Transformers)**: Pre-trained model that understands context from both directions. Fine-tuned on your specific task for better performance.

**Purpose:**
- Deep learning models like LSTM and BiLSTM capture sequential dependencies and contextual information better than traditional methods.
- Transformers provide state-of-the-art performance by leveraging large pre-trained models that understand context deeply and effectively.

**Advantages:**
- **Contextual Understanding**: Models like BERT can grasp nuances in language, such as polysemy and context-dependent meanings.
- **Sequence Handling**: LSTM and BiLSTM handle sequential data and long-term dependencies better than traditional methods.

**How to Use:**
- **LSTM/BiLSTM**: Use them to model sequential relationships in text data. They can be combined with embeddings to improve performance.
- **Transformers**: Fine-tune pre-trained models like BERT on your specific dataset. They often yield superior results for various NLP tasks.

### Combining Both Approaches

- **Embeddings + Deep Learning Models**: You can use advanced embeddings as input features to deep learning models. For instance, you can use Word2Vec or GloVe embeddings as input to an LSTM or BiLSTM model.
- **Direct Use of Pre-trained Transformers**: Transformers like BERT often incorporate advanced embeddings and are fine-tuned for specific tasks, making them a powerful option for many NLP applications.

### Example Workflow

1. **Using Word Embeddings**:
   - Transform text data into embeddings (Word2Vec, GloVe).
   - Train a neural network (LSTM, BiLSTM) on these embeddings.

2. **Using Transformers**:
   - Fine-tune a pre-trained transformer model (BERT) on your dataset.

### Summary

- **Advanced Embeddings** provide a more sophisticated representation of text compared to TF-IDF.
- **Deep Learning Models** capture more complex patterns and dependencies in text.
- **Transformers** offer cutting-edge performance by leveraging both advanced embeddings and deep learning architectures.

Choosing between these approaches or combining them depends on your specific needs, data characteristics, and available resources.

## BERT (Bidirectional Encoder Representations from Transformers):
- To use Transformers like BERT for content moderation, we'll need to leverage libraries such as transformers from Hugging Face. 
- Here’s a step-by-step guide with code to use BERT for text classification:

### Install Required Libraries
- Make sure we have the required libraries installed. we can install them using pip:

In [16]:
pip install transformers torch datasets tf-keras




In [17]:
pip install accelerate -U




### Load and Preprocess Data
- We need to prepare the data for BERT.
- This involves tokenizing the text using the BERT tokenizer.

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Preprocess and split data
combined_df['lemmatized_comment'] = combined_df['lemmatized_comment'].astype(str)
X = combined_df['lemmatized_comment']
y = combined_df['inappropriate']  # Assuming 'label' column contains 'appropriate' or 'inappropriate'

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', clean_up_tokenization_spaces=True)

# Tokenize the data
def tokenize_data(texts, max_length=128):
    return tokenizer(texts.tolist(), padding='max_length', truncation=True, max_length=max_length, return_tensors='pt')

train_encodings = tokenize_data(X_train)
val_encodings = tokenize_data(X_val)


## Create Dataset Objects
- Transformers models require data to be in a specific format.
- Convert the tokenized data into a format suitable for PyTorch.

In [19]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.nn import CrossEntropyLoss
from torch import nn
from torch.utils.data import Dataset


# Detect the available device (MPS, CUDA, or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels.tolist())  # Convert labels to tensor

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Create the datasets
train_dataset = CustomDataset(train_encodings, y_train)
val_dataset = CustomDataset(val_encodings, y_val)

# Custom model class to handle weighted loss
class WeightedBertForSequenceClassification(BertForSequenceClassification):
    def __init__(self, config, class_weights=None):
        super().__init__(config)
        self.class_weights = class_weights

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, labels=None):
        # Get output logits from BERT
        outputs = super().forward(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = outputs.logits

        # If labels are provided, compute loss
        if labels is not None:
            # Apply class weights to the loss function
            loss_fct = CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits, labels)
            return (loss, logits)
        else:
            return logits

# Define class weights (adjust them based on your dataset's distribution)
class_weights = torch.tensor([0.7, 1.3]).to(device)  # If using GPU

# Initialize the custom model with weights
model = WeightedBertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    class_weights=class_weights
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Initialize Trainer
trainer = Trainer(
    model=model,                         # the instantiated model
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
)





Some weights of WeightedBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialize the Model
Load a pre-trained BERT model and prepare it for fine-tuning.

In [20]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available")

# For a model, you can check where it's placed
model = model.to('cuda')  # Move model to GPU
print(model.device)

CUDA is available. Using GPU: NVIDIA GeForce GTX 1650 Ti
cuda:0


## Train the Model
- Train the model using the Trainer API.
## Evaluate the Model
- Evaluate the model on the validation set.

In [21]:
# Train the model
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.7391
20,0.6927
30,0.6782
40,0.6496
50,0.5774
60,0.5208
70,0.5172
80,0.4478
90,0.3562
100,0.3882


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


KeyboardInterrupt: 

In [None]:
# Evaluate the model
results = trainer.evaluate()
print(results)

# Save the model and tokenizer
trainer.save_model('./saved_model')
tokenizer.save_pretrained('./saved_model')


# # Load the model and tokenizer later
# model = BertForSequenceClassification.from_pretrained('./saved_model')
# tokenizer = BertTokenizer.from_pretrained('./saved_model')


In [None]:
# # Save the model and tokenizer
# trainer.save_model('./saved_model')
# tokenizer.save_pretrained('./saved_model')

# Load the model and tokenizer later
# model = BertForSequenceClassification.from_pretrained('./saved_model')
# tokenizer = BertTokenizer.from_pretrained('./saved_model')

## Make Predictions
- Use the trained model to make predictions.

In [None]:
# Load the model and tokenizer later
# model = BertForSequenceClassification.from_pretrained('./saved_model')
# tokenizer = BertTokenizer.from_pretrained('./saved_model')

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# Load your trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')

# Check if MPS is available and move the model to the MPS device
device = torch.device("mps") if torch.has_mps else torch.device("cpu")
model.to(device)

test_texts = ["This is fine", "You are a f*cking idiot", "I love this!", "You b*tch", "You bitch", "you are awesome at screwing things up","Jim is g@y", "Jan is a le$bian"]

for text in test_texts:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)
    predictions_proba = torch.softmax(outputs.logits, dim=1)
    label = "Inappropriate" if predictions.item() == 1 else "Appropriate"
    print(f"Text: '{text}' -> Predicted: {label} predictions_proba: -> [{predictions_proba[0][0]}, {predictions_proba[0][1]}]")




In [None]:
pip install numpy

In [None]:
import torch
import gc

gc.collect()
torch.cuda.empty_cache()

In [None]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

# Shuffle X_val and y_val and take the first 1000 samples
X_val_sampled, y_val_sampled = shuffle(X_val, y_val, random_state=42)

# Select the first 1000 samples
X_val_sampled = X_val_sampled[:10000]
y_val_sampled = y_val_sampled[:10000]

# Load your trained model and tokenizer
model = BertForSequenceClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')

# Initialize lists to store predictions and probabilities
predictions_list = []
proba_list = []

# Check if MPS is available and move the model to the MPS device
device = torch.device("mps") if torch.has_mps else torch.device("cpu")
model.to(device)

if torch.cuda.is_available():
    print(f"CUDA is available. Using GPU: {torch.cuda.get_device_name(0)}")
    device = torch.device('cuda')  # Use GPU
else:
    device = torch.device('cpu')   # Fall back to CPU if GPU is not available

print(torch.cuda.current_device())  # Current GPU device ID
print(torch.cuda.get_device_name(torch.cuda.current_device()))  # GPU name

# Move model to the appropriate device
model = model.to(device)
print(model.device)

with torch.no_grad():  # Turn off gradients to save memory during inference
    for lemmatized_comment in X_val_sampled:
        inputs = tokenizer(lemmatized_comment, return_tensors="pt", truncation=True, padding=True)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=1)
        predictions_proba = torch.softmax(outputs.logits, dim=1)
        # Append to lists
        predictions_list.append(predictions.item())
        proba_list.append(predictions_proba[0][1])

# Convert predictions and probabilities to numpy arrays for metric calculation
predictions_array = torch.tensor(predictions_list)
proba_array = torch.tensor(proba_list)

print("calculating accuracy")
# Calculate accuracy, precision, and ROC AUC
accuracy = accuracy_score(y_val_sampled, predictions_array)
precision = precision_score(y_val_sampled, predictions_array)
roc_auc = roc_auc_score(y_val_sampled, proba_array)

# Print the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")


# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y_val_sampled, proba_array)
roc_auc_value = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc_value:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True)
plt.show()
