# Advanced Text Vectorization for Toxic Comment Classification

This notebook compares traditional vectorization methods (TF-IDF, Count) with advanced embedding techniques (Word2Vec, GloVe, FastText) on the toxic comment classification task.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import LinearSVC
import nltk
import time
import gensim
from gensim.models import Word2Vec, FastText
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
import multiprocessing
# Logical CPU count of this machine; later cells pass it as the worker / job
# count when training embeddings and classifiers.
n_cores = multiprocessing.cpu_count()
print(f"Available CPU cores: {n_cores}")

Available CPU cores: 16


In [3]:
# Download necessary NLTK resources.
# nltk.word_tokenize depends on the Punkt tokenizer data: older NLTK releases
# load it from the 'punkt' package, newer releases look for 'punkt_tab'
# instead. Request both so tokenization works regardless of the installed
# version ('punkt' stays last so the cell output is its download status).
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)

True

In [4]:
# Load preprocessed data
# Later cells read the 'processed_text' column as model input and the six
# toxicity columns as targets, so the CSV must contain all of them.
train_data = pd.read_csv('../Dataset/train_preprocessed.csv')

# Check the data
print(f"Training data shape: {train_data.shape}")
# Last expression of the cell: rendered as the cell's output.
train_data.head()

Training data shape: (159571, 9)


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,processed_text
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,daww match background colour im seemingly stuc...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man im really not trying edit war guy cons...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,cant make real suggestion improvement wondered...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page thats


In [5]:
# Features: the preprocessed comment text, with NaN replaced by empty strings.
X = train_data['processed_text'].fillna("")

# Targets: the six toxicity label columns, with any missing value set to 0.
y = train_data[
    ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
].fillna(0)

# Hold out 20% for validation; the split is stratified on the 'toxic' column
# only and seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y['toxic'],
    random_state=42,
)

print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

Training set size: 127656
Validation set size: 31915


## 1. Define Advanced Vectorizers

Let's implement custom vectorizers for Word2Vec, GloVe, and FastText embeddings.

In [6]:
# Word2Vec Vectorizer
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """Represent each document as the mean of its Word2Vec word vectors.

    ``fit`` trains a gensim ``Word2Vec`` model on the given corpus;
    ``transform`` averages the vectors of the tokens the model knows.

    Parameters
    ----------
    vector_size : int
        Length of each word vector and of each returned document vector.
    window, min_count, workers : int
        Passed through unchanged to gensim's ``Word2Vec``.
    sg : int
        0 for CBOW, 1 for Skip-gram.
    """

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4, sg=1):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.sg = sg
        # Populated by fit(): the trained model and its keyed vectors.
        self.model = None
        self.word_vectors = None

    @staticmethod
    def _tokenize(X, desc):
        """Lower-case and word-tokenize every text in X, showing a progress bar."""
        return [nltk.word_tokenize(text.lower()) for text in tqdm(X, desc=desc)]

    def fit(self, X, y=None):
        """Train Word2Vec on the tokenized texts in X; ``y`` is ignored. Returns self."""
        tokenized_corpus = self._tokenize(X, "Tokenizing for Word2Vec")

        print("Training Word2Vec model...")
        self.model = Word2Vec(
            sentences=tokenized_corpus,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
            sg=self.sg,
        )
        self.word_vectors = self.model.wv

        print(f"Word2Vec model trained with {len(self.word_vectors.key_to_index)} words")
        return self

    def transform(self, X):
        """Return an array of shape (len(X), vector_size) of averaged word vectors.

        Tokens missing from the trained vocabulary are skipped; a document
        with no known token keeps an all-zero row.
        """
        tokenized_corpus = self._tokenize(X, "Vectorizing with Word2Vec")

        doc_vectors = np.zeros((len(tokenized_corpus), self.vector_size))
        # Each `row` is a view into doc_vectors, so in-place ops fill the result.
        for row, tokens in zip(doc_vectors, tokenized_corpus):
            known = [self.word_vectors[token] for token in tokens if token in self.word_vectors]
            for word_vector in known:
                row += word_vector
            if known:
                row /= len(known)

        return doc_vectors

In [7]:
# FastText Vectorizer
class FastTextVectorizer(BaseEstimator, TransformerMixin):
    """Represent each document as the mean of its FastText word vectors.

    ``fit`` trains a gensim ``FastText`` model on the given corpus;
    ``transform`` averages the vectors of every token in a document.

    Parameters
    ----------
    vector_size : int
        Length of each word vector and of each returned document vector.
    window, min_count, workers : int
        Passed through unchanged to gensim's ``FastText``.
    """

    def __init__(self, vector_size=100, window=5, min_count=1, workers=4):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        # Populated by fit() with the trained gensim model.
        self.model = None

    @staticmethod
    def _tokenize(X, desc):
        """Lower-case and word-tokenize every text in X, showing a progress bar."""
        return [nltk.word_tokenize(text.lower()) for text in tqdm(X, desc=desc)]

    def fit(self, X, y=None):
        """Train FastText on the tokenized texts in X; ``y`` is ignored. Returns self."""
        tokenized_corpus = self._tokenize(X, "Tokenizing for FastText")

        print("Training FastText model...")
        self.model = FastText(
            sentences=tokenized_corpus,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
        )

        print(f"FastText model trained with {len(self.model.wv.key_to_index)} words")
        return self

    def transform(self, X):
        """Return an array of shape (len(X), vector_size) of averaged word vectors.

        A document with no tokens keeps an all-zero row.
        """
        tokenized_corpus = self._tokenize(X, "Vectorizing with FastText")

        doc_vectors = np.zeros((len(tokenized_corpus), self.vector_size))
        # Each `row` is a view into doc_vectors, so in-place ops fill the result.
        for row, tokens in zip(doc_vectors, tokenized_corpus):
            # No vocabulary check: FastText can handle OOV words, so every
            # token contributes to the average.
            for token in tokens:
                row += self.model.wv[token]
            if tokens:
                row /= len(tokens)

        return doc_vectors

In [8]:
# GloVe - Using pre-trained embeddings
class GloveVectorizer(BaseEstimator, TransformerMixin):
    """Represent each document as the mean of pre-trained GloVe word vectors.

    ``fit`` does not learn from the data; it loads the
    ``glove-wiki-gigaword-<vector_size>`` vectors through gensim's downloader.
    ``transform`` averages the vectors of the tokens found in that vocabulary.

    Parameters
    ----------
    vector_size : int
        Dimensionality of the embeddings to load and of the returned document
        vectors. Must be one of ``SUPPORTED_SIZES``. The default of 100 keeps
        the download comparatively small.
    """

    # Dimensionalities for which a "glove-wiki-gigaword-<dim>" model is
    # published in gensim-data.
    SUPPORTED_SIZES = (50, 100, 200, 300)

    def __init__(self, vector_size=100):
        self.vector_size = vector_size
        # token -> vector mapping; stays empty until fit() loads the embeddings.
        self.word_vectors = {}

    def fit(self, X, y=None):
        """Load the pre-trained embeddings; ``X`` and ``y`` are ignored. Returns self.

        Raises
        ------
        ValueError
            If ``vector_size`` has no matching pre-trained model. Checked up
            front so a size mismatch does not surface later as a shape error
            inside ``transform``.

        Download or load failures are reported and leave the embedding empty
        (best-effort behaviour); ``transform`` then returns all-zero vectors.
        """
        if self.vector_size not in self.SUPPORTED_SIZES:
            raise ValueError(
                f"vector_size must be one of {self.SUPPORTED_SIZES} for the "
                f"glove-wiki-gigaword models, got {self.vector_size!r}"
            )
        # Pick the model whose dimensionality matches vector_size, so the
        # loaded vectors always fit the arrays built in transform().
        model_name = f"glove-wiki-gigaword-{self.vector_size}"

        # Attempt to download pre-trained GloVe using gensim downloader
        try:
            import gensim.downloader as api
            print("Downloading pre-trained GloVe embeddings...")
            glove_model = api.load(model_name)
            self.word_vectors = {word: glove_model[word] for word in glove_model.key_to_index}
            print(f"Loaded GloVe embeddings with {len(self.word_vectors)} words")
        except Exception as e:
            print(f"Error loading GloVe: {e}")
            print("Will use an empty embedding. Results will be poor.")

        return self

    def transform(self, X):
        """Return an array of shape (len(X), vector_size) of averaged word vectors.

        Tokens without a loaded vector are skipped; a document with no known
        token keeps an all-zero row.
        """
        tokenized_corpus = [nltk.word_tokenize(text.lower()) for text in tqdm(X, desc="Vectorizing with GloVe")]

        # Create document vectors by averaging word vectors
        doc_vectors = np.zeros((len(tokenized_corpus), self.vector_size))
        for i, tokens in enumerate(tokenized_corpus):
            vec = np.zeros(self.vector_size)
            count = 0
            for token in tokens:
                if token in self.word_vectors:
                    vec += self.word_vectors[token]
                    count += 1
            if count > 0:
                vec /= count
            doc_vectors[i] = vec

        return doc_vectors

## 2. Define Model Evaluation Function

In [9]:
def evaluate_model(model, X, y, model_name):
    """Predict on ``X``, print a metric report and return the metrics.

    Parameters
    ----------
    model : object
        Anything with a ``predict(X)`` method whose result can be indexed as
        ``y_pred[:, i]`` (one column per label).
    X : iterable
        Inputs forwarded unchanged to ``model.predict``.
    y : DataFrame-like
        True labels, one column per label; ``y.columns`` supplies the names.
    model_name : str
        Label used in the printed header and stored in the result dict.

    Returns
    -------
    (dict, array)
        ``results`` holds ``model_name``, ``accuracy``, ``macro_f1``,
        ``micro_f1``, ``inference_time`` and one ``f1_<label>`` entry per
        column of ``y``; the second element is the raw prediction array.

    Notes
    -----
    ``inference_time`` covers the whole ``model.predict`` call. For
    multi-label targets scikit-learn's ``accuracy_score`` is exact-match
    accuracy (every label of a sample must be right), so it is not directly
    comparable to the per-label F1 values.
    """
    start_time = time.time()
    y_pred = model.predict(X)
    inference_time = time.time() - start_time

    accuracy = accuracy_score(y, y_pred)

    # Per-label F1, computed once and reused for the macro average, the
    # printed report and the returned dict.
    f1_scores = [
        f1_score(y[column], y_pred[:, i])
        for i, column in enumerate(y.columns)
    ]

    macro_f1 = np.mean(f1_scores)
    micro_f1 = f1_score(y, y_pred, average='micro')

    print(f"\n============ {model_name} Results ============")
    print(f"Inference time: {inference_time:.2f} seconds")
    print(f"Validation accuracy: {accuracy:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"Micro F1: {micro_f1:.4f}")

    print("\nF1 scores by toxicity type:")
    for column, f1 in zip(y.columns, f1_scores):
        print(f"{column}: {f1:.4f}")

    results = {
        'model_name': model_name,
        'accuracy': accuracy,
        'macro_f1': macro_f1,
        'micro_f1': micro_f1,
        'inference_time': inference_time
    }

    for column, f1 in zip(y.columns, f1_scores):
        results[f'f1_{column}'] = f1

    return results, y_pred

## 3. Train and Evaluate Traditional Models

In [10]:
# TF-IDF + SVM
# Pipeline: TF-IDF features over unigrams and bigrams, followed by one
# LinearSVC per toxicity label (MultiOutputClassifier fits them independently).
tfidf_svm_model = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=20000,
        min_df=2,
        max_df=0.8,
        ngram_range=(1, 2)
    )),
    ('classifier', MultiOutputClassifier(LinearSVC(
        C=1.0,
        max_iter=10000,
        dual=False,
        class_weight='balanced',
        random_state=42
    )))
])

print("Training TF-IDF + SVM model...")
# Wall-clock time of the full pipeline fit (vectorizer + all classifiers).
start_time = time.time()
tfidf_svm_model.fit(X_train, y_train)
train_time = time.time() - start_time
print(f"Training completed in {train_time:.2f} seconds")

# Evaluate on the raw validation text; the pipeline vectorizes it internally.
tfidf_svm_results, tfidf_svm_preds = evaluate_model(tfidf_svm_model, X_val, y_val, "TF-IDF + SVM")

Training TF-IDF + SVM model...
Training completed in 30.53 seconds

Inference time: 1.11 seconds
Validation accuracy: 0.8693
Macro F1: 0.5317
Micro F1: 0.6569

F1 scores by toxicity type:
toxic: 0.7220
severe_toxic: 0.3953
obscene: 0.7348
threat: 0.3361
insult: 0.6319
identity_hate: 0.3702


In [11]:
# Count Vectorizer + Logistic Regression
# Pipeline: raw term counts over unigrams and bigrams, followed by one
# logistic regression per toxicity label.
count_lr_model = Pipeline([
    ('count', CountVectorizer(
        max_features=20000,
        min_df=2,
        max_df=0.8,
        ngram_range=(1, 2)
    )),
    # The liblinear solver ignores LogisticRegression's n_jobs, so the
    # parallelism is requested one level up: MultiOutputClassifier fits an
    # independent classifier per label and can run those fits concurrently.
    ('classifier', MultiOutputClassifier(
        LogisticRegression(
            C=5.0,
            solver='liblinear',
            max_iter=200,
            random_state=42
        ),
        n_jobs=n_cores
    ))
])

print("Training Count Vectorizer + Logistic Regression model...")
# Wall-clock time of the full pipeline fit (vectorizer + all classifiers).
start_time = time.time()
count_lr_model.fit(X_train, y_train)
train_time = time.time() - start_time
print(f"Training completed in {train_time:.2f} seconds")

# Evaluate on the raw validation text; the pipeline vectorizes it internally.
count_lr_results, count_lr_preds = evaluate_model(count_lr_model, X_val, y_val, "Count Vectorizer + Logistic Regression")

Training Count Vectorizer + Logistic Regression model...
Training completed in 82.05 seconds

Inference time: 1.10 seconds
Validation accuracy: 0.9120
Macro F1: 0.3940
Micro F1: 0.6100

F1 scores by toxicity type:
toxic: 0.7276
severe_toxic: 0.2396
obscene: 0.5845
threat: 0.1138
insult: 0.5394
identity_hate: 0.1589


## 4. Train and Evaluate Word2Vec Model

In [None]:
# Word2Vec + SVM
# Skip-gram (sg=1) embeddings trained on the training comments only, using
# every available core as a gensim worker.
w2v_vectorizer = Word2VecVectorizer(vector_size=100, window=5, min_count=1, workers=n_cores, sg=1)

# Fit the vectorizer to get word embeddings
print("Fitting Word2Vec vectorizer...")
start_time = time.time()
w2v_vectorizer.fit(X_train)
train_time = time.time() - start_time
print(f"Word2Vec training completed in {train_time:.2f} seconds")

# Transform training data
print("Transforming training data with Word2Vec...")
X_train_w2v = w2v_vectorizer.transform(X_train)

# Train classifier
# One LinearSVC per toxicity label, fitted on the averaged document vectors.
w2v_classifier = MultiOutputClassifier(LinearSVC(
    C=1.0,
    max_iter=10000,
    dual=False,
    class_weight='balanced',
    random_state=42
))

print("Training Word2Vec + SVM classifier...")
start_time = time.time()
w2v_classifier.fit(X_train_w2v, y_train)
train_time = time.time() - start_time
print(f"Classifier training completed in {train_time:.2f} seconds")

# Transform validation data and evaluate
# NOTE(review): X_val_w2v is not read again in this notebook; the evaluation
# below goes through ModelWrapper, which transforms X_val a second time.
print("Transforming validation data with Word2Vec...")
X_val_w2v = w2v_vectorizer.transform(X_val)

# Create a class for evaluation that behaves like a pipeline
class ModelWrapper:
    """Expose a vectorizer/classifier pair through a single ``predict`` method.

    Both objects are stored as given and are expected to be fitted already:
    the vectorizer must provide ``transform`` and the classifier ``predict``.
    """

    def __init__(self, vectorizer, classifier):
        self.vectorizer, self.classifier = vectorizer, classifier

    def predict(self, X):
        """Vectorize ``X`` and return the classifier's predictions for it."""
        return self.classifier.predict(self.vectorizer.transform(X))

# Evaluate on the raw validation text: ModelWrapper.predict runs the
# vectorizer's transform itself, so the reported inference time includes
# tokenization and embedding averaging as well as the SVM prediction.
w2v_model = ModelWrapper(w2v_vectorizer, w2v_classifier)
w2v_results, w2v_preds = evaluate_model(w2v_model, X_val, y_val, "Word2Vec + SVM")

Fitting Word2Vec vectorizer...


Tokenizing for Word2Vec:   0%|          | 0/127656 [00:00<?, ?it/s]

Training Word2Vec model...
Word2Vec model trained with 177677 words
Word2Vec training completed in 32.67 seconds
Transforming training data with Word2Vec...


Vectorizing with Word2Vec:   0%|          | 0/127656 [00:00<?, ?it/s]

TypeError: LinearSVC.__init__() got an unexpected keyword argument 'n_jobs'

## 5. Train and Evaluate FastText Model

In [None]:
# FastText + SVM
# Embeddings are trained on the training comments only, using every available
# core as a gensim worker.
fasttext_vectorizer = FastTextVectorizer(vector_size=100, window=5, min_count=1, workers=n_cores)

# Fit the vectorizer to get word embeddings
print("Fitting FastText vectorizer...")
start_time = time.time()
fasttext_vectorizer.fit(X_train)
train_time = time.time() - start_time
print(f"FastText training completed in {train_time:.2f} seconds")

# Transform training data
print("Transforming training data with FastText...")
X_train_fasttext = fasttext_vectorizer.transform(X_train)

# Train classifier
# LinearSVC has no n_jobs parameter (passing one raises TypeError), so the
# parallelism is requested on MultiOutputClassifier, which fits one
# independent SVM per toxicity label.
fasttext_classifier = MultiOutputClassifier(
    LinearSVC(
        C=1.0,
        max_iter=10000,
        dual=False,
        class_weight='balanced',
        random_state=42
    ),
    n_jobs=n_cores
)

print("Training FastText + SVM classifier...")
start_time = time.time()
fasttext_classifier.fit(X_train_fasttext, y_train)
train_time = time.time() - start_time
print(f"Classifier training completed in {train_time:.2f} seconds")

# Transform validation data and evaluate
print("Transforming validation data with FastText...")
X_val_fasttext = fasttext_vectorizer.transform(X_val)

# ModelWrapper.predict vectorizes X_val itself, so the reported inference time
# includes tokenization and embedding averaging.
fasttext_model = ModelWrapper(fasttext_vectorizer, fasttext_classifier)
fasttext_results, fasttext_preds = evaluate_model(fasttext_model, X_val, y_val, "FastText + SVM")

Fitting FastText vectorizer...


Tokenizing for FastText:   0%|          | 0/127656 [00:00<?, ?it/s]

Training FastText model...
FastText model trained with 177677 words
FastText training completed in 70.58 seconds
Transforming training data with FastText...


Vectorizing with FastText:   0%|          | 0/127656 [00:00<?, ?it/s]

Training FastText + SVM classifier...
Classifier training completed in 25.92 seconds
Transforming validation data with FastText...


Vectorizing with FastText:   0%|          | 0/31915 [00:00<?, ?it/s]

Vectorizing with FastText:   0%|          | 0/31915 [00:00<?, ?it/s]


Inference time: 9.50 seconds
Validation accuracy: 0.7869
Macro F1: 0.3479
Micro F1: 0.4254

F1 scores by toxicity type:
toxic: 0.5972
severe_toxic: 0.2493
obscene: 0.5490
threat: 0.0567
insult: 0.4870
identity_hate: 0.1483


## 6. Train and Evaluate GloVe Model

In [None]:
# GloVe + SVM
glove_vectorizer = GloveVectorizer(vector_size=100)

# Fit the vectorizer to get word embeddings
# (for GloVe, fit loads pre-trained vectors rather than training on X_train)
print("Fitting GloVe vectorizer...")
start_time = time.time()
glove_vectorizer.fit(X_train)
train_time = time.time() - start_time
print(f"GloVe preparation completed in {train_time:.2f} seconds")

# Transform training data
print("Transforming training data with GloVe...")
X_train_glove = glove_vectorizer.transform(X_train)

# Train classifier
# LinearSVC has no n_jobs parameter (passing one raises TypeError), so the
# parallelism is requested on MultiOutputClassifier, which fits one
# independent SVM per toxicity label.
glove_classifier = MultiOutputClassifier(
    LinearSVC(
        C=1.0,
        max_iter=10000,
        dual=False,
        class_weight='balanced',
        random_state=42
    ),
    n_jobs=n_cores
)

print("Training GloVe + SVM classifier...")
start_time = time.time()
glove_classifier.fit(X_train_glove, y_train)
train_time = time.time() - start_time
print(f"Classifier training completed in {train_time:.2f} seconds")

# Transform validation data and evaluate
print("Transforming validation data with GloVe...")
X_val_glove = glove_vectorizer.transform(X_val)

# ModelWrapper.predict vectorizes X_val itself, so the reported inference time
# includes tokenization and embedding averaging.
glove_model = ModelWrapper(glove_vectorizer, glove_classifier)
glove_results, glove_preds = evaluate_model(glove_model, X_val, y_val, "GloVe + SVM")

Fitting GloVe vectorizer...
Downloading pre-trained GloVe embeddings...


## 7. Compare All Models

In [None]:
# Collect all results (one metrics dict per model, as returned by evaluate_model)
all_results = [tfidf_svm_results, count_lr_results, w2v_results, fasttext_results, glove_results]
results_df = pd.DataFrame(all_results)

# Display comparison table, best macro F1 first.
print("\n=========== Model Comparison ===========\n")
comparison_cols = ['model_name', 'accuracy', 'macro_f1', 'micro_f1', 'inference_time']
# Left as the cell's last expression so the notebook renders it as a table
# instead of the plain-text dump that print() produces.
results_df[comparison_cols].sort_values('macro_f1', ascending=False)

In [None]:
# Bar chart of macro F1 per model, best model first.
ranked_results = results_df.sort_values('macro_f1', ascending=False)

plt.figure(figsize=(12, 6))
sns.barplot(data=ranked_results, x='model_name', y='macro_f1')
plt.title('Macro F1 Score by Model')
plt.xlabel('Model')
plt.ylabel('Macro F1 Score')
# Model names are long; tilt them so they do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Grouped bar chart: F1 per toxicity category, one bar per model.
# Reshape the per-model result dicts into long form: one record per
# (model, category) pair.
category_results = [
    {
        'model': result['model_name'],
        'category': col,
        'f1_score': result[f'f1_{col}'],
    }
    for result in all_results
    for col in y_val.columns
]
category_df = pd.DataFrame(category_results)

plt.figure(figsize=(14, 8))
sns.barplot(data=category_df, x='category', y='f1_score', hue='model')
plt.title('F1 Score by Toxicity Category and Model')
plt.xlabel('Toxicity Category')
plt.ylabel('F1 Score')
# Anchor the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## 8. Analysis and Findings

Let's analyze the performance of different vectorization methods for toxic comment classification:

1. **TF-IDF with SVM** generally performs well for text classification tasks, especially when the vocabulary covers the corpus well. In the run recorded above it reached the highest macro F1 (0.5317).

2. **Word2Vec** captures semantic relationships between words, which can help with understanding context beyond simple word presence.

3. **FastText** can handle out-of-vocabulary words through subword information, which is particularly useful for toxic comments that often contain misspellings and made-up words.

4. **GloVe** pre-trained embeddings capture global word co-occurrence statistics, which can provide good semantic representation.

5. **Performance by category**: Note how different models perform on various toxicity categories. Some models might be better at detecting certain types of toxicity than others.

## 9. Conclusion

Based on the results, we can make the following conclusions:

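1. For production use, choose the model with the highest macro F1 in the comparison table above. Among the runs whose results are recorded in this notebook, that is TF-IDF + SVM (macro F1 0.5317, versus 0.3940 for Count Vectorizer + Logistic Regression and 0.3479 for FastText + SVM).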

2. Word embeddings can capture semantic meaning that might be missed by traditional bag-of-words approaches, potentially improving detection of subtle toxic content.

3. FastText's ability to handle misspellings and rare words makes it particularly suitable for social media content and comments where users might intentionally obfuscate toxic language.

4. Model selection should be based not just on overall performance but also on specific requirements, like performance on certain toxicity categories or inference time constraints.

In [None]:
# Save the best model (highest macro F1 in the comparison table)
best_model_name = results_df.sort_values('macro_f1', ascending=False).iloc[0]['model_name']
print(f"The best performing model is: {best_model_name}")

# Based on the model name, save the corresponding model
import pickle
import os

os.makedirs('../models', exist_ok=True)

# Files to write for each model name: {file name: object to pickle}.
# The scikit-learn pipelines are saved whole; embedding-based models keep the
# vectorizer and the classifier in separate files.
saveable_models = {
    "TF-IDF + SVM": {
        'tfidf_svm_model.pkl': tfidf_svm_model,
    },
    "Count Vectorizer + Logistic Regression": {
        'count_lr_model.pkl': count_lr_model,
    },
    "Word2Vec + SVM": {
        'w2v_vectorizer.pkl': w2v_vectorizer,
        'w2v_classifier.pkl': w2v_classifier,
    },
    "FastText + SVM": {
        'fasttext_vectorizer.pkl': fasttext_vectorizer,
        'fasttext_classifier.pkl': fasttext_classifier,
    },
    "GloVe + SVM": {
        'glove_vectorizer.pkl': glove_vectorizer,
        'glove_classifier.pkl': glove_classifier,
    },
}

artifacts = saveable_models.get(best_model_name)
if artifacts is None:
    # Say so explicitly rather than finishing silently without a saved model.
    print(f"No save rule defined for '{best_model_name}'; nothing was written.")
else:
    for file_name, obj in artifacts.items():
        with open(f'../models/{file_name}', 'wb') as f:
            pickle.dump(obj, f)
    print("Model saved as " + " and ".join(artifacts))

print("\nDone!")