Word2Vec and TextCNN

Word2Vec

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import numpy as np

# Sample data (replace with your dataset)
texts = ["This is a positive sentence.", "Negative sentiment here.", "Another positive example."]
labels = ["positive", "negative", "positive"]

# Tokenize (lower-case + whitespace split) and train Word2Vec on the corpus.
# min_count=1 keeps every token in the vocabulary, so the lookups below cannot miss.
tokenized_texts = [text.lower().split() for text in texts]
word2vec_model = Word2Vec(sentences=tokenized_texts, vector_size=50, window=5, min_count=1, workers=4)
embedding_dim = word2vec_model.vector_size

# Convert each text into a sequence of word vectors (embeddings, not indices).
# Sentences have different lengths, so a flat list of vectors cannot be reshaped
# into (num_texts, seq_len, embedding_dim). Zero-pad every sentence on the right
# up to the longest one instead.
max_len = max((len(sentence) for sentence in tokenized_texts), default=0)
X_padded = np.zeros((len(tokenized_texts), max_len, embedding_dim), dtype=np.float32)
for row, sentence in enumerate(tokenized_texts):
    if sentence:  # an empty text stays an all-zero row
        X_padded[row, :len(sentence)] = np.stack([word2vec_model.wv[word] for word in sentence])
X = torch.from_numpy(X_padded)  # shape: (num_texts, max_len, embedding_dim)

# Encode string labels as integer class ids (int64, as nn.CrossEntropyLoss expects).
label_encoder = LabelEncoder()
y = torch.tensor(label_encoder.fit_transform(labels), dtype=torch.long)

# Split the dataset (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

The TextCNN - Fully connected layer

In [None]:
# Define the TextCNN model
class TextCNN(nn.Module):
    """Text classifier: Conv1d -> ReLU -> global max pool over time -> Linear.

    Args:
        embedding_dim: Size of each word vector; used as the Conv1d input channels.
        num_classes: Number of output logits produced by the final linear layer.
        num_filters: Number of convolution filters (output channels). Default 128.
        kernel_size: Width of the convolution window, in tokens. Default 3.
    """

    def __init__(self, embedding_dim, num_classes, num_filters=128, kernel_size=3):
        super().__init__()
        # Feature extractor: slides a window of `kernel_size` tokens over the sequence.
        self.conv1 = nn.Conv1d(embedding_dim, num_filters, kernel_size=kernel_size)
        # Classifier head: maps the pooled filter responses to class logits.
        self.fc1 = nn.Linear(num_filters, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of embedded sequences.

        Args:
            x: Float tensor of shape (batch, seq_len, embedding_dim).

        Returns:
            Tensor of shape (batch, num_classes) with unnormalised logits.
        """
        # Conv1d expects (batch, channels, length), so move the embedding axis
        # into the channel position.
        x = x.permute(0, 2, 1)

        # A sequence shorter than the kernel would make Conv1d raise; right-pad
        # with zeros up to the kernel width. Longer inputs are left untouched.
        shortfall = self.conv1.kernel_size[0] - x.size(2)
        if shortfall > 0:
            x = nn.functional.pad(x, (0, shortfall))

        x = nn.functional.relu(self.conv1(x))
        # Global max pool over the time axis: keep each filter's strongest response.
        x = nn.functional.max_pool1d(x, x.size(2)).squeeze(2)
        return self.fc1(x)



Model Instantiation and Training

In [None]:
# Build the classifier together with its loss and optimizer.
# One output logit per distinct label known to the label encoder.
model = TextCNN(embedding_dim, num_classes=len(label_encoder.classes_))
criterion = nn.CrossEntropyLoss()  # compares raw logits against integer class ids
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch training: every epoch is a single gradient step on all of X_train.
epochs = 5
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()  # clear gradients left over from the previous step
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

# Held-out evaluation: no gradient tracking, model switched to eval mode.
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    predicted = torch.max(outputs, 1).indices  # index of the largest logit per sample
    n_correct = (predicted == y_test).sum().item()
    accuracy = n_correct / len(y_test)
    print(f"Test Accuracy: {accuracy:.4f}")

TextCNN with only the convolution layer

In [None]:
# Define the TextCNN model for only the convolution layer
class TextCNNOnly(nn.Module):
    """Convolutional feature extractor: Conv1d -> ReLU -> global max pool over time.

    There is no classification head; the output is one feature per filter.

    Args:
        embedding_dim: Size of each word vector; used as the Conv1d input channels.
        num_classes: Unused. Accepted only so the constructor can be called with
            the same arguments as a full classifier.
        num_filters: Number of convolution filters, i.e. output features. Default 128.
        kernel_size: Width of the convolution window, in tokens. Default 3.
    """

    def __init__(self, embedding_dim, num_classes, num_filters=128, kernel_size=3):
        super().__init__()
        self.conv1 = nn.Conv1d(embedding_dim, num_filters, kernel_size=kernel_size)

    def forward(self, x):
        """Extract pooled convolution features.

        Args:
            x: Float tensor of shape (batch, seq_len, embedding_dim).

        Returns:
            Tensor of shape (batch, num_filters); values are non-negative (post-ReLU).
        """
        # Conv1d expects (batch, channels, length), so move the embedding axis
        # into the channel position.
        x = x.permute(0, 2, 1)

        # A sequence shorter than the kernel would make Conv1d raise; right-pad
        # with zeros up to the kernel width. Longer inputs are left untouched.
        shortfall = self.conv1.kernel_size[0] - x.size(2)
        if shortfall > 0:
            x = nn.functional.pad(x, (0, shortfall))

        x = nn.functional.relu(self.conv1(x))
        # Global max pool over the time axis: keep each filter's strongest response.
        return nn.functional.max_pool1d(x, x.size(2)).squeeze(2)

Word2Vec: high-level features extracted from the TextCNN convolution layer, passed into an Extra Trees classifier

In [None]:
# Extract high-level features from TextCNN (with only convolution layer) for training and testing
text_cnn_model = TextCNNOnly(embedding_dim, num_classes=len(label_encoder.classes_))

# A freshly constructed extractor has random, untrained filters, so its outputs
# would be arbitrary projections that change on every run. Reuse the convolution
# weights learned by the trained TextCNN (`model`) so the features reflect training.
text_cnn_model.conv1.load_state_dict(model.conv1.state_dict())
text_cnn_model.eval()

with torch.no_grad():  # inference only; also required before calling .numpy()
    X_train_features = text_cnn_model(X_train).numpy()
    X_test_features = text_cnn_model(X_test).numpy()

# Use Extra Trees Classifier (you might want to fine-tune this based on your data)
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)

# Train Extra Trees Classifier using high-level features.
# Labels are handed to scikit-learn as NumPy arrays rather than torch tensors.
extra_trees_model.fit(X_train_features, y_train.numpy())

# Evaluate the Extra Trees model using high-level features
extra_trees_pred = extra_trees_model.predict(X_test_features)
accuracy = accuracy_score(y_test.numpy(), extra_trees_pred)
print(f"Extra Trees Test Accuracy: {accuracy:.4f}")


Word2Vec embeddings and Extra Trees classifier

In [None]:
# scikit-learn estimators need a 2-D (n_samples, n_features) matrix. squeeze()
# only drops size-1 axes: it leaves a (n, seq_len, dim) tensor 3-D and removes
# the batch axis of a one-sample set. Flatten each sample's sequence of word
# vectors into a single feature row instead.
X_train_flat = X_train.reshape(X_train.shape[0], -1).numpy()
X_test_flat = X_test.reshape(X_test.shape[0], -1).numpy()

# Use Extra Trees Classifier (you might want to fine-tune this based on your data)
extra_trees_model = ExtraTreesClassifier(n_estimators=100, random_state=42)
extra_trees_model.fit(X_train_flat, y_train.numpy())

# Evaluate the Extra Trees model
extra_trees_pred = extra_trees_model.predict(X_test_flat)
accuracy = accuracy_score(y_test.numpy(), extra_trees_pred)
print(f"Extra Trees Test Accuracy: {accuracy:.4f}")