# In [None]:
# The code below will:
# Preprocess the text data (tokenization, stemming, POS tagging, and extracting sentiment words using a sentiment lexicon).
# Use RoBERTa to obtain word embeddings.
# Use BiGRU with an attention mechanism to select important features.
# Apply PCA for dimensionality reduction.
# Use a sigmoid activation function to classify the sentiment as positive or negative.
# Train the model with the training dataset, then evaluate and test the model on the test dataset.
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import sentiwordnet as swn
from nltk.stem import PorterStemmer
from nltk import pos_tag
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaModel
import torch
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, GRU, Attention, Dense
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
# Download necessary NLTK resources.
# - 'wordnet' is needed because the SentiWordNet reader resolves synsets through WordNet.
# - 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource ids that recent
#   NLTK releases look up for word_tokenize / pos_tag; the legacy ids are kept so that
#   older releases keep working.
# NOTE(review): nltk.download is expected to report (not raise on) ids missing from the
# index of the installed NLTK version -- confirm against the pinned version.
for _resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'sentiwordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_resource)

# Initialize stemmer
stemmer = PorterStemmer()
# Sentiment score of a single word from the SentiWordNet lexicon
def get_sentiment_score(word, lexicon=None):
    """Return the summed polarity (positive minus negative) of ``word``.

    Every sense the lexicon lists for ``word`` contributes
    ``pos_score() - neg_score()`` to the total.

    Args:
        word: The token to look up.
        lexicon: Object exposing ``senti_synsets(word)``; defaults to the
            module-level SentiWordNet reader ``swn``.

    Returns:
        The summed polarity, or 0 when the word has no entry or the lexicon
        data cannot be located.
    """
    if lexicon is None:
        lexicon = swn
    try:
        senses = list(lexicon.senti_synsets(word))
    except LookupError:
        # Best effort: missing corpus data or an unknown key counts as neutral
        # instead of aborting the whole preprocessing run.
        return 0
    return sum(sense.pos_score() - sense.neg_score() for sense in senses)
# Text preprocessing function
def preprocess_text(text):
    """Tokenize, POS-filter and stem ``text`` and score it against the lexicon.

    Args:
        text: Raw review text.

    Returns:
        A tuple ``(stemmed_tokens, sentiment_score)`` where ``stemmed_tokens``
        are the Porter stems of the retained noun/verb/adjective/adverb tokens
        and ``sentiment_score`` is the sum of ``get_sentiment_score`` over
        those tokens.
    """
    # Prefix match so that inflected tags (NNS, VBD, JJR, RBS, ...) are kept too;
    # an exact match on the four base tags would drop words such as "loved" (VBD).
    content_tag_prefixes = ('NN', 'VB', 'JJ', 'RB')

    tagged = pos_tag(word_tokenize(text.lower()))  # Lowercase, tokenize, POS-tag
    kept = [token for token, tag in tagged if tag.startswith(content_tag_prefixes)]
    stemmed_tokens = [stemmer.stem(token) for token in kept]

    # Look up the surface forms, not the stems: stems such as "terribl" are not
    # dictionary words and would never be found in the lexicon.
    sentiment_score = sum(get_sentiment_score(token) for token in kept)
    return stemmed_tokens, sentiment_score
# Sample data: each review paired with its label (1: Positive, 0: Negative)
labelled_reviews = [
    ("I loved this movie! Great acting.", 1),
    ("Terrible movie, very boring.", 0),
    ("Amazing movie with stunning visuals!", 1),
    ("Not worth the watch, it was a waste of time.", 0),
    ("Fantastic movie, a true masterpiece!", 1),
    ("The plot was boring and predictable.", 0),
    ("Really good film, I would recommend it!", 1),
    ("Poorly made, no substance at all.", 0),
    ("Absolutely fantastic, will watch again!", 1),
    ("It was okay, nothing special though.", 1),
]
reviews = [review_text for review_text, _ in labelled_reviews]
labels = [sentiment for _, sentiment in labelled_reviews]  # Sentiment labels

# Run the preprocessing step on every review
processed_data = list(map(preprocess_text, reviews))

# Split the (stemmed tokens, sentiment score) pairs into two parallel lists
tokens = [stems for stems, _ in processed_data]
sentiment_scores = [score for _, score in processed_data]
# Apply RoBERTa for word embeddings
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')


def get_roberta_embeddings(text):
    """Return mean-pooled RoBERTa embeddings for ``text``.

    The token vectors of the last hidden layer are averaged over the sequence
    axis, weighted by the attention mask so that padding positions (present
    when a list of texts of different lengths is passed) do not dilute the
    average. For a single string the mask is all ones and this equals a plain
    mean.

    Args:
        text: A string, or a list of strings to embed as one batch.

    Returns:
        A NumPy array with one pooled embedding row per input text.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():  # Inference only, no gradients needed
        hidden = roberta_model(**encoded).last_hidden_state

    mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp guards against a division by zero for a fully masked row
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (summed / token_counts).numpy()
# Get embeddings for all reviews as one (n_reviews, n_features) matrix
embeddings = [get_roberta_embeddings(review) for review in reviews]
embeddings = np.array(embeddings).reshape(len(embeddings), -1)

# Train-test split, done on row indices first so that the scaler and PCA below
# can be fitted on the training rows only (fitting them on all rows would leak
# test-set statistics into the features the model is evaluated on).
y_all = np.asarray(labels)  # array targets can be indexed and fed to Keras directly
train_idx, test_idx = train_test_split(
    np.arange(len(y_all)), test_size=0.2, random_state=42
)

# Standardize the data before PCA (statistics from the training rows)
scaler = StandardScaler()
scaler.fit(embeddings[train_idx])
X_scaled = scaler.transform(embeddings)

# Apply PCA to reduce dimensionality to 2 components (axes from the training rows)
pca = PCA(n_components=2)
pca.fit(X_scaled[train_idx])
X_pca = pca.transform(X_scaled)

# Reshape PCA result for input into GRU model: (samples, components, 1)
X_pca_reshaped = X_pca[..., np.newaxis]

X_train, X_test = X_pca_reshaped[train_idx], X_pca_reshaped[test_idx]
y_train, y_test = y_all[train_idx], y_all[test_idx]
# Build the BiGRU model with attention mechanism
def build_bigru_attention_model(input_shape):
    """Build and compile a BiGRU + self-attention binary classifier.

    The functional API is used because the Keras ``Attention`` layer must be
    called on a list ``[query, value]`` and therefore cannot be stacked inside
    a ``Sequential`` model. The attended sequence is averaged over the time
    axis so that the network emits one probability per sample.

    Args:
        input_shape: Shape of one sample, ``(timesteps, features)``.

    Returns:
        A compiled Keras model with a single sigmoid output unit.
    """
    # Local imports: these names are not part of the file-level import block.
    from tensorflow.keras import Input, Model
    from tensorflow.keras.layers import GlobalAveragePooling1D

    inputs = Input(shape=input_shape)
    sequence = Bidirectional(GRU(64, return_sequences=True))(inputs)
    attended = Attention()([sequence, sequence])  # self-attention: query == value
    pooled = GlobalAveragePooling1D()(attended)  # collapse the time axis
    outputs = Dense(1, activation='sigmoid')(pooled)  # Binary output layer for sentiment classification

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Initialize the model; the per-sample shape is everything after the batch axis
model = build_bigru_attention_model(X_pca_reshaped.shape[1:])
model.summary()

# Keras does not accept a plain Python list of ints as targets next to an
# ndarray input, so make sure the targets are arrays.
y_train_arr = np.asarray(y_train)
y_test_arr = np.asarray(y_test)

# Train the model
history = model.fit(
    X_train,
    y_train_arr,
    epochs=5,
    batch_size=2,
    validation_data=(X_test, y_test_arr),
)

# Evaluate the model on the test set; ravel turns the (n, 1) probability
# column into the 1-D label vector accuracy_score expects.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
accuracy = accuracy_score(y_test_arr, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
# Scatter the two principal components, one point per review, colored by label
fig, ax = plt.subplots()
points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', marker='o')
ax.set_title('PCA of Movie Reviews')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(points, ax=ax, label='Review Sentiment (0: Negative, 1: Positive)')
plt.show()
# Explanation of Code:
# Text Preprocessing: The preprocess_text function handles tokenization, part-of-speech tagging, stemming, and sentiment word extraction using the SentiWordNet lexicon.
# RoBERTa Word Embedding: We use the RoBERTa model to transform the reviews into high-dimensional embeddings.
# BiGRU Model with Attention: A BiGRU layer with an attention mechanism is used to capture important sequential features. The output layer uses a sigmoid function for binary classification (positive or negative).
# PCA for Dimensionality Reduction: PCA reduces the standardized RoBERTa embeddings to 2 principal components.
# Follow-up request: could you please give me the output in plot or chart form for the above code?
# To generate the output as a plot or chart, we can use the PCA visualization to show the reduced-dimensionality results of the movie reviews. The plot will display the two principal components from the PCA, colored based on the sentiment labels (0 for Negative, 1 for Positive).
# Here’s how the code visualizes the PCA result, including a scatter plot with the sentiment labels:
# PCA Visualization: After applying PCA for dimensionality reduction to 2 components, we plot the results on a scatter plot. The points will be colored according to their sentiment (0: Negative, 1: Positive).
# Accuracy Plot: The model's training and validation accuracy can also be plotted to visualize how well the model performs over epochs.
# How to Generate the Plot
# The matplotlib library can be used for visualization. Below is the relevant part of the code that shows the PCA visualization. This code will plot the PCA results, and we will also plot training vs validation accuracy for the model.
import matplotlib.pyplot as plt
# PCA visualization on an 8x6 figure: first two principal components, colored by label
pca_fig, pca_ax = plt.subplots(figsize=(8, 6))
scatter = pca_ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', marker='o')
pca_ax.set_title('PCA of Movie Reviews')
pca_ax.set_xlabel('Principal Component 1')
pca_ax.set_ylabel('Principal Component 2')
pca_fig.colorbar(scatter, ax=pca_ax, label='Review Sentiment (0: Negative, 1: Positive)')
plt.show()
# Training vs. validation accuracy per epoch, read from the Keras History object
plt.figure(figsize=(8, 6))
for metric_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Model Accuracy during Training')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Build and Train the Model (model.fit())
# Sentiment labels (binary: 1 for positive, 0 for negative).
# There must be exactly one label per row of X_pca, otherwise train_test_split
# rejects the inputs; an array (not a list) is what Keras accepts as targets.
y = np.asarray(labels)
# Reshape PCA result for GRU model input
X_train_reshaped = X_pca[..., np.newaxis]
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_train_reshaped, y, test_size=0.2, random_state=42)
# Train the model (continues from the weights the model already has)
history = model.fit(X_train, y_train, epochs=5, batch_size=4, validation_data=(X_test, y_test))
# Evaluate the model; ravel turns the (n, 1) probability column into 1-D labels
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
# Evaluate the Model (model.predict())
# Run model.predict() on held-out data and threshold the sigmoid output at 0.5
test_data = X_test  # Your test data here
predicted_probabilities = model.predict(test_data)
predictions = (predicted_probabilities > 0.5).astype(int)  # Class labels (0 or 1)
# Show predictions next to the true labels (y_test) for a manual comparison
for line in (f'Predictions: {predictions}', f'True Labels: {y_test}'):
    print(line)

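# NOTE: the log below appears to be pasted output from a different script (run_ehpi.py),
# not from the code in this file; kept as a comment so the file stays valid Python.
# [<module>] => loading model from /content/drive/My Drive/ehpi_action_recognition/data/models/pose_resnet_50_256x192.pth
# Traceback (most recent call last):
#   File "run_ehpi.py", line 160, in <module>
#     "{}.avi".format(str(frame_nr).zfill(5))), img)
# TypeError: only size-1 arrays can be converted to Python scalars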
