In [8]:
!pip install vaderSentiment




In [10]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
import nltk
from nltk.corpus import stopwords
import time


In [11]:
# Download necessary NLTK data
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Data Preprocessing for Word Embeddings
def preprocess_text_for_embedding(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]
    return filtered_tokens

In [14]:
# Assuming 'data.csv' is your dataset file with columns ['review', 'sentiment']
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

In [15]:
# Preprocess reviews for embeddings
train_df['tokens'] = train_df['review'].apply(preprocess_text_for_embedding)
test_df['tokens'] = test_df['review'].apply(preprocess_text_for_embedding)


In [17]:
# Initialize VADER
analyzer = SentimentIntensityAnalyzer()


In [18]:
# Apply VADER to get sentiment scores for both datasets
train_df['vader_score'] = train_df['tokens'].apply(lambda tokens: analyzer.polarity_scores(' '.join(tokens))['compound'])
test_df['vader_score'] = test_df['tokens'].apply(lambda tokens: analyzer.polarity_scores(' '.join(tokens))['compound'])


In [19]:
# Train Word2Vec model or load a pre-trained model
word2vec_model = Word2Vec(sentences=train_df['tokens'], vector_size=300, window=8, min_count=5, workers=4, sg=1, epochs=20)

# Function to vectorize a list of tokens using the Word2Vec model
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.wv.index_to_key]
    if not doc:
        return np.zeros(word2vec_model.vector_size)
    return np.mean(word2vec_model.wv[doc], axis=0)


# Vectorize the tokens for each review
X_train_embeddings = np.array([document_vector(word2vec_model, words) for words in train_df['tokens']])
X_test_embeddings = np.array([document_vector(word2vec_model, words) for words in test_df['tokens']])



In [20]:
# Add VADER scores as a feature
X_train_vader = train_df['vader_score'].values.reshape(-1, 1)
X_test_vader = test_df['vader_score'].values.reshape(-1, 1)

# Combine Word Embeddings with VADER scores
X_train_combined = np.hstack((X_train_embeddings, X_train_vader))
X_test_combined = np.hstack((X_test_embeddings, X_test_vader))


In [21]:
# Prepare labels
y_train = train_df['sentiment']
y_test = test_df['sentiment']


In [23]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit LabelEncoder on the target labels
label_encoder.fit(y_train)

# Transform the target labels to numerical values
y_train_encoded = label_encoder.transform(y_train)

In [25]:
# Transform the true labels to numerical values
y_test_encoded = label_encoder.transform(y_test)


In [26]:
# Train the model on the training set
from xgboost import XGBClassifier
model = XGBClassifier()
model.fit(X_train_combined, y_train_encoded)

# Start timing for prediction
start_predict_time = time.time()

# Predict on the test set
predictions = model.predict(X_test_combined)

# End timing for prediction
end_predict_time = time.time()

# Calculate the elapsed time for prediction
predict_elapsed_time = end_predict_time - start_predict_time
print(f"Time taken for prediction: {predict_elapsed_time:.4f} seconds")

# Evaluate and print the accuracy
accuracy = accuracy_score(y_test_encoded, predictions)
print("Accuracy on test set:", accuracy)

Time taken for prediction: 0.1408 seconds
Accuracy on test set: 0.87575
