# Fit 3 sentiment models:
- spacytextblob
- bert
- vader

### Setup

In [1]:
# !python -m spacy download en_core_web_sm
# !pip install spacytextblob
# !pip install vaderSentiment
# !pip install torch

In [2]:
import pandas as pd
import gcsfs
from textblob import TextBlob
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from google.cloud import storage
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from scipy.special import softmax

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
2024-11-15 15:32:14.798273: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731684734.825254  868684 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731684734.833548  868684 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-15 15:32:14.864603: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the ap

In [3]:
# GCS filesystem handle. It is not referenced again in this cell: the read
# below passes the gs:// path straight to pandas.
fs = gcsfs.GCSFileSystem()

path = 'gs://amazon-home-and-kitchen/full_train_data_txt_processed.csv'

# Development sample: ONLY THE FIRST 1000 ROWS ARE READ. Remove `nrows` for the
# data-processing step. Cells equal to '—' are parsed as missing values.
df = pd.read_csv(
    path,
    dtype={'price': float, 'subtitle': str},
    na_values=['—'],
    nrows=1000,
)

# Replace missing text with '' before casting: a bare astype(str) would turn
# NaN into the literal string 'nan', which would then be scored as if it were
# review text.
df['processed_text'] = df['processed_text'].fillna('').astype(str)

### Fit models

In [4]:


# Load the pre-trained BERT sentiment checkpoint and its matching tokenizer via
# from_pretrained, using the TensorFlow model class.
# NOTE(review): get_sentiment_score below reads output classes 0 and 4, so this
# checkpoint is presumably a 5-class (1-5 star) classifier - confirm against
# the model card.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

def get_sentiment_score(text):
    """Return a BERT-based sentiment score for `text`.

    The text is tokenized into TensorFlow tensors (truncated to at most 512
    tokens) and passed through the module-level `model`. The logits of the
    first sequence are converted to probabilities with softmax, and the score
    is the probability at class index 4 minus the probability at class index 0.
    Being a difference of two probabilities, the result is a continuous value
    between -1 and 1, not a discrete -1/0/1 label.

    NOTE(review): indices 0 and 4 are presumably the most negative and most
    positive classes of the checkpoint - confirm against the model card.
    """
    encoded = tokenizer(text, truncation=True, max_length=512, return_tensors="tf")
    logits = model(**encoded).logits.numpy()

    # Only the first (and, for a single input string, only) row is scored.
    class_probs = softmax(logits[0])
    return class_probs[4] - class_probs[0]

# Apply the function to the processed_text column
# df['sentiment_bert'] = df['processed_text'].apply(get_sentiment_score)


2024-11-15 15:32:19.510552: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at nlptown/bert-base-multilingual-uncased-sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


## BERT seems the most natural fit; now let's run the whole dataset through our model

In [5]:
# GCS filesystem handle. It is not referenced again in this cell: the read
# below passes the gs:// path straight to pandas.
fs = gcsfs.GCSFileSystem()

# Topic-annotated dataset. There is no `nrows` cap here, so every row is
# loaded. This rebinds `df`, replacing the 1000-row sample loaded earlier.
path = 'gs://amazon-home-and-kitchen/topic_2024-11-13.csv'
df = pd.read_csv(path, dtype={'price': float, 'subtitle': str}, na_values=['—'])

# Replace missing text with '' before casting: a bare astype(str) would turn
# NaN into the literal string 'nan', which would then be scored as if it were
# review text.
df['filtered_text'] = df['filtered_text'].fillna('').astype(str)
df.head()


Unnamed: 0,review_id,main_category,title_x,rating,filtered_text,topic_1,topic_2,topic_3
0,0,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,nice,nice,aa,aaa
1,1,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,chair beautiful bought black color stylish con...,contemporary,stylish,solid
2,2,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,1,horible build quality stool dont lock place si...,sit,full,compress
3,3,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,great quality especially pricebr br ive swivel...,swivel,stool,making
4,4,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,5,great price worked perfectly small space,worked,space,perfectly


#### Failing due to size

In [6]:
import numpy as np

num_batches = 100

# Split row *positions* rather than the DataFrame itself. This yields the same
# partition as np.array_split(df, num_batches) but avoids the deprecated
# DataFrame.swapaxes path that call goes through (FutureWarning in recent
# pandas). Each batch is an explicit copy, so adding a column below never
# writes into a slice of `df`.
df_batches = [
    df.iloc[rows].copy()
    for rows in np.array_split(np.arange(len(df)), num_batches)
]

# Initialize the VADER sentiment analyzer.
# NOTE(review): SentimentIntensityAnalyzer is imported from both nltk and
# vaderSentiment at the top of the notebook; the later import is the one bound
# here.
sia = SentimentIntensityAnalyzer()

df_lst = []
for i, batch in enumerate(df_batches):
    print(f"\nProcessing Batch {i + 1}...")

    # BERT scoring is left disabled for the full dataset:
    # batch['sentiment_bert'] = batch['filtered_text'].apply(get_sentiment_score)

    # VADER compound score for each review in the batch.
    batch['sentiment_vader'] = batch['filtered_text'].apply(
        lambda text: sia.polarity_scores(text)['compound']
    )

    df_lst.append(batch)

# Stitch the scored batches back together in their original order with a
# fresh 0..n-1 index.
concatenated_df = pd.concat(df_lst, ignore_index=True)


  return bound(*args, **kwds)



Processing Batch 1...

Processing Batch 2...

Processing Batch 3...

Processing Batch 4...

Processing Batch 5...

Processing Batch 6...

Processing Batch 7...

Processing Batch 8...

Processing Batch 9...

Processing Batch 10...

Processing Batch 11...

Processing Batch 12...

Processing Batch 13...

Processing Batch 14...

Processing Batch 15...

Processing Batch 16...

Processing Batch 17...

Processing Batch 18...

Processing Batch 19...

Processing Batch 20...

Processing Batch 21...

Processing Batch 22...

Processing Batch 23...

Processing Batch 24...

Processing Batch 25...

Processing Batch 26...

Processing Batch 27...

Processing Batch 28...

Processing Batch 29...

Processing Batch 30...

Processing Batch 31...

Processing Batch 32...

Processing Batch 33...

Processing Batch 34...

Processing Batch 35...

Processing Batch 36...

Processing Batch 37...

Processing Batch 38...

Processing Batch 39...

Processing Batch 40...

Processing Batch 41...

Processing Batch 42...



In [8]:
# Target location in Google Cloud Storage.
bucket_name = 'amazon-home-and-kitchen'
destination_blob_name = 'topic_2024-11-13_sentiment.csv'

# Stage the scored DataFrame on local disk first (index column omitted).
local_csv_path = '/tmp/topic_2024-11-13_sentiment.csv'
concatenated_df.to_csv(local_csv_path, index=False)

# Upload the staged file to the bucket under `destination_blob_name`.
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(local_csv_path)

In [18]:
# Show how often each value occurs in the `topic_1` column of the scored
# frame; dropna=False keeps missing values as their own entry in the counts.
concatenated_df['topic_1'].value_counts(dropna=False)

topic_1
sheet            2855
love             2096
mattress         1429
product          1405
great            1343
                 ... 
whereas             2
issuebr             2
laterbr             1
funcionalidad       1
vertically          1
Name: count, Length: 4999, dtype: int64