# Test three sentiment models: TextBlob, VADER, and DistilBERT

In [1]:
# !python -m spacy download en_core_web_sm
# !pip install spacytextblob


In [2]:
import pandas as pd
import gcsfs
from textblob import TextBlob
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import pipeline
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from google.cloud import storage

# GCS filesystem handle. The visible cells never reference `fs` directly;
# pandas opens the gs:// path below on its own.
fs = gcsfs.GCSFileSystem()

# Number of rows to load while experimenting. Set to None to read the whole
# file for the real data-processing step.
N_ROWS = 1000

path = 'gs://amazon-home-and-kitchen/full_train_data_txt_processed.csv'

# '—' is treated as a missing value; `price` and `subtitle` get explicit dtypes.
df = pd.read_csv(
    path,
    dtype={'price': float, 'subtitle': str},
    na_values=['—'],
    nrows=N_ROWS,
)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
2024-11-10 20:23:02.355498: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-10 20:23:02.360099: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-10 20:23:02.374239: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731270182.398422  168962 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731270182.405505  168962 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been r

In [3]:
# Make sure every entry is a string before it is passed to the sentiment models.
# Missing values are replaced with '' first: a bare astype(str) would turn NaN
# into the literal text 'nan', which the models would then score as a word.
df['processed_text'] = df['processed_text'].fillna('').astype(str)

In [4]:
def _textblob_polarity(text):
    """Return the TextBlob polarity score for one piece of text."""
    return TextBlob(text).sentiment.polarity


# Score every processed review with TextBlob and keep the polarity.
df['sentiment_text_blob'] = df['processed_text'].map(_textblob_polarity)

In [5]:
# One shared VADER analyzer instance, reused for every row.
sia = SentimentIntensityAnalyzer()


def _vader_compound(text):
    """Return the 'compound' entry of VADER's polarity scores for one text."""
    return sia.polarity_scores(text)['compound']


# Score every processed review with VADER and keep the compound score.
df['sentiment_vader'] = df['processed_text'].map(_vader_compound)

In [6]:
# Load the sentiment-analysis pipeline.
# The model and revision are pinned to the ones the unpinned call resolved to
# (see the warning this cell used to print), so results stay reproducible if
# the library's default model ever changes.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)

# Score the whole column in one batched call instead of one call per row.
# truncation=True cuts reviews that exceed the model's maximum input length
# rather than raising an error on them.
bert_results = sentiment_pipeline(
    df['processed_text'].tolist(),
    truncation=True,
    batch_size=32,
)

# Keep only the predicted label (e.g. POSITIVE / NEGATIVE) for each review;
# results come back in input order, so positional assignment lines up.
df['sentiment_bert'] = [result['label'] for result in bert_results]

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
2024-11-10 20:23:06.960967: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [8]:
# Preview the first five rows, including the new sentiment columns.
# NOTE(review): the saved output of this cell also shows a `sentiment_spacy`
# column, but no cell visible in this notebook creates it — confirm where it
# comes from before relying on a fresh Restart & Run All.
df.head(n=5)

Unnamed: 0,main_category,title_x,average_rating,rating_number,features,description,price,images_x,videos,store,...,asin,user_id,timestamp,helpful_vote,verified_purchase,processed_text,sentiment_text_blob,sentiment_vader,sentiment_bert,sentiment_spacy
0,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,4.3,167,"['Sleek chrome metal base, seat covered in Red...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small and Stylish Barstools ', 'ur...",jersey seating®,...,B00KKU8HVE,AHED326L5JZMCI2GAXFXNKZGQKNA,2015-07-24 14:37:07.000,0,True,nice,0.6,0.4215,POSITIVE,0.6
1,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,4.3,167,"['Sleek chrome metal base, seat covered in Red...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small and Stylish Barstools ', 'ur...",jersey seating®,...,B00KKU8HVE,AHROHCEEEGFMNYMMWXQCHLPBVVLQ,2016-11-24 17:43:29.000,0,True,chair beautiful bought 4 black color stylish c...,0.354762,0.9153,POSITIVE,0.354762
2,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,4.3,167,"['Sleek chrome metal base, seat covered in Red...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small and Stylish Barstools ', 'ur...",jersey seating®,...,B00KKU8HUA,AHH4QDZVVTYKMCM7UYL7KD24GIPA,2017-02-11 08:46:16.000,0,True,horible build quality stool dont lock place si...,0.275,-0.6369,NEGATIVE,0.275
3,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,4.3,167,"['Sleek chrome metal base, seat covered in Red...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small and Stylish Barstools ', 'ur...",jersey seating®,...,B00KKU8HVE,AHBDGI6TOGE33U5W6VEP55NEKC6Q,2016-12-14 02:22:24.000,0,True,great quality especially pricebr br ive swivel...,0.447917,0.9538,POSITIVE,0.447917
4,Amazon Home,jersey seating 2 x Vinyl Air Lift Adjustable S...,4.3,167,"['Sleek chrome metal base, seat covered in Red...",[],,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'Small and Stylish Barstools ', 'ur...",jersey seating®,...,B00KKU8HVE,AHTYSHSOLY4YTIPY53BRRJ2T325Q,2018-07-02 01:49:55.194,0,True,great price worked perfectly small space,0.275,0.8519,POSITIVE,0.275


In [9]:
# Target bucket and object name in GCS
bucket_name = 'amazon-home-and-kitchen'
destination_blob_name = 'sentiment_test_2024_11_09.csv'

# Write the DataFrame to a local CSV first (row index is not written)
local_csv_path = '/tmp/sentiment_test_2024_11_09.csv'
df.to_csv(local_csv_path, index=False)

# Create a GCS client and upload the local file to the bucket
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename(local_csv_path)

# Compare models

In [10]:
# This cell relies on the imports from the first code cell (pandas, gcsfs);
# run that cell first.

fs = gcsfs.GCSFileSystem()

# Reload the scored file that was uploaded to the bucket. No row limit is
# applied here: the whole file is read.
path = 'gs://amazon-home-and-kitchen/sentiment_test_2024_11_09.csv'

# Keep only the review fields and the sentiment scores needed for the comparison.
comparison_columns = [
    'rating', 'title_y', 'text', 'timestamp', 'helpful_vote',
    'verified_purchase', 'processed_text',
    'sentiment_text_blob', 'sentiment_vader', 'sentiment_bert',
]
df = pd.read_csv(path)[comparison_columns]

In [20]:
# Numeric sentiment scores to average. 'sentiment_bert' is left out: it holds
# string labels rather than numbers, so it cannot be averaged directly.
sentiment_columns = ['sentiment_text_blob', 'sentiment_vader']

# Mean of each score per star rating, with 'rating' kept as a regular column
avg_sentiments = df.groupby('rating', as_index=False)[sentiment_columns].mean()

# Show the per-rating averages
print(avg_sentiments)


   rating  sentiment_text_blob  sentiment_vader
0       1             0.003601        -0.022158
1       2             0.029882         0.167878
2       3             0.102977         0.376950
3       4             0.219193         0.521452
4       5             0.332610         0.648425


In [17]:
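# I think the differences between the models' scores are a matter of scaling.
# Open question: why do the spaCy and TextBlob scores match exactly?
# (Likely because spacytextblob computes its polarity with TextBlob, so the two
# columns would be identical by construction — TODO confirm.)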