<a href="https://colab.research.google.com/github/Viny2030/UNED/blob/main/Text_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# kagglehub dataset handles are written "<owner>/<dataset>"; this dataset is published by the "snap" owner.
# NOTE(review): the exported three-part handle ("organizations/snap/...") does not match that form - confirm
# against the kagglehub version in use.
organizations_snap_amazon_fine_food_reviews_path = kagglehub.dataset_download('snap/amazon-fine-food-reviews')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

import warnings
warnings.filterwarnings("ignore")

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Parse only the first 500 reviews rather than loading the whole file and discarding the rest
df1 = pd.read_csv("/kaggle/input/amazon-fine-food-reviews/Reviews.csv", nrows = 500)

In [None]:
#df1

# Quick EDA

In [None]:
df1.columns

In [None]:
# Number of reviews per star rating, ordered by rating
star_counts = df1["Score"].value_counts().sort_index()

fig, ax = plt.subplots(figsize = (10,5))
star_counts.plot(kind = "bar", title = "Countplot of Reviews by Stars", ax = ax)

ax.set_xlabel("Reviewed Stars")
plt.show()

# Basic NLTK

In [None]:
# Use the review text stored under index label 45 as the running example
entry = df1.loc[45, "Text"]
print(entry)

In [None]:
# Split the example review into word tokens
try:
    tokens = nltk.word_tokenize(entry)
except LookupError:
    # The tokenizer data is not bundled with nltk; fetch it once and retry.
    # Both names are requested because newer NLTK releases look for "punkt_tab".
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet = True)
    tokens = nltk.word_tokenize(entry)
#tokens

In [None]:
#part of speech tagging - noun singular etc
try:
    tags = nltk.pos_tag(tokens)
except LookupError:
    # The tagger model is not bundled with nltk; fetch it once and retry.
    # Both names are requested because newer NLTK releases use the "_eng" resource.
    for resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
        nltk.download(resource, quiet = True)
    tags = nltk.pos_tag(tokens)
#tags

In [None]:
# Group the tagged tokens into named-entity chunks
try:
    entities = nltk.chunk.ne_chunk(tags)
except LookupError:
    # The chunker model and its word list are not bundled with nltk; fetch them once and retry.
    # The "_tab" name is the one newer NLTK releases look for.
    for resource in ("maxent_ne_chunker", "maxent_ne_chunker_tab", "words"):
        nltk.download(resource, quiet = True)
    entities = nltk.chunk.ne_chunk(tags)
#entities.pprint()

# VADER Sentiment Scoring

#### Uses a "bag of words" approach.
#### Each word is scored and the scores are combined into a total score.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Build the VADER analyzer; its lexicon is a separate NLTK download, so fetch it on first use.
try:
    sia  = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download("vader_lexicon", quiet = True)
    sia  = SentimentIntensityAnalyzer()

In [None]:
sia.polarity_scores("I am not very sure about this")

In [None]:
#df1.iterrows(), tqdm

In [None]:
#polarity scores on the entire dataset, keyed by review Id
res = {
    row["Id"]: sia.polarity_scores(row["Text"])
    for _, row in tqdm(df1.iterrows(), total = len(df1))
}

In [None]:
#https://www.youtube.com/watch?v=QpzMWQvxXWk&ab_channel=RobMulla

In [None]:
# One row per review Id with its VADER scores, joined back onto the review data
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns = {"index":"Id"})
    .merge(df1, how = "left")
)

In [None]:
#sentiment score and meta data
#vaders.head()


# VADER results on plot

In [None]:
# VADER compound score for each star rating
fig, ax = plt.subplots(figsize = (10,5))

sns.barplot(data = vaders, x = "Score", y = "compound", ax = ax)
ax.set_title("Compound score by star reviews")
plt.show()

In [None]:
fig,axs = plt.subplots(1,3,figsize = (12,3))

# One panel per VADER component, left to right: positive, neutral, negative
panels = (
    ("pos", "Score vs positive"),
    ("neu", "Score vs neutral"),
    ("neg", "Score vs negative"),
)
for panel_ax, (column, title) in zip(axs, panels):
    sns.barplot(data = vaders, x = "Score", y = column, ax = panel_ax)
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()

# Roberta Pretrained Model

##### Uses a model trained on a large corpus of data.
##### A transformer model accounts for the words and also for the context around them.

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification  as AMSC
from scipy.special import softmax

AutoTokenizer is a class in the Hugging Face Transformers library that automatically selects the appropriate tokenizer for a given model. It preprocesses text by converting it into tokens, which are then mapped to numerical representations for model input.

In [None]:
#pre trained model based on a twitter dataset
RMODEL = "cardiffnlp/twitter-roberta-base-sentiment"
#GPT2MODEL = f"gpt2"

In [None]:
#https://github.com/almarengo/gpt2-text-classification/blob/main/GPT2_Transfer_Learning_final.ipynb

In [None]:
#import tensorflow as tf
#from transformers import GPT2Tokenizer, TFGPT2Model

In [None]:
# Load the tokenizer and the sequence-classification model from the same checkpoint (RMODEL)
tokenizer_roberta = AutoTokenizer.from_pretrained(RMODEL)
model_roberta = AMSC.from_pretrained(RMODEL)

#tokenizer_gpt2 = AutoTokenizer.from_pretrained(GPT2MODEL)
#model_gpt2 = AMSC.from_pretrained(GPT2MODEL)

In [None]:
# VADER results on entry

print(entry)
sia.polarity_scores(entry)

In [None]:
# Roberta results on entry

# Tokenize the example review and run it through the model
encoded_text = tokenizer_roberta(entry, return_tensors = "pt")
output = model_roberta(**encoded_text)

# Softmax the model scores for this single review
scores = softmax(output[0][0].detach().numpy())

score_labels = ("roberta_neg", "roberta_neu", "roberta_pos")
scores_dict = {label: scores[position] for position, label in enumerate(score_labels)}

print(scores_dict)

In [None]:
#return_tensors = "pt" asks the tokenizer for PyTorch tensors, the format passed on to the model
def polarity_score_roberta(ex, max_length = 512):
    """Score one text with the RoBERTa sentiment model.

    Parameters
    ----------
    ex : str
        Text to score.
    max_length : int, default 512
        Maximum number of tokens sent to the model. Longer texts are truncated
        to this length instead of making the model raise on over-long input.
        The limit is passed explicitly rather than relying on a tokenizer default.
        NOTE(review): 512 is assumed to be the longest input this checkpoint
        accepts - confirm against the model configuration.

    Returns
    -------
    dict
        Softmax scores under the keys "roberta_neg", "roberta_neu" and
        "roberta_pos".
    """
    encoded_text = tokenizer_roberta(
        ex,
        return_tensors = "pt",
        truncation = True,
        max_length = max_length,
    )
    output = model_roberta(**encoded_text)
    # First element of the model output, first (and only) sequence in the batch
    scores = softmax(output[0][0].detach().numpy())

    return {
        "roberta_neg": scores[0],
        "roberta_neu": scores[1],
        "roberta_pos": scores[2],
    }

In [None]:
def polarity_score_gpt2(ex):
    """Score one text with the GPT-2 tokenizer/model pair and return softmax scores.

    NOTE(review): this reads the globals `tokenizer_gpt2` and `model_gpt2`; the
    only lines that would create them in this notebook are commented out, so
    calling the function as-is raises NameError.
    NOTE(review): the result keys are labelled "roberta_*" even though the
    scores would come from the GPT-2 model - presumably a copy-paste leftover;
    confirm before merging these results with the RoBERTa ones.
    """
    model_inputs = tokenizer_gpt2(ex,return_tensors = "pt")
    first_sequence = model_gpt2(**model_inputs)[0][0]
    probabilities = softmax(first_sequence.detach().numpy())

    return {
        "roberta_neg": probabilities[0],
        "roberta_neu": probabilities[1],
        "roberta_pos": probabilities[2],
    }

In [None]:
# Score every review with both models; `res` maps review Id -> merged score dict
res = {}

for i, row in tqdm(df1.iterrows(),total = len(df1)):
    # Read the row fields before the try so `myid` is always bound when the handler runs
    text = row["Text"]
    myid = row["Id"]

    try:
        vader_result = sia.polarity_scores(text)
        roberta_result = polarity_score_roberta(text)
    except RuntimeError:
        # Skip reviews a model cannot process and report which one was dropped
        print(f"Broke for Id {myid}")
        continue

    # The VADER keys stay unprefixed ("neg", "neu", "pos", "compound"):
    # the comparison cells further down select those column names.
    both = {**vader_result,**roberta_result}

    res[myid] = both

In [None]:
both

In [None]:
# One row per review Id with both models' scores, joined back onto the review data
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index':'Id'})
    .merge(df1,how = "left")
)

### Compare Scores between Models

In [None]:
results_df.columns

In [None]:
results_df.columns

### Combine and Compare

In [None]:
# Columns to compare: the three VADER scores followed by the three RoBERTa scores
score_columns = ["neg","neu","pos","roberta_neg","roberta_neu","roberta_pos"]

sns.pairplot(data = results_df, vars = score_columns, hue = "Score", palette = "tab10")

plt.show()

## Reviewing Examples

### Positive 1 star reviews and Negative 5 star reviews

#### The examples where the model scoring and review score differ the most.

In [None]:
#vader: text of the 1-star review with the highest "pos" score
results_df.query("Score == 1").sort_values("pos",ascending = False)['Text'].values[0]

In [None]:
#roberta: text of the 1-star review with the highest "roberta_pos" score
results_df.query("Score == 1").sort_values("roberta_pos",ascending = False)['Text'].values[0]

In [None]:
#vader: text of the 5-star review with the highest "neg" score
# (sorting by "pos" here would show an agreement, not the disagreement this section is about)
results_df.query("Score == 5").sort_values("neg",ascending = False)['Text'].values[0]

In [None]:
#roberta: text of the 5-star review with the highest "roberta_neg" score
# (sorting by "roberta_pos" here would show an agreement, not the disagreement this section is about)
results_df.query("Score == 5").sort_values("roberta_neg",ascending = False)['Text'].values[0]

## The Transformers Pipeline

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly so the result does not depend on the library's default model.
# NOTE(review): this is believed to be the current default for "sentiment-analysis" - confirm for the installed version.
senti_pipeline = pipeline(
    "sentiment-analysis",
    model = "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

In [None]:
senti_pipeline("I had an apple")