In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk
# Apply the ggplot look to every figure drawn below.
plt.style.use("ggplot")
# Fetch the "punkt" resource (presumably the data nltk.word_tokenize relies on — confirm).
nltk.download("punkt")


# reading data

In [None]:
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this file exists at exactly this location.
reviews_csv = "C://Users/pushp/Downloads/amazon_review/Reviews.csv"
df = pd.read_csv(reviews_csv)
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Keep only the first 100,000 rows for the rest of the notebook.
df = df.head(100000)
# Count the reviews per Score value, order by Score, and draw the counts as bars.
score_counts = df["Score"].value_counts().sort_index()
score_counts.plot(kind="bar", title="Count of reviews", color="lightblue")

In [None]:
# Use the review text stored under index label 51 as the running example.
review_texts = df["Text"]
ex = review_texts[51]
print(ex)

# Tokenize the Word

In [None]:
# Split the example review into tokens and show the first ten.
tkns = nltk.word_tokenize(ex)
first_ten_tokens = tkns[:10]
print(first_ten_tokens)

# get tags

In [None]:
# Download the tagger resource, then tag the example's tokens.
nltk.download("averaged_perceptron_tagger")
pos_tag = nltk.pos_tag(tokens=tkns)

In [None]:
# Resources fetched before chunking: the named entity chunker model and "words".
for resource in ("maxent_ne_chunker", "words"):
    nltk.download(resource)
# Chunk the tagged tokens into named entities and pretty-print the result.
ent = nltk.chunk.ne_chunk(pos_tag)
ent.pprint()

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
# Download the 'vader_lexicon' resource (presumably the data that
# SentimentIntensityAnalyzer loads — confirm).
nltk.download('vader_lexicon')

In [None]:
# Build the VADER analyzer once; later cells reuse `sia`.
sia = SentimentIntensityAnalyzer()
# Score the example review and print the resulting dict.
pl_Score = sia.polarity_scores(text=ex)
print(pl_Score)

In [None]:
# Score every review in df with VADER, keyed by the review's Id.
# The Id and Text columns are iterated in lockstep instead of looking up
# df["Text"][i] / df["Id"][i]: those lookups treat the counter i as an index
# *label*, so they raise KeyError whenever df's index is not exactly 0..n-1,
# and they pay for a label lookup twice per row.
res = {}
for myId, text in zip(df["Id"], df["Text"]):
    res[myId] = sia.polarity_scores(text)

In [None]:
# Turn the {Id: scores} mapping into one row per Id, expose the Id as a regular
# column, and left-merge the original review columns from df onto it.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={"index": "Id"})
    .merge(df, how="left")
)

In [None]:
# Display the merged frame: VADER scores alongside the original review columns.
vaders

Score vs. VADER results

In [None]:
# One bar-plot panel per VADER component (neg / neu / pos), each plotted
# against the review Score.
fig, axs = plt.subplots(1, 3, figsize=(15, 3))
for ax, component in zip(axs, ("neg", "neu", "pos")):
    # `plot` ends up bound to the last panel's Axes, as before.
    plot = sns.barplot(data=vaders, x="Score", y=component, ax=ax)
    ax.set_title(f"{component} Score")
# tight_layout keeps neighbouring panels' labels from overlapping; plt.show()
# renders the figure without echoing the repr of the last set_title() call.
plt.tight_layout()
plt.show()

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [None]:
import torch

In [None]:
# Checkpoint name; the tokenizer and the sequence-classification model are
# both loaded from it.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


In [None]:
def polarity_score_roberta(example):
    """Score one text with the RoBERTa sentiment model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        example: The text to classify.

    Returns:
        dict with keys "roberta_neg", "roberta_neu" and "roberta_pos", holding
        the softmax of the model's first three output scores, in that order.

    Note:
        No truncation is requested from the tokenizer. NOTE(review): the batch
        loop in this notebook catches RuntimeError around this call, presumably
        for inputs longer than the model accepts — confirm.
    """
    enc_text = tokenizer(example, return_tensors="pt")
    # Inference only: disable autograd so no computation graph is built per call.
    with torch.no_grad():
        output = model(**enc_text)
    # output[0] holds the model's scores; the second [0] selects the single
    # example in the batch.
    scores = softmax(output[0][0].detach().numpy())
    return {
        "roberta_neg": scores[0],
        "roberta_neu": scores[1],
        "roberta_pos": scores[2],
    }

In [None]:
# Look at the keys stored for Id 1 and print the second one.
x = res[1].keys()
print([*x][1])

## Getting probability scores for the first 1000 texts

In [None]:
res = {}
# Score the first 1000 reviews with both VADER and RoBERTa, keyed by review Id.
# The rows are taken with a positional iloc slice rather than range(1000) plus
# df["Text"][i]: the label lookup raised KeyError when df had fewer than 1000
# rows or an index that is not 0..n-1.
res = {}
first_reviews = df.iloc[:1000]
for myId, text in zip(first_reviews["Id"], first_reviews["Text"]):
    try:
        v_res = sia.polarity_scores(text)
        # Prefix the VADER keys so both models' scores can share one dict.
        v_res_rename = {f"vader_{key}": value for key, value in v_res.items()}
        roberta_res = polarity_score_roberta(text)
        res[myId] = {**v_res_rename, **roberta_res}
    except RuntimeError:
        # Best effort: report the review that failed and continue with the rest.
        print(f"broke for id {myId}")


In [None]:
# Preview the first few entries; echoing the whole dict (one nested dict per
# scored review) floods the cell output.
dict(list(res.items())[:5])

In [None]:
# One row per scored Id: transpose the {Id: scores} mapping, expose the Id as a
# column, then left-merge the original review columns from df.
results_df = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={"index": "Id"})
    .merge(df, how="left")
)

In [None]:
# Show the column names of the merged results frame.
results_df.columns

# Visualizing results from both VADER and RoBERTa

In [None]:
# Pairwise plots of the three VADER and three RoBERTa scores, coloured by Score.
score_columns = [
    "vader_neg", "vader_neu", "vader_pos",
    "roberta_neg", "roberta_neu", "roberta_pos",
]
sns.pairplot(data=results_df, vars=score_columns, hue="Score", palette="tab10")
plt.show()

In [None]:
# Among the rows with Score 5, order by roberta_pos (highest first) and show
# the Text at position 500 of that ordering.
score_five = results_df[results_df["Score"] == 5]
score_five.sort_values("roberta_pos", ascending=False)["Text"].values[500]

# save

In [None]:
# Directory that receives the saved artifacts.
# NOTE(review): absolute, machine-specific path.
model_directory = "C://Users/pushp/roberta_model"

# Save the tokenizer, then the model, then the model configuration — the same
# three save_pretrained calls, in the same order.
for artifact in (tokenizer, model, model.config):
    artifact.save_pretrained(model_directory)