In [None]:
import pandas as pd

# Step 0: Data preparation

In [None]:
# Load the reviews from 'reviews.csv'; the path is relative, so the file is
# looked up in the notebook's working directory.
df = pd.read_csv('reviews.csv')
# Summary statistics as plain text, then a preview of the first rows as the
# cell's displayed value.
print(df.describe())
df.head()

In [None]:
# Narrow the frame to the three columns the rest of the notebook works with.
kept_columns = ['Id', 'Text', 'Score']
df = df[kept_columns]
df.head()

In [None]:
# Report missing values in the 'Score' and 'Text' columns.
# A single vectorised pass counts the nulls of both columns at once.
null_counts = df[['Score', 'Text']].isnull().sum()

# Kept under their original names in case other cells refer to them.
score_null_count = null_counts['Score']
text_null_count = null_counts['Text']

# Same messages as before, printed in the same order ('Score', then 'Text').
for column, null_count in null_counts.items():
    if null_count > 0:
        print(f"Number of null values in '{column}' column: {null_count}")
    else:
        print(f"No null values found in '{column}' column")

In [None]:
# Collect every row whose score falls outside the 1-5 range.
# Null scores compare False on both sides, so they are not reported here.
score_column = df['Score']
broken_data = df[score_column.lt(1) | score_column.gt(5)]
if broken_data.empty:
    print("No broken data found in 'Score' column")
else:
    print("Broken data found in 'Score' column:")
    print(broken_data)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Tally how many reviews carry each score.
score_counts = df['Score'].value_counts()

# Draw the tallies as a bar chart on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=score_counts.index, y=score_counts.values, ax=ax)
ax.set_xlabel('Score')
ax.set_ylabel('Count')
ax.set_title('Count of Scores')
plt.show()

In [None]:
def resample_df(df, min_count, random_state=None):
    """Return a class-balanced sample with ``min_count`` rows per score.

    The scores to balance are taken from ``df`` itself, so the function does
    not depend on any notebook-level variable.

    Args:
        df: DataFrame with a 'Score' column.
        min_count: Number of rows drawn, without replacement, for each
            distinct (non-null) score present in ``df``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the draw reproducible. ``None`` (the default) keeps it random.

    Returns:
        A new DataFrame with the sampled rows grouped by ascending score and a
        fresh 0..n-1 index. The previous index is kept as an 'index' column
        because ``reset_index`` is called without ``drop=True``.

    Raises:
        ValueError: If ``df`` has no scored rows, or if any score has fewer
            than ``min_count`` rows.
    """
    class_sizes = df['Score'].value_counts()
    if class_sizes.empty:
        raise ValueError("Cannot resample: no rows with a non-null 'Score'")

    # Fail early with a readable message instead of pandas' generic
    # "larger sample than population" error.
    too_small = class_sizes[class_sizes < min_count]
    if not too_small.empty:
        raise ValueError(
            f"Cannot draw {min_count} rows per score; scores with too few rows: "
            f"{too_small.sort_index().to_dict()}"
        )

    samples = [
        df[df['Score'] == score].sample(min_count, random_state=random_state)
        for score in sorted(class_sizes.index)
    ]
    return pd.concat(samples).reset_index()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing, then balance each split to a fixed
# number of reviews per score (200 for train, 50 for test).
df, test_df = train_test_split(df, test_size=0.2, random_state=42)
df = resample_df(df, 200)
test_df = resample_df(test_df, 50)

# Side-by-side bar charts of the score distribution in each balanced split.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
for axis, frame, title in zip(axs, (df, test_df), ('Train Data', 'Test Data')):
    counts = frame['Score'].value_counts()
    axis.bar(counts.index, counts.values)
    axis.set_xlabel('Score')
    axis.set_ylabel('Count')
    axis.set_title(title)

# Tighten the spacing between the two panels, then render.
plt.tight_layout()
plt.show()

# Step 1: Sentiment analysis

### VADER approach

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download NLTK's 'vader_lexicon' resource (presumably the lexicon the
# SentimentIntensityAnalyzer used below relies on — confirm).
nltk.download('vader_lexicon')

In [None]:
# One analyzer instance, reused for the single example and the full column below.
sid = SentimentIntensityAnalyzer()

In [None]:
# Pick the review text stored under index label 10 as a worked example.
example = df.loc[10, 'Text']
example

In [None]:
# VADER polarity scores for the example text, shown as the cell output.
sid.polarity_scores(example)

In [None]:
# Score of the same example review (index label 10), for comparison.
df.loc[10, 'Score']

In [None]:
# Score every review with VADER; each result is a dict of polarity values.
vader_res = df['Text'].apply(sid.polarity_scores)

# Unpack each polarity key into its own DataFrame column.
vader_columns = {
    'vader_pos': 'pos',
    'vader_neg': 'neg',
    'vader_neu': 'neu',
    'vader_compound': 'compound',
}
for column, key in vader_columns.items():
    # Bind `key` as a default so each lambda keeps its own key.
    df[column] = vader_res.apply(lambda scores, key=key: scores[key])

df.head()

In [None]:
# Bar plot of the VADER compound score for each review score.
fig, ax = plt.subplots()
sns.barplot(data=df, x='Score', y='vader_compound', ax=ax)
ax.set_title('Compound score by stars')
plt.show()

In [None]:
def plot_sentiment_results(model):
    """Show positive, neutral and negative sentiment per review score.

    Draws three side-by-side seaborn bar plots from the notebook-level ``df``,
    one for each of the ``{model}_pos``, ``{model}_neu`` and ``{model}_neg``
    columns, with 'Score' on the x-axis.

    Args:
        model: Column prefix of the sentiment model to plot, e.g. 'vader'.
    """
    panels = (
        (f'{model}_pos', 'Positive'),
        (f'{model}_neu', 'Neutral'),
        (f'{model}_neg', 'Negative'),
    )
    fig, axs = plt.subplots(1, 3, figsize=(15, 5))
    for axis, (column, title) in zip(axs, panels):
        sns.barplot(data=df, x='Score', y=column, ax=axis)
        axis.set_title(title)
    plt.show()

# Plot the vader_pos / vader_neu / vader_neg columns per review score.
plot_sentiment_results('vader')

### RoBERTa pretrained model

In [None]:
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Identifier of the pretrained sentiment checkpoint passed to from_pretrained.
MODEL = 'cardiffnlp/twitter-roberta-base-sentiment'

# Tokenizer and sequence-classification model loaded from the same checkpoint,
# so the token ids match what the model expects.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

def polarity_scores_roberta(text):
    """Score ``text`` with the pretrained RoBERTa sentiment model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: The text to classify.

    Returns:
        A dict with the keys 'roberta_neg', 'roberta_neu' and 'roberta_pos',
        holding the softmax of the model's output at positions 0, 1 and 2.

    Raises:
        RuntimeError: Propagated from the model's forward pass; the scoring
            loop in this notebook catches it (presumably raised for texts
            that are too long for the model — confirm).
    """
    # Local import: the notebook does not import torch at the top level.
    import torch

    encoded_inputs = tokenizer(text, return_tensors='pt')

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**encoded_inputs)

    # First element of the model output, then the only item in the batch.
    scores = softmax(outputs[0][0].detach().numpy())
    return {
        'roberta_neg': scores[0],
        'roberta_neu': scores[1],
        'roberta_pos': scores[2]
    }

In [None]:
from tqdm.notebook import tqdm

# Map each review id to its RoBERTa scores. Reviews on which the model raises
# RuntimeError are reported and left out of the mapping.
roberta_res = {}
review_pairs = zip(df['Id'], df['Text'])
for row_id, row_text in tqdm(review_pairs, total=len(df)):
    try:
        roberta_res[row_id] = polarity_scores_roberta(row_text)
    except RuntimeError:
        print(f'Error for id {row_id}')

In [None]:
# pd.DataFrame on the {id: scores} mapping puts the ids on the columns, so
# transpose to get one row per review, then turn the id index into an 'Id' column.
results_df = pd.DataFrame(roberta_res).T
results_df = results_df.reset_index().rename(columns={'index': 'Id'})

# Remove RoBERTa columns left over from an earlier run of this cell so that
# re-executing it does not produce suffixed duplicate columns.
stale_columns = [col for col in results_df.columns if col != 'Id' and col in df.columns]

# Join explicitly on 'Id' rather than on whatever columns happen to be shared.
# The left join keeps reviews whose scoring failed, with NaN scores.
df = df.drop(columns=stale_columns).merge(results_df, on='Id', how='left')
df

In [None]:
# Plot the roberta_pos / roberta_neu / roberta_neg columns per review score.
plot_sentiment_results('roberta')