In [8]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset

plt.style.use('ggplot')

import nltk
from nltk.tokenize import word_tokenize
import re

# preparing input to our model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm ## to add progress bars to loops and iterations

In [3]:
# download dataset from hungging face
dataset = load_dataset("tweet_eval", "sentiment")

dataset

Found cached dataset tweet_eval (C:/Users/Splute/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
data_test = pd.DataFrame(dataset['test'])

# Balance the amount of each class
SEED = 66
num_test_per_class = 2000
balanced_test_data = data_test.groupby('label', group_keys=False).apply(lambda x:\
        x.sample(min(len(x), num_test_per_class), random_state=SEED)).sample(frac=1, random_state=SEED)
# X_test = balanced_test_data.text.tolist()

balanced_test_data.head()
# X_test.label.value_counts()

Unnamed: 0,text,label
1077,Wait till this guy finds out Trump cut his Med...,0
9061,"@user Have a listen, these are the feelings of...",0
8577,The best #BlackFriday purchase! #TheWalkingDea...,2
11261,#Westworld is the greatest work of art I've ev...,2
8938,Barak Obama | highlighting the many broken pro...,0


In [16]:
df = balanced_test_data
# # 添加一个新的整数索引，并将其保存为'id'列
# df['id'] = df.reset_index(drop=True).index
# # 将'id'列移动到最左边
# id_column = df['id']  # 获取'id'列
# df.drop(columns=['id'], inplace=True)  # 删除'id'列
# df.insert(0, 'id', id_column)  # 将'id'列插入到第一列
# df

## The accuracy of VADER

In [17]:
sia = SentimentIntensityAnalyzer()

In [21]:
# run on the entire dataset
right_num = 0
total_num = 3*num_test_per_class
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['text']
    label = row['label']
    res = sia.polarity_scores(text)
    # 使用max函数结合字典的items()方法找到值最大的那组数据的索引和值
    max_index, max_value = max(enumerate(res.items()), key=lambda x: x[1][1])
    # print(max_index, max_value)
    # print(label)
    if max_index==label:
        right_num+=1
print(right_num/total_num)

# vaders = pd.DataFrame(res).T
# vaders = vaders.reset_index().rename(columns={'index':'id'})
# vaders = vaders.merge(df, how='left')
# vaders.head()

  0%|          | 0/6000 [00:00<?, ?it/s]

0.326


## Accuracy of roBerta

In [22]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [23]:
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [27]:
example = 'How would you feel, if I... if I gave you your copy in person?'
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores.argmax()

1

In [28]:
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    label = scores.argmax()

    return label

In [29]:
# run on the entire dataset
right_num = 0
total_num = 3*num_test_per_class
for i, row in tqdm(df.iterrows(), total=len(df)):
    text = row['text']
    label = row['label']
    res = polarity_scores_roberta(text)
    if res==label:
        right_num+=1
print(right_num/total_num)

  0%|          | 0/6000 [00:00<?, ?it/s]

0.7081666666666667
