In [69]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset

plt.style.use('ggplot')

import nltk
from nltk.stem import WordNetLemmatizer
from copy import deepcopy

from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm ## to add progress bars to loops and iterations

import time

In [2]:
# Download the TweetEval sentiment configuration from Hugging Face
dataset = load_dataset(path="tweet_eval", name="sentiment")

dataset

Found cached dataset tweet_eval (C:/Users/Splute/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [15]:
# Materialise the test split as a DataFrame
data_test = pd.DataFrame(dataset['test'])

# Balance the classes: draw at most `num_test_per_class` rows per label
# (seeded), then shuffle the combined rows with the same seed.
SEED = 66
num_test_per_class = 2000
balanced_test_data = (
    data_test
    .groupby('label', group_keys=False)
    .apply(lambda grp: grp.sample(min(len(grp), num_test_per_class), random_state=SEED))
    .sample(frac=1, random_state=SEED)
)

balanced_test_data.head()

Unnamed: 0,text,label
1077,Wait till this guy finds out Trump cut his Med...,0
9061,"@user Have a listen, these are the feelings of...",0
8577,The best #BlackFriday purchase! #TheWalkingDea...,2
11261,#Westworld is the greatest work of art I've ev...,2
8938,Barak Obama | highlighting the many broken pro...,0


In [44]:
# `df` is another name for the balanced sample (same object, not a copy)
df = balanced_test_data
# Ground-truth labels as an integer NumPy array
y_test = np.array(df['label'], dtype=int)
y_test[:5]
# # Add a new integer index and store it as the 'id' column
# df['id'] = df.reset_index(drop=True).index
# # Move the 'id' column to the far left
# id_column = df['id']  # grab the 'id' column
# df.drop(columns=['id'], inplace=True)  # remove the 'id' column
# df.insert(0, 'id', id_column)  # re-insert 'id' as the first column
# df

array([0, 0, 2, 2, 0])

## Test of VADER

In [19]:
# Create the VADER analyzer once; the cells below reuse it for every text
sia = SentimentIntensityAnalyzer()

In [57]:
# Sanity check: score a single sentence and print the returned score dict
test = sia.polarity_scores('I am very happy')
print(test)

{'neg': 0.0, 'neu': 0.429, 'pos': 0.571, 'compound': 0.6115}


In [71]:
# Score every text with VADER, bucket the compound score into a class label,
# and time the whole pass.
vader_res = []
start_time = time.time()
for text in tqdm(df['text'], total=len(df)):
    res = sia.polarity_scores(text)
    compound_score = res['compound']
    # Map the 'compound' score to an integer label
    if compound_score <= -0.35:
        label = 0  # negative sentiment
    elif compound_score >= 0.35:
        label = 2  # positive sentiment
    else:
        label = 1  # neutral sentiment
    vader_res.append(label)
elapsed_time = time.time() - start_time

print(classification_report(y_test, vader_res))
print("Time: {:.3f}s".format(elapsed_time))

    

  0%|          | 0/6000 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.71      0.43      0.53      2000
           1       0.46      0.63      0.53      2000
           2       0.62      0.64      0.63      2000

    accuracy                           0.57      6000
   macro avg       0.60      0.57      0.57      6000
weighted avg       0.60      0.57      0.57      6000

Time: 0.949s


## Test of RoBERTa

In [72]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [73]:
# Identifier of the pretrained sentiment checkpoint to load.
# Plain string literal: there is nothing to interpolate, so no f-prefix.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# Tokenizer and sequence-classification model for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [74]:
# Try the model on one sentence before running the full evaluation
example = 'How would you feel, if I... if I gave you your copy in person?'
encoded_text = tokenizer(example, return_tensors='pt')
output = model(**encoded_text)
# First output element, first row -> NumPy array -> normalised with softmax
scores = softmax(output[0][0].detach().numpy())
scores.argmax()

1

In [75]:
def polarity_scores_roberta(example):
    """Return the predicted class index for one text.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    example : str
        Text to classify.

    Returns
    -------
    NumPy integer
        Position of the highest-scoring class in the model output.
    """
    # Local import: torch is not imported in the notebook's import cell.
    import torch

    # truncation=True clips over-long inputs to the tokenizer's maximum length
    # instead of letting the forward pass fail on them.
    encoded_text = tokenizer(example, return_tensors='pt', truncation=True)

    # Inference only: disable gradient tracking for the forward pass.
    with torch.no_grad():
        output = model(**encoded_text)

    scores = output[0][0].numpy()
    # softmax is monotonic, so the argmax of the raw scores is the same class
    # as the argmax of the normalised probabilities.
    label = scores.argmax()

    return label

In [77]:
# Classify every text in the evaluation frame and time the full pass
roBerta_res = []
start_time = time.time()
for text in tqdm(df['text'], total=len(df)):
    roBerta_res.append(polarity_scores_roberta(text))
elapsed_time = time.time() - start_time

print(classification_report(y_test, roBerta_res))
print("Time: {:.3f}s".format(elapsed_time))

  0%|          | 0/6000 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.68      0.87      0.77      2000
           1       0.62      0.55      0.58      2000
           2       0.84      0.70      0.77      2000

    accuracy                           0.71      6000
   macro avg       0.71      0.71      0.70      6000
weighted avg       0.71      0.71      0.70      6000

Time: 447.461s


## Test of BiLSTM

In [78]:
# Import word vectors

# token -> embedding vector; filled in place by add_to_dict
words = {}
# Forward slashes work on both Windows and POSIX and avoid the invalid "\g"
# escape sequence that the backslash-separated literal contained.
wird_filepath = 'embeddings/glove.twitter.27B/glove.twitter.27B.100d.txt'

def add_to_dict(d, filename):
    """Read space-separated word vectors from a text file into ``d``.

    Every line is expected to have the form ``token v1 v2 ... vn``. The token
    becomes the key and the remaining fields become a float NumPy array.
    Blank lines, lines with no vector fields, and lines whose vector fields
    are not numeric are skipped.

    Parameters
    ----------
    d : dict
        Mapping that is updated in place (token -> ``numpy.ndarray``).
    filename : str
        Path of the vector file to read.
    """
    # Explicit encoding so decoding does not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as f:
        # Stream line by line instead of loading the whole file with readlines().
        for line in f:
            parts = line.rstrip('\n').split(' ')

            # Nothing after the token (or an empty line): no vector to store.
            if len(parts) < 2:
                continue

            try:
                d[parts[0]] = np.array(parts[1:], dtype=float)
            except ValueError:
                # At least one field could not be parsed as a float.
                continue

# Fill `words` in place from the vector file
add_to_dict(words, wird_filepath)

In [79]:
# Number of entries loaded into the dictionary
len(words)

1193514

In [80]:
# Fetch the NLTK 'wordnet' package (presumably required by the WordNetLemmatizer used below)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Splute\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [81]:
# Word segmentation by regular expression: every run of word characters
# (\w+) is a token; everything else acts as a separator.
# Deliberately NOT named `tokenizer`: that global is the Hugging Face tokenizer
# read by polarity_scores_roberta, and rebinding it here would break that
# function if it is called again after this cell has run.
word_tokenizer = nltk.RegexpTokenizer(r"\w+")

# Reduces a word to its root form
lemmatizer = WordNetLemmatizer()

# Quick check that the lemmatizer works; the value is not displayed because
# this is not the last expression of the cell.
lemmatizer.lemmatize('feet')

# preprocess
def msg_to_token_list(string):
    """Turn a raw message into the list of tokens that have a word vector.

    Steps: regex tokenisation, lower-casing, lemmatisation, then dropping
    every token that is not a key of the global ``words`` dictionary.

    Parameters
    ----------
    string : str
        Raw message text.

    Returns
    -------
    list of str
        Processed tokens, in their original order.
    """
    tokens = word_tokenizer.tokenize(string)
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    useful_tokens = [token for token in lemmatized_tokens if token in words]

    return useful_tokens

In [82]:
def msg_to_word_vectors(msg, word_dict = words):
    """Convert a message into an array holding one word vector per kept token.

    The message is preprocessed with ``msg_to_token_list``; each resulting
    token that is a key of ``word_dict`` contributes its vector, in order.

    Parameters
    ----------
    msg : str
        Raw message text.
    word_dict : dict
        Mapping from token to vector (defaults to the global ``words``).

    Returns
    -------
    numpy.ndarray
        Float array built from the collected vectors; empty when no token
        has a vector.
    """
    collected = [
        word_dict[tok]
        for tok in msg_to_token_list(msg)
        if tok in word_dict
    ]
    return np.array(collected, dtype=float)

In [83]:
# Shape check on a sample message containing punctuation
msg_to_word_vectors('@#Did you feel happy?').shape


(4, 100)

In [84]:
# get x and y
def dff_to_X_y(dff):
    """Build per-message vector sequences and the label array from a frame.

    Parameters
    ----------
    dff : pandas.DataFrame
        Frame with a ``'text'`` column and a ``'label'`` column.

    Returns
    -------
    tuple
        ``(sequences, labels)``: a list with one 2-D array per message, and
        the labels as an integer NumPy array.
    """
    labels = dff['label'].to_numpy().astype(int)

    sequences = []
    for message in dff['text']:
        vectors = msg_to_word_vectors(message)
        # A message with no usable token is represented by one all-zero vector
        if vectors.shape[0] == 0:
            vectors = np.zeros(shape=(1,100))
        sequences.append(vectors)

    return sequences, labels


In [85]:
def pad_X(X, desired_sequence_length = 38):
    """Bring every sequence to the same length and stack them into one array.

    Sequences shorter than ``desired_sequence_length`` are padded at the end
    with zero rows. Longer sequences are cut to their first
    ``desired_sequence_length`` rows (previously they raised ``ValueError``
    because the pad size became negative). The input list and its arrays are
    left unmodified.

    Parameters
    ----------
    X : sequence of numpy.ndarray
        2-D arrays of shape ``(sequence_length, embedding_size)``.
    desired_sequence_length : int
        Number of rows every output sequence has.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(X), desired_sequence_length, embedding_size)``.
    """
    padded = []

    for x in X:
        # Slicing is a no-op for short sequences and truncates long ones.
        x = np.asarray(x, dtype=float)[:desired_sequence_length]
        missing_rows = desired_sequence_length - x.shape[0]

        # Pad width follows the sequence's own embedding size instead of a
        # hard-coded 100.
        pad = np.zeros(shape=(missing_rows, x.shape[1]))

        padded.append(np.concatenate([x, pad]))

    return np.array(padded).astype(float)

In [86]:
# Convert the evaluation frame into vector sequences and labels, then pad the
# sequences to a common length so they form a single array.
# Note: `X_test` is first the list of sequences and then rebound to the padded array.
X_test, y_test = dff_to_X_y(df)
X_test = pad_X(X_test)

X_test.shape,y_test.shape

((6000, 38, 100), (6000,))

In [88]:
# Load the saved Keras model from disk
best_model = load_model('models/GloVe_LSTM')

# Time prediction plus conversion of the model outputs into class indices
start_time = time.time()
predictions = best_model.predict(X_test)
# Predicted class = position of the largest value in each output row
biLSTM_res = [pred.argmax() for pred in predictions]
elapsed_time = time.time() - start_time

print(classification_report(y_test, biLSTM_res))
print("Time: {:.3f}s".format(elapsed_time))

              precision    recall  f1-score   support

           0       0.71      0.63      0.67      2000
           1       0.51      0.65      0.57      2000
           2       0.75      0.65      0.70      2000

    accuracy                           0.64      6000
   macro avg       0.66      0.64      0.64      6000
weighted avg       0.66      0.64      0.64      6000

Time: 2.052s


## The end