# **3_Sentiment Prediction Twitter/Amazon**
This notebook evaluates the performance of two Hugging Face models on two datasets that have been preprocessed with BERT for slang word replacement.

# Libraries

In [None]:
# Install notebook dependencies. Only transformers is pinned (4.28.0);
# datasets, accelerate and cchardet resolve to whatever version pip picks at install time.
!pip install datasets transformers==4.28.0
!pip install --upgrade accelerate
!pip install cchardet

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from da

In [None]:
import pandas as pd
import numpy as np
import torch
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from tqdm import tqdm

# Parameters for the notebook

In [None]:
# Do not truncate long text columns when DataFrames are displayed
pd.set_option('display.max_colwidth', None)
# TO RUN LOCALLY OR ON COLAB
# False -> Colab (Google Drive gets mounted in the next cell); True -> data is read from a local path
is_local = False

# Reading in preprocessed data

In [None]:
# Resolve the base data directory used by every read/write in this notebook.
if not is_local:
    # On Colab the data lives on Google Drive, which has to be mounted first.
    from google.colab import drive
    drive.mount('/content/drive/')
    path_data = "/content/drive/My Drive/NLP_PROJECT/data/"
else:
    # Local run: data is read from the locally synced Google Drive folder.
    path_data = "D:/Google Drive/NLP_PROJECT/data/"

Mounted at /content/drive/


In [None]:
# Locations of the two BERT-preprocessed datasets
amazon_csv = path_data + "BERT_data/amazon_after_bert_small_colab.csv"
twitter_csv = path_data + "BERT_data/twitter_after_bert_small_colab.csv"

# Load both datasets into DataFrames
df_amazon = pd.read_csv(amazon_csv)
df_twitter = pd.read_csv(twitter_csv)

# Sanity check: row/column counts of each frame and the column names
# (the names are printed from the Amazon frame only)
print("df_amazon shape:", df_amazon.shape)
print("df_twitter shape:", df_twitter.shape)
print("columns:", list(df_amazon.columns))

df_amazon shape: (56697, 8)
df_twitter shape: (56484, 8)
columns: ['id', 'true_sentiment', 'text', 'candidates', 'updated_candidates', 'processed_text', 'is_same_as_original', 'chosen_translation']


In [None]:
# Keep only the rows where slang was replaced, i.e. where the processed
# text is flagged as different from the original text.
amazon_changed = df_amazon["is_same_as_original"] == False
twitter_changed = df_twitter["is_same_as_original"] == False
df_amazon = df_amazon[amazon_changed]
df_twitter = df_twitter[twitter_changed]

In [None]:
def save_results(df, file_name):
    """Write `df` as a CSV file into the `BERT_data/` folder under `path_data`.

    Args:
        df: DataFrame to persist.
        file_name: Target file name without the `.csv` extension.
    """
    target_path = "".join([path_data, "BERT_data/", file_name, ".csv"])
    # The DataFrame index is not written to the file
    df.to_csv(target_path, index=False)

# Models


### **[Twitter dataset](https://www.kaggle.com/datasets/gogylogy/twitterslang)** ➡️ binary sentiment classification with **[sentiment-roberta-large-english](https://huggingface.co/siebert/sentiment-roberta-large-english?text=should%27ve+been+asleep+two+hours+ago)**

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print("device:", device)

# Load the tokenizer and the classifier, move the classifier to the selected
# device and wrap it in a Trainer (used below for batched prediction)
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)
trainer = Trainer(model=model)

device: cuda


(…)glish/resolve/main/tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

(…)a-large-english/resolve/main/config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

(…)ta-large-english/resolve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

(…)ta-large-english/resolve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

(…)ish/resolve/main/special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
# adjusted from https://huggingface.co/siebert/sentiment-roberta-large-english
class SimpleDataset:
    """Minimal index-able dataset over a mapping of tokenizer output fields.

    `tokenized_texts` maps field names to per-sample sequences; it must contain
    an "input_ids" entry, whose length defines the dataset size.
    """

    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # One entry in "input_ids" per sample
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Collect the idx-th entry of every field into one sample dict
        sample = {}
        for field, values in self.tokenized_texts.items():
            sample[field] = values[idx]
        return sample

def make_predictions_roberta(df_original, text_for_prediction):
    """Predict a sentiment class for every non-null text in one column of `df_original`.

    Uses the module-level `tokenizer` and `trainer`. Predictions are attached to
    the source rows by position rather than by merging on the text value: a
    text-keyed merge multiplies rows whenever the same text occurs more than
    once, and fails to match texts whose value changes when cast to `str`.

    Args:
        df_original: DataFrame containing the columns 'id', 'true_sentiment',
            'text', 'chosen_translation', 'processed_text' and
            'is_same_as_original'.
        text_for_prediction: Name of the column whose text is passed to the model.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, one row per non-null text
        (in the original row order), holding the six columns listed above plus
        'predicted_sentiment' (the argmax class id of the model output).
    """
    output_columns = ['id', 'true_sentiment', 'text', 'chosen_translation',
                      'processed_text', 'is_same_as_original']

    # Only rows that actually have a text can be predicted; keeping this subset
    # around lets the predictions be aligned with their rows positionally.
    df_valid = df_original.dropna(subset=[text_for_prediction])
    df = df_valid[output_columns].reset_index(drop=True)
    if df.empty:
        # Nothing to predict: return the expected schema without calling the model
        df['predicted_sentiment'] = np.array([], dtype=int)
        return df

    pred_texts = df_valid[text_for_prediction].astype('str').tolist()
    # Tokenize texts and create prediction data set
    tokenized_texts = tokenizer(pred_texts, truncation=True, padding=True)
    pred_dataset = SimpleDataset(tokenized_texts)
    # Run predictions
    predictions = trainer.predict(pred_dataset)
    # Class id with the highest logit for each text, in the same order as pred_texts
    preds = predictions.predictions.argmax(-1)

    # Row i of `df` is the source row of pred_texts[i]
    df['predicted_sentiment'] = preds
    return df

#### Predictions for a text without slang replacement 🚫

In [None]:
# Predict on the original tweets (`text` column) and save the result as a CSV
df_twitter_pred_roberta_wo_slang = make_predictions_roberta(df_twitter, text_for_prediction = 'text')
save_results(df_twitter_pred_roberta_wo_slang, "twitter_roberta_wo_slang")

#### Predictions for a text with slang replacement ↪️

In [None]:
# Predict on the slang-replaced tweets (`processed_text` column) and save the result as a CSV
df_twitter_pred_roberta_with_slang = make_predictions_roberta(df_twitter, text_for_prediction = 'processed_text')
save_results(df_twitter_pred_roberta_with_slang, "twitter_roberta_with_slang")

### **[Amazon Product Review dataset](https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews)** ➡️ 1-5 classification with **[bert-base-multilingual-uncased-sentiment](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment)**

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print("device:", device)

# Load the tokenizer and the model from the same checkpoint and move the
# model to the selected device
bert_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer_bert = AutoTokenizer.from_pretrained(bert_model_name)
model_bert = AutoModelForSequenceClassification.from_pretrained(bert_model_name)
model_bert = model_bert.to(device)

device: cuda


(…)iment/resolve/main/tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

(…)cased-sentiment/resolve/main/config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

(…)uncased-sentiment/resolve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

(…)ent/resolve/main/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

In [None]:
# adjusted from https://medium.com/@johnthuo/developing-a-multilingual-sentiment-analysis-tool-integrated-with-openai-api-950b428af446
def generate_sentiment_score_bert(review, max_tokens=490):
    """Predict a 1-based sentiment class for a single review.

    Uses the module-level `tokenizer_bert`, `model_bert` and `device`.

    Args:
        review: Text to classify.
        max_tokens: Longest encoded length (in tokens, as produced by
            `tokenizer_bert.encode`) that is still sent to the model.

    Returns:
        The index of the highest logit plus one, as an `int`; or the empty
        string '' when the encoded review is longer than `max_tokens`
        (callers use '' as the "skipped" marker).
    """
    tokens = tokenizer_bert.encode(review, return_tensors='pt')
    # Over-long reviews are skipped rather than truncated
    if tokens.shape[1] > max_tokens:
        return ''
    # Pure inference: disable gradient tracking so no autograd graph is
    # built (and kept alive) for every single review.
    with torch.no_grad():
        result = model_bert(tokens.to(device))
    # Class indices are 0-based; the returned rating is 1-based
    return int(torch.argmax(result.logits)) + 1

def make_predictions_bert(df, text_for_prediction):
    """Predict a sentiment class for every row of `df`, one review at a time.

    Rows for which `generate_sentiment_score_bert` returns '' (the review was
    skipped) are left out of the result.

    Args:
        df: DataFrame containing the columns 'true_sentiment', 'text',
            'chosen_translation', 'processed_text' and 'is_same_as_original'.
        text_for_prediction: Name of the column whose text is passed to the model.

    Returns:
        A DataFrame with the columns 'id', 'true_sentiment', 'text',
        'chosen_translation', 'processed_text', 'is_same_as_original' and
        'predicted_sentiment', one row per review that was not skipped.
    """
    output_columns = ['id', 'true_sentiment',
                      'text', 'chosen_translation',
                      'processed_text',
                      'is_same_as_original',
                      'predicted_sentiment']
    # Collect plain records and build the frame once at the end; appending to a
    # DataFrame row by row gets slower with every row already stored.
    records = []
    # Wrapping the iterator advances the bar for every row, including skipped
    # ones, so the bar reaches 100% when the loop has seen all rows.
    for index, row in tqdm(df.iterrows(), total=len(df)):
        prediction = generate_sentiment_score_bert(row[text_for_prediction])
        if prediction == '':
            # review was skipped by the model helper
            continue
        records.append({
            # NOTE(review): this stores the DataFrame index label, not the value
            # of an 'id' column — confirm this is what downstream code expects.
            'id': index,
            'true_sentiment': row['true_sentiment'],
            'text': row['text'],
            'chosen_translation': row['chosen_translation'],
            'processed_text': row['processed_text'],
            'is_same_as_original': row['is_same_as_original'],
            'predicted_sentiment': prediction
        })
    # Passing the column list keeps the expected schema even with zero records
    return pd.DataFrame(records, columns=output_columns)

#### Predictions for a text without slang replacement 🚫

In [None]:
# Predict on the original reviews (`text` column) and save the result as a CSV
df_amazon_pred_bert_wo_slang = make_predictions_bert(df_amazon, text_for_prediction='text')
save_results(df_amazon_pred_bert_wo_slang, "amazon_bert_wo_slang")

 99%|█████████▉| 3777/3809 [01:05<00:00, 57.95it/s]


&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;---> all rows were processed; the bar stops short of 100% because reviews that are skipped (more than 490 tokens) never advance the progress bar

#### Predictions for a text with slang replacement ↪️

In [None]:
# Predict on the slang-replaced reviews (`processed_text` column) and save the result as a CSV
df_amazon_pred_bert_with_slang = make_predictions_bert(df_amazon, text_for_prediction = 'processed_text')
save_results(df_amazon_pred_bert_with_slang, "amazon_bert_with_slang")

 99%|█████████▉| 3776/3809 [01:05<00:00, 57.87it/s]


&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;---> all rows were processed; the bar stops short of 100% because reviews that are skipped (more than 490 tokens) never advance the progress bar