# Sentiment classification with transformers 

In [None]:
# data processing tools
import os
import csv
import urllib.request
import pandas as pd
from tqdm import tqdm

# maths tools
import numpy as np
from scipy.special import softmax

# Huggingface tools
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline, set_seed

## Choose task specific model

The ```twitter-roberta-base``` model has been finetuned on a number of slightly different but related sentiment-style tasks. Specific models are finetuned to predict the following labels:

- emotion
- hate
- irony
- offensive
- sentiment
- emoji

In [None]:
# task whose fine-tuned checkpoint we want (one of the tasks listed above);
# it is also reused further down to build the label-mapping URL
task='sentiment'
# model identifier handed to the from_pretrained calls below
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

## Initialize tokenizer

We initialize a pretrained tokenizer, which we need in order to tokenize our texts.

In [None]:
# load the tokenizer that belongs to the checkpoint named in MODEL
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Download label mappings

This specific model requires us to download the labels that we're going to use separately - this isn't the case with every model. We'll see more on that below, under ```HuggingFace pipelines```.

In [None]:
# Fetch the tab-separated label mapping for the chosen task and keep the
# second column of every row (presumably the human-readable label name).
labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"

with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# rows with fewer than two fields (e.g. a trailing blank line) are skipped
for fields in csv.reader(mapping_lines, delimiter='\t'):
    if len(fields) > 1:
        labels.append(fields[1])

## Initialize model

In [None]:
# TensorFlow flavour of the sequence-classification model, loaded from the same
# checkpoint (MODEL) as the tokenizer
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# optional, left disabled: store a local copy of the loaded checkpoint
#model.save_pretrained(MODEL)

## Predict text

In [None]:
# example sentence to classify
text = "Oh, well, that sounds just great"

# tokenize into TensorFlow tensors and run them through the model
encoded_input = tokenizer(text, return_tensors='tf')
output = model(encoded_input)

# take the first element of the model output and the first item in it
# (presumably the logits for our single input sentence) as a numpy array
raw_scores = output[0][0].numpy()

# softmax turns the raw values into scores that sum to 1
scores = softmax(raw_scores)

## Get ranked scores

In [None]:
# label indices ordered from the highest score to the lowest
ranking = np.argsort(scores)[::-1]

# one "<rank>) <label> <score>" line per label, score rounded to 4 decimals
for position, label_idx in enumerate(ranking, start=1):
    rounded_score = np.round(float(scores[label_idx]), 4)
    print(f"{position}) {labels[label_idx]} {rounded_score}")

## HuggingFace pipelines

Pipelines give us less control over the fine details but instead allow us to quickly generate results with default parameters.

In [None]:
# text-classification pipeline around the emotion checkpoint;
# return_all_scores=True asks for a score for every label, not just the best one
# NOTE(review): newer transformers releases appear to deprecate return_all_scores
# in favour of top_k — confirm against the installed version
classifier = pipeline("text-classification", 
                      model = "cardiffnlp/twitter-roberta-base-emotion", 
                      return_all_scores=True)
# NOTE: this rebinds `scores`, which until now held the softmax array from the
# manual prediction above
scores = classifier("you suck! 🤬")

In [None]:
# display the raw output returned by the pipeline call
scores

These results don't look great because of the label problem we saw earlier: the model does not come with readable label names (recall that we had to download the label mapping separately above). We can fix that afterwards by mapping the label names onto these results.

Alternatively, we could use a different model that doesn't have these problems.

In [None]:
# same kind of pipeline, but built on a different emotion checkpoint;
# this replaces the `classifier` created from the cardiffnlp model
# NOTE(review): newer transformers releases appear to deprecate return_all_scores
# in favour of top_k — confirm against the installed version
classifier = pipeline("text-classification",
                      model='bhadresh-savani/distilbert-base-uncased-emotion', 
                      return_all_scores=True)
# keep the result so it can be displayed in a later cell
prediction = classifier("you suck! 🤬")

In [None]:
# Run `classifier` over every review and collect (text, prediction) pairs.
#
# NOTE: `data` is only created in the "Score dataframe" cell further down, so
# that cell has to be executed before this one.
#
# Iterating the "text" column directly (instead of data.iterrows()) gives tqdm a
# known length, so it can draw a real progress bar with an ETA. The comprehension
# also no longer overwrites the global `prediction` that a later cell displays.
all_scores = [
    (review_text, classifier(review_text))
    for review_text in tqdm(data["text"], total=len(data))
]

In [None]:
# Default sentiment-analysis pipeline, shown as a quick one-off example.
# It gets its own name so it does not replace `classifier` (the emotion
# pipeline), which the dataframe-scoring loop above still relies on when it is
# run after the data has been loaded.
# NOTE(review): no model is named here, so transformers falls back to its
# default checkpoint for this task — consider pinning one explicitly.
sentiment_classifier = pipeline("sentiment-analysis")
sentiment_classifier("I loved Star Wars so much!")

In [None]:
# display whatever is currently bound to `prediction`
# (it is assigned by the classifier cells above)
prediction

## Score dataframe

In [None]:
# path to the Yelp reviews file, relative to the notebook's working directory
filename = os.path.join("..", "..", "CDS-LANG", "reviews", "yelp_labelled.txt")

# tab-separated file without a header row, so the two columns are named here
data = pd.read_csv(
    filename,
    sep="\t",
    header=None,
    names=["text", "label"],
)

## Text generation

In [None]:
# text-generation pipeline backed by the gpt2 checkpoint
generator = pipeline('text-generation', model='gpt2')
# fix the random seed right before generating so the sampled texts are repeatable
set_seed(42)
# request 5 continuations of the prompt, each capped via max_length=50
generator("Hello, I'm a language model,", 
          max_length=50, 
          num_return_sequences=5)