In [None]:
import pandas as pd
from string import Template
from pathlib import Path

import warnings
warnings.simplefilter("ignore")

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
# Attempt to fetch the tweet-emotions dataset from Kaggle.
# NOTE(review): the captured output of this cell reports `[text/html]` and a file
# saved as 'input?select=tweet_emotions.csv', i.e. this URL appears to return the
# Kaggle web page rather than the CSV itself. The later `pd.read_csv` call reads
# '/content/tweet_emotions.csv', which this command does not create -- confirm how
# the CSV is actually placed there (manual upload / Kaggle API) and fix this cell.
!wget 'https://www.kaggle.com/code/ghrangel/emotion-detection-with-llm/input?select=tweet_emotions.csv'

--2024-02-17 01:13:00--  https://www.kaggle.com/code/ghrangel/emotion-detection-with-llm/input?select=tweet_emotions.csv
Resolving www.kaggle.com (www.kaggle.com)... 35.244.233.98
Connecting to www.kaggle.com (www.kaggle.com)|35.244.233.98|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘input?select=tweet_emotions.csv’

          input?sel     [<=>                 ]       0  --.-KB/s               input?select=tweet_     [ <=>                ]  18.33K  --.-KB/s    in 0.02s   

2024-02-17 01:13:00 (875 KB/s) - ‘input?select=tweet_emotions.csv’ saved [18765]



In [None]:
# Hugging Face model identifier used for both the model and its tokenizer.
llm = "google/flan-t5-base"

In [None]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# Load the pretrained checkpoint named by `llm`, then move it to `device`.
model = T5ForConditionalGeneration.from_pretrained(llm)
model = model.to(device)

# Tokenizer that belongs to the same checkpoint.
tokenizer = T5Tokenizer.from_pretrained(llm)

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Read the tweet-emotion CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is an absolute, hard-coded location; the file must
# already exist there for this cell to run.
test = pd.read_csv(filepath_or_buffer='/content/tweet_emotions.csv')
test.head(n=5)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [None]:
# Work on a copy so the frame loaded into `test` keeps its original labels.
df_reduce = test.copy()

# Collapse the fine-grained labels into a smaller set. No source label is also
# a target label, so a single mapping gives the same result as applying the
# replacements one after another.
label_merges = {
    'happiness': 'fun',
    'enthusiasm': 'fun',
    'surprise': 'fun',
    'boredom': 'sadness',
    'hate': 'anger',
    'relief': 'neutral',
    'empty': 'neutral',
}
df_reduce['sentiment'] = df_reduce['sentiment'].replace(label_merges)

In [None]:
# Preview the first five rows after the labels have been merged.
df_reduce.head(n=5)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,neutral,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,fun,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [None]:
# List the distinct labels that remain in the 'sentiment' column.
df_reduce['sentiment'].unique()

array(['neutral', 'sadness', 'fun', 'worry', 'love', 'anger'],
      dtype=object)

In [None]:
# Instruction placed at the top of every prompt (the trailing space is intentional
# to keep the text identical to what the model was queried with).
preamble = 'Find the most appropriate sentiment with the letters A, B, C, D, E and F. '

# Prompt layout: instruction, blank line, text to classify, blank line, then one
# lettered option per line.
template = Template('\n'.join([
    '$preamble',
    '',
    '$prompt',
    '',
    'A) $a',
    'B) $b',
    'C) $c',
    'D) $d',
    'E) $e',
    'F) $f',
]))

In [None]:
def format_input(df, idx=-1, prompt=None):
    """Build the multiple-choice prompt text that is sent to the model.

    The text to classify is taken from ``df.loc[idx, 'content']`` when a row
    is selected, otherwise from ``prompt``. If both are supplied the row wins.

    Args:
        df: Object supporting ``df.loc[idx, 'content']``; only read when a
            row is selected.
        idx: Row label to classify. The default ``-1`` (or ``None``) means
            "no row selected; use ``prompt``".
        prompt: Free text to classify when no row is selected.

    Returns:
        str: The module-level ``template`` filled in with ``preamble``, the
        text to classify and the six lettered options.

    Raises:
        ValueError: If no row is selected and ``prompt`` is ``None``. Without
            this check the literal text "None" would be substituted into the
            prompt and silently classified.
    """
    if idx is not None and idx != -1:
        prompt = df.loc[idx, 'content']

    if prompt is None:
        raise ValueError(
            "format_input needs either a row index (idx) or a prompt string")

    # Option letter -> label, in the order shown to the model.
    options = {
        'a': 'neutral',
        'b': 'sadness',
        'c': 'fun',
        'd': 'worry',
        'e': 'love',
        'f': 'anger',
    }

    return template.substitute(preamble=preamble, prompt=prompt, **options)

In [None]:
# Row label of the example tweet used in the formatting preview.
val = 3

In [None]:
# Show the exact prompt text that would be built for row `val`.
print(format_input(df=df_reduce, idx=val))

Find the most appropriate sentiment with the letters A, B, C, D, E and F. 

wants to hang out with friends SOON!

A) neutral
B) sadness
C) fun
D) worry
E) love
F) anger


In [None]:
def generate(tokenizer, model, format_input, df_reduce, val=-1, prompt=None, device=device):
    """Classify one text with the model and return the chosen emotion label.

    The prompt text is built by ``format_input(df_reduce, val, prompt)``,
    tokenized, moved to ``device`` and passed to ``model.generate``. The
    decoded answer is then mapped from an option letter to its label.

    Args:
        tokenizer: Callable returning an encoding with a ``.to(device)``
            method, and providing ``batch_decode``.
        model: Object providing ``generate(**inputs)``.
        format_input: Callable ``(df, idx, prompt) -> str`` that builds the
            prompt text.
        df_reduce: Frame forwarded to ``format_input``.
        val: Row label forwarded to ``format_input`` (``-1`` = use ``prompt``).
        prompt: Free text forwarded to ``format_input``.
        device: Device the encoded inputs are moved to. The default is the
            value of the module-level ``device`` when this function was defined.

    Returns:
        str: ``'emotion expressesed:<label>'``. ``<label>`` is one of the six
        emotion labels, or ``'unknown'`` when the model output cannot be
        mapped to an option (previously this case raised UnboundLocalError).
    """
    # Option letter -> label; must mirror the options listed by format_input.
    letter_to_label = {
        'A': 'neutral',
        'B': 'sadness',
        'C': 'fun',
        'D': 'worry',
        'E': 'love',
        'F': 'anger',
    }

    inputs = tokenizer(format_input(df_reduce, val, prompt), return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Only one sequence is generated; tolerate surrounding whitespace.
    raw = answer[0].strip() if answer else ''
    letter = raw[:1].upper()

    if raw.lower() in letter_to_label.values():
        # The model answered with the label word itself (e.g. "love").
        ans = raw.lower()
    elif letter in letter_to_label and (len(raw) == 1 or not raw[1].isalnum()):
        # A bare letter, or a letter followed by punctuation such as "C)" or "C.".
        ans = letter_to_label[letter]
    else:
        # Anything else cannot be mapped to an option.
        ans = 'unknown'

    # NOTE: the misspelled prefix is kept unchanged so the returned string stays
    # identical to what existing callers and recorded outputs expect.
    return 'emotion expressesed:' + ans


In [None]:
# Classify a free-text sentence; `val` keeps its default, so no DataFrame row is read.
prompt = 'I love reading novels!'
generate(
    tokenizer=tokenizer,
    model=model,
    format_input=format_input,
    df_reduce=df_reduce,
    prompt=prompt,
    device=device,
)

'emotion expressesed:love'

In [None]:
# Classify a free-text sentence; `val` keeps its default, so no DataFrame row is read.
prompt = 'She danced in the rain, splashing in puddles.'
generate(
    tokenizer=tokenizer,
    model=model,
    format_input=format_input,
    df_reduce=df_reduce,
    prompt=prompt,
    device=device,
)

'emotion expressesed:fun'

In [None]:
# Classify a free-text sentence; `val` keeps its default, so no DataFrame row is read.
prompt = 'The delicious aroma of freshly baked cookies filled the kitchen.'
generate(
    tokenizer=tokenizer,
    model=model,
    format_input=format_input,
    df_reduce=df_reduce,
    prompt=prompt,
    device=device,
)

'emotion expressesed:love'

In [None]:
# Classify a free-text sentence; `val` keeps its default, so no DataFrame row is read.
prompt = 'He frowned as he struggled to solve the puzzle.'
generate(
    tokenizer=tokenizer,
    model=model,
    format_input=format_input,
    df_reduce=df_reduce,
    prompt=prompt,
    device=device,
)

'emotion expressesed:worry'