In [15]:
import os
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import ast
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
print(pipeline('sentiment-analysis')('we love you'))
print(pipeline('sentiment-analysis')('we hate you'))

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998704195022583}]
[{'label': 'NEGATIVE', 'score': 0.9988259673118591}]


In [3]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [4]:
encoded_text = tokenizer('very good movie!', return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)

{'roberta_neg': 0.0018993656, 'roberta_neu': 0.013890146, 'roberta_pos': 0.9842106}


In [5]:
def polarity_scores_roberta(example):
    try:
        encoded_text = tokenizer(example, return_tensors='pt')
        output = model(**encoded_text)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        scores_dict = {
            'roberta_neg' : scores[0],
            'roberta_neu' : scores[1],
            'roberta_pos' : scores[2]
        }
    except:
        scores_dict = {
            'roberta_neg' : 0,
            'roberta_neu' : 0,
            'roberta_pos' : 0
        }
    return scores_dict

In [13]:
def read_return_file(folder_path):
    files_path = os.listdir(folder_path)
    files_path.sort()
    data_path_daily = ['{}{}'.format(folder_path,file) for file in files_path]
    data_path_daily = pd.DataFrame(data_path_daily,columns=['File Path'])
    data_path_daily['File Name'] = files_path
    return data_path_daily['File Path']

In [7]:
for i,file in tqdm(enumerate(read_return_file('Datasets/Processed Tweet/'))):
    try:
        data = pd.read_csv(file,engine='python',index_col=0)
        data['sent_score']= data['clean_tweet'].astype('str').progress_apply(polarity_scores_roberta)
        data.to_csv('{}'.format(file))
    except:
        print(i)
        print(file)
        continue

100%|██████████| 19966/19966 [32:58<00:00, 10.09it/s]
100%|██████████| 20128/20128 [31:36<00:00, 10.61it/s]
100%|██████████| 20582/20582 [32:39<00:00, 10.51it/s]
100%|██████████| 20409/20409 [32:51<00:00, 10.35it/s]
100%|██████████| 20406/20406 [33:52<00:00, 10.04it/s]
5it [2:44:03, 1968.65s/it]


In [27]:
for i,file in tqdm(enumerate(read_return_file('Datasets/Processed Tweet/'))):
    data = pd.read_csv(file,engine='python',index_col=0)
    if 'POS' not in data.columns:
        try:
            data['sent_score'] = data['sent_score'].apply(lambda x: ast.literal_eval(x))
            data['POS'] = data['sent_score'].apply(lambda x: x['roberta_pos'])
            data['NEU'] = data['sent_score'].apply(lambda x: x['roberta_neu'])
            data['NEG'] = data['sent_score'].apply(lambda x: x['roberta_neg'])
            data.to_csv('{}'.format(file))
        except:
            print(i)
            print(file)
            continue
    else:
        continue

44it [00:16,  2.71it/s]
