In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import numpy as np
import pandas as pd
import sqlite3
import os 
import re
from preprocessing import preprocessing_text
from typing import Tuple, Dict, List
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns 


#### 1. Load the pretrained RoBERTa sentiment checkpoint

# Checkpoint identifier handed to both `from_pretrained` calls below, so the
# tokenizer and the classifier always come from the same source.
roberta = 'cardiffnlp/twitter-roberta-base-sentiment'

# Tokenizer that turns raw text into the model's input tensors.
tokenizer = AutoTokenizer.from_pretrained(roberta)

# Sequence-classification model used by `extract_sentiment_score`.
model = AutoModelForSequenceClassification.from_pretrained(roberta)

#### 2. Preprocessing text

from preprocessing import preprocessing_text

#### 3. Functions to extract sentiment scores 

def extract_sentiment_score(text: str, max_length: int = 512) -> Tuple[str, float]:
    """
    Extract the sentiment label and a signed polarity score for an input text.
    ----------
    param: text
        raw text; it is passed through `preprocessing_text` before tokenization
    param: max_length
        maximum number of tokens fed to the model; longer inputs are truncated
        instead of crashing the forward pass (512 is the usual limit for
        RoBERTa-base checkpoints)
    return: tuple (label, polarity)
        label: one of ['Negative', 'Neutral', 'Positive'] (the most probable class)
        polarity: float between -1 and 1, computed as P(Positive) - P(Negative)
    """
    # Local import keeps this function self-contained; torch is already a
    # requirement of the `return_tensors='pt'` tokenizer output used below.
    import torch

    # Index i of the model output corresponds to labels[i] / polarity_weights[i].
    labels = ['Negative', 'Neutral', 'Positive']
    polarity_weights = np.array([-1, 0, 1])

    # preprocessing
    text = preprocessing_text(text)

    # extract score
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping, which also makes `.detach()` unnecessary.
    with torch.no_grad():
        output = model(**encoded_input)
    # output[0] holds the logits for the batch; [0] selects the single input text.
    scores = softmax(output[0][0].numpy())

    label = labels[int(np.argmax(scores))]
    # Probability-weighted sum of [-1, 0, 1], i.e. P(Positive) - P(Negative).
    polarity = float(np.dot(scores, polarity_weights))
    return (label, polarity)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing RobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pred

In [55]:
# Load the first evaluation set and reduce it to (target, text, label) in one pass.
# NOTE(review): read_csv treats the first line as a header by default; if the
# file has no header row, the first record is consumed as column names - confirm.
df = (
    pd.read_csv('../../csv_files/fuck_u.csv')
    # Keep only the first and the last column of the file.
    .pipe(lambda frame: frame[frame.columns[[0, -1]]])
    .set_axis(['target', 'text'], axis=1)
    # Translate the numeric target codes into the label names the model emits.
    .assign(label=lambda frame: frame['target'].map({0 : 'Negative', 2 : 'Neutral', 4 : 'Positive'}))
)

In [63]:
# Estimate how often a 'Positive' prediction is correct: for several rounds,
# score a random sample of `df` and record the share of rows predicted
# 'Positive' whose reference label is also 'Positive'. One value per round
# is appended to `lst`.
n_rounds = 10
n_samples = 100
seed = 42
# Row counts at which progress is echoed (only those <= n_samples can trigger).
progress_marks = {100, 500, 1000, 2000}

lst = []
for round_idx in range(n_rounds):
    # A fixed but distinct seed per round: rounds draw different samples, yet
    # the whole experiment is reproducible after a kernel restart.
    df_light = df.sample(n=n_samples, random_state=seed + round_idx)
    lst_label = []
    lst_score = []
    for count, text in enumerate(df_light['text'], start=1):
        if count in progress_marks:
            print(count)
        label, score = extract_sentiment_score(text)
        lst_label.append(label)
        lst_score.append(score)
    # assign() builds a new frame instead of writing into the sampled one.
    df_light = df_light.assign(prediction=lst_label, sentiment_score=lst_score)
    # Restrict to rows predicted 'Positive' so the ratio below is a precision.
    df_light = df_light[df_light['prediction'] == 'Positive']
    # Mean of the boolean match is the hit rate; it is NaN (instead of a
    # ZeroDivisionError) when a round produced no 'Positive' prediction.
    lst.append((df_light['label'] == df_light['prediction']).mean())

100
100
100
100
100
100
100
100
100
100


In [64]:
# Average of the per-round scores accumulated in `lst` by the most recently run sampling loop.
sum(lst)/len(lst)

0.7542794927041152

In [54]:
# Share of rows in `df_light` whose predicted label equals the reference label.
# NOTE(review): this cell's execution count (54) is lower than that of the cells
# above it, so the value displayed below was presumably computed from an earlier
# `df_light` - re-run to confirm.
sum(df_light['label'] == df_light['prediction'])/len(df_light)

0.52

In [85]:
# Load the second evaluation set and normalise it in a single, re-runnable step
# (no in-place drop / positional column overwrite that would fail on a second run).
# NOTE(review): read_table treats the first line as a header by default; if the
# file has no header row, the first record is consumed as column names - confirm.
df = (
    pd.read_table('../../csv_files/ok_ok.txt')
    .set_axis(['id', 'label', 'text'], axis=1)
    .drop(columns='id')
    # Capitalise the label names so they match the strings the model emits.
    .assign(label=lambda frame: frame['label'].map({'negative' : 'Negative', 'positive' : 'Positive', 'neutral' : 'Neutral'}))
)
df.head()

Unnamed: 0,label,text
0,Negative,Theo Walcott is still shit\u002c watch Rafa an...
1,Negative,its not that I\u2019m a GSP fan\u002c i just h...
2,Negative,Iranian general says Israel\u2019s Iron Dome c...
3,Neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
4,Neutral,I sat through this whole movie just for Harry ...


In [91]:
# Estimate overall accuracy: for several rounds, score a random sample of `df`
# and record the share of rows whose predicted label equals the reference
# label. One value per round is appended to `lst`; `df_light` keeps the last
# round's sample together with its predictions.
n_rounds = 10
n_samples = 100
seed = 42
# Row counts at which progress is echoed (only those <= n_samples can trigger).
progress_marks = {100, 500, 1000, 2000}

lst = []
for round_idx in range(n_rounds):
    # A fixed but distinct seed per round: rounds draw different samples, yet
    # the whole experiment is reproducible after a kernel restart.
    df_light = df.sample(n=n_samples, random_state=seed + round_idx)
    lst_label = []
    lst_score = []
    for count, text in enumerate(df_light['text'], start=1):
        if count in progress_marks:
            print(count)
        label, score = extract_sentiment_score(text)
        lst_label.append(label)
        lst_score.append(score)
    # assign() builds a new frame instead of writing into the sampled one.
    df_light = df_light.assign(prediction=lst_label, sentiment_score=lst_score)
    # Mean of the boolean match is the accuracy over all sampled rows.
    lst.append((df_light['label'] == df_light['prediction']).mean())

100
100
100
100
100
100
100
100
100
100


In [92]:
# Accuracy on `df_light` as left by the last sampling round: fraction of rows
# whose predicted label matches the reference `label`.
sum(df_light['label'] == df_light['prediction'])/len(df_light)

0.75