In [1]:
import numpy as np 
import pandas as pd 
import sys 
from pathlib import Path 
sys.path.append(str(Path.cwd().parent))
from Tools.sentiment_analysis import extract_sentiment_score
from Tools.sentiment_analysis import get_label

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment were not used when initializing RobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pred

## Evaluation Roberta model and Textblob model 

### 1. load the external data from Kaggle 

#### About this file

This is the sentiment140 dataset.
It contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 2 = neutral, 4 = positive) and they can be used to detect sentiment .
It contains the following 6 fields:

target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
ids: The id of the tweet ( 2087)
date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
flag: The query (lyx). If there is no query, then this value is NO_QUERY.
user: the user that tweeted (robotickilldozr)
text: the text of the tweet (Lyx is cool)
The official link regarding the dataset with resources about how it was generated is here
The official paper detailing the approach is here

According to the creators of the dataset:

"Our approach was unique because our training data was automatically created, as opposed to having humans manual annotate tweets. In our approach, we assume that any tweet with positive emoticons, like :), were positive, and tweets with negative emoticons, like :(, were negative. We used the Twitter Search API to collect these tweets by using keyword search"

citation: Go, A., Bhayani, R. and Huang, L., 2009. Twitter sentiment classification using distant supervision. CS224N Project Report, Stanford, 1(2009), p.12.

In [55]:
df_k = pd.read_csv('../../csv_files/kaggle_tweets.csv')

In [56]:
# modify to form with attribute label and text 
df_k = df_k[df_k.columns[[0, -1]]]
df_k.columns = ['target', 'text']
df_k['label'] = df_k['target'].map({0 : 'Negative', 2 : 'Neutral', 4 : 'Positive'})
df_k.head()

Unnamed: 0,target,text,label
0,0,is upset that he can't update his Facebook by ...,Negative
1,0,@Kenichan I dived many times for the ball. Man...,Negative
2,0,my whole body feels itchy and like its on fire,Negative
3,0,"@nationwideclass no, it's not behaving at all....",Negative
4,0,@Kwesidei not the whole crew,Negative


### 2. Load the external data from SemEval-2017 Task 4: Sentiment Analysis in Twitter

#### About the paper 

This paper describes the fifth year of the Sentiment Analysis in Twitter task. SemEval-2017 Task 4 continues with a rerun of the subtasks of SemEval-2016 Task 4, which include identifying the overall sentiment of the tweet, sentiment towards a topic with classification on a two-point and on a five-point ordinal scale, and quantification of the distribution of sentiment towards a topic across a number of tweets: again on a two-point and on a five-point ordinal scale. Compared to 2016, we made two changes: (i) we introduced a new language, Arabic, for all subtasks, and (ii) we made available information from the profiles of the Twitter users who posted the target tweets. The task continues to be very popular, with a total of 48 teams participating this year.

https://alt.qcri.org/semeval2017/task4/index.php?id=papers

#### About the file 

link download full data: https://alt.qcri.org/semeval2017/task4/?id=download-the-full-training-data-for-semeval-2017-task-4

Subtask A: Message Polarity Classification.
Given a message, classify whether the message is of positive, negative, or neutral sentiment.
The training set are manually annotated 

We only use the data of 2016-train

In [3]:
df = pd.read_table('../../csv_files/twitter-2016train-A.txt')
df.head()

Unnamed: 0,628949369883000832,negative,"dear @Microsoft the newOoffice for Mac is great and all, but no Lync update? C'mon."
0,628976607420645377,negative,@Microsoft how about you make a system that do...
1,629023169169518592,negative,I may be ignorant on this issue but... should ...
2,629179223232479232,negative,"Thanks to @microsoft, I just may be switching ..."
3,629186282179153920,neutral,If I make a game as a #windows10 Universal App...
4,629226490152914944,positive,"Microsoft, I may not prefer your gaming branch..."


In [4]:
# df.columns = ['id', 'topic', 'label', 'text']
df.columns = ['id', 'label', 'text']

In [5]:
df.drop(columns='id', inplace=True)
df['label'] = df['label'].map({'negative' : 'Negative', 'positive' : 'Positive', 'neutral' : 'Neutral'})

### 3. Evaluation Roberta model 

In [10]:
def random_check(n_check, n_samp, df):
    """
    do n_check loop on n_samp random observations from the df. get the prediction accuracy for each loop
    then get average over all loop
    """
    lst = []
    lst_TF = []
    count = 0
    for _ in range(n_check):
        df_light = df.sample(n=n_samp)
        lst_label = []
        lst_score = []
        for text in df_light['text']:  # loop through each row to get the prediction label and prediction score 
            count += 1
            if count%100 == 0:
                print(count)
            tup = extract_sentiment_score(text)
            lst_label.append(tup[0])
            lst_score.append(tup[1])
        df_light['prediction'] = lst_label
        df_light['sentiment_score'] = lst_score
        lst.append(sum(df_light['label'] == df_light['prediction'])/len(df_light))
        
        # Calculate TP/(TP + FP)
        df_light = df_light[df_light.prediction=='Positive'] 
        lst_TF.append(sum(df_light['label'] == df_light['prediction'])/len(df_light))
    return sum(lst)/len(lst), sum(lst_TF)/len(lst_TF)
random_check(n_check=5, n_samp=100, df=df)

100
200
300
400
500


(0.664, 0.8709007659007659)

### 4. Evaluation Textblob 

In [9]:
def random_check_tb(n_check, n_samp, df):
    """
    But for Textblob
    """
    lst = []
    lst_TF = []
    count = 0
    for _ in range(n_check):
        df_light = df.sample(n=n_samp)
        df_light['prediction'] = df_light['text'].apply(get_label)
        lst.append(sum(df_light['label'] == df_light['prediction'])/len(df_light))
        
        # Calculate TP/(TP + FP)
        df_light = df_light[df_light.prediction=='Positive'] 
        lst_TF.append(sum(df_light['label'] == df_light['prediction'])/len(df_light))
    return sum(lst)/len(lst), sum(lst_TF)/len(lst_TF)
random_check_tb(n_check=100, n_samp=100, df=df)    

(0.4650000000000001, 0.6088379640446522)