In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#pre processing
import regex as re
import string
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler


import tensorflow as tf
from tensorflow import keras


#transformers
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Global plot styling: seaborn "white" theme plus bold axis labels and titles.
# NOTE: sns.despine() is deliberately not called here. It acts on the current
# figure, so calling it before anything has been drawn only creates an empty
# figure (a stray "<Figure ... with 0 Axes>" output) and has no effect on
# plots drawn later. Call sns.despine() right after drawing a plot instead.
sns.set_style('white')
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

<Figure size 640x480 with 0 Axes>

In [4]:
# Load the tweet dataset from a CSV in the working directory and preview it.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably an index
# that was written to the file — confirm whether index_col=0 is wanted.
df= pd.read_csv('./final.csv')
df.head()

Unnamed: 0,sentiment,tweet
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,0.0,My cat only chews @apple cords. Such an #Apple...
3,0.0,I agree with @jimcramer that the #IndividualIn...
4,0.0,Nobody expects the Spanish Inquisition #AAPL


In [5]:
# Display the full raw text of the first tweet (the table preview truncates it).
df.iloc[0].tweet

'#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx'

### Deep Data Cleaning

In [6]:
# Characters deleted by strip_all_entities: ASCII punctuation plus a few
# mojibake characters. The table is built once here instead of on every call.
_BANNED_CHARS = string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
_BANNED_TABLE = str.maketrans('', '', _BANNED_CHARS)

def strip_all_entities(text):
    """Lowercase a tweet and strip links, mentions, non-ASCII and punctuation.

    Steps, in order:
      1. delete '\\r', turn '\\n' into a space, lowercase everything;
      2. delete every token starting with '@', 'http://' or 'https://'
         (up to the next whitespace);
      3. delete every character outside the ASCII range;
      4. delete every character in string.punctuation (this includes '#',
         '$', '&' and '_').

    Punctuation is deleted, not replaced by a space, so words joined only by
    punctuation are merged (e.g. 'aapl:the' becomes 'aaplthe').

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Whitespace runs left behind by the removals are
        not collapsed here.
    """
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)  # links and mentions
    text = re.sub(r'[^\x00-\x7f]', r'', text)  # anything that is not ASCII
    return text.translate(_BANNED_TABLE)

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    """Drop trailing hashtags and un-hash the ones inside the sentence.

    A run of '#tag' tokens at the very end of the tweet is removed entirely.
    Any remaining '#' (and '_') is treated as a separator: the text is split
    on it and re-joined with single spaces, so '#tag' in the middle of the
    sentence becomes 'tag' and 'snake_case' becomes 'snake case'.

    The patterns are raw strings on purpose. In a normal string literal '\\b'
    is a backspace character rather than a regex word boundary, which silently
    disabled the '(?!hashtag\\b)' guard that keeps the literal word '#hashtag'
    from being removed as a trailing tag.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The tweet with trailing hashtags removed and '#'/'_' replaced by spaces.
    """
    trailing_hashtags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    # Remove the hashtags that close the tweet.
    new_tweet = " ".join(word.strip() for word in re.split(trailing_hashtags, tweet))
    # Keep the words of the remaining hashtags, dropping only the symbol.
    new_tweet2 = " ".join(word.strip() for word in re.split(r'#|_', new_tweet))
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    """Blank out every space-separated token that contains '$' or '&'.

    Offending tokens are replaced by empty strings rather than dropped, so
    the result keeps one separator per original token and may therefore
    contain runs of consecutive spaces.
    """
    kept = [
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    ]
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (including a lone tab or newline) is left
    untouched, and leading/trailing whitespace is not stripped.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\s\s+", " ", text)

In [7]:
# Run every raw tweet through the cleaning steps, innermost call first:
# strip_all_entities -> clean_hashtags -> filter_chars -> remove_mult_spaces.
# NOTE(review): strip_all_entities deletes all of string.punctuation, which
# includes '#', '$', '&' and '_', so the hashtag and special-character steps
# that follow never see those characters — confirm this order is intended.
texts_new = [
    remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(raw_tweet))))
    for raw_tweet in df.tweet
]

In [8]:
# Store the cleaned text next to the raw tweet and preview the result.
df['clean_text']=texts_new
df.head()

Unnamed: 0,sentiment,tweet,clean_text
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,aaplthe 10 best steve jobs emails ever
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,rt why aapl stock had a miniflash crash today ...
2,0.0,My cat only chews @apple cords. Such an #Apple...,my cat only chews cords such an applesnob
3,0.0,I agree with @jimcramer that the #IndividualIn...,i agree with that the individualinvestor shoul...
4,0.0,Nobody expects the Spanish Inquisition #AAPL,nobody expects the spanish inquisition aapl


In [9]:
# Word count of each cleaned tweet (whitespace-separated tokens).
text_len = [len(cleaned.split()) for cleaned in df.clean_text]

In [10]:
# Attach the per-tweet word counts as a column and preview the result.
df['text_len']=text_len
df.head()

Unnamed: 0,sentiment,tweet,clean_text,text_len
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,aaplthe 10 best steve jobs emails ever,7
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,rt why aapl stock had a miniflash crash today ...,11
2,0.0,My cat only chews @apple cords. Such an #Apple...,my cat only chews cords such an applesnob,8
3,0.0,I agree with @jimcramer that the #IndividualIn...,i agree with that the individualinvestor shoul...,21
4,0.0,Nobody expects the Spanish Inquisition #AAPL,nobody expects the spanish inquisition aapl,6


In [11]:
# Display the raw text of the first tweet.
df.iloc[0].tweet

'#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx'

In [12]:
# Preview the frame, now carrying the clean_text and text_len columns.
df.head()

Unnamed: 0,sentiment,tweet,clean_text,text_len
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,aaplthe 10 best steve jobs emails ever,7
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,rt why aapl stock had a miniflash crash today ...,11
2,0.0,My cat only chews @apple cords. Such an #Apple...,my cat only chews cords such an applesnob,8
3,0.0,I agree with @jimcramer that the #IndividualIn...,i agree with that the individualinvestor shoul...,21
4,0.0,Nobody expects the Spanish Inquisition #AAPL,nobody expects the spanish inquisition aapl,6


### Tokenizing

In [16]:
# Load the pretrained fast tokenizer for the "roberta-base" checkpoint.
tokenizer_roberta = RobertaTokenizerFast.from_pretrained("roberta-base")

vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.16MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 6.56MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 1.40MB/s]
config.json: 100%|██████████| 481/481 [00:00<00:00, 1.26MB/s]


In [19]:
# Number of token ids RoBERTa's tokenizer produces for each cleaned tweet
# (encoding is truncated at 512 ids), followed by the longest one observed.
token_lens_test = [
    len(tokenizer_roberta.encode(cleaned, max_length=512, truncation=True))
    for cleaned in df['clean_text'].values
]

max_len = np.max(token_lens_test)
print(f"MAX TOKENIZED SENTENCE LENGTH: {max_len}")


MAX TOKENIZED SENTENCE LENGTH: 51


In [22]:
# Per-tweet token counts, in the row order of df.
# NOTE(review): this repeats the encoding already done for `token_lens_test`.
token_len = [
    len(tokenizer_roberta.encode(cleaned, max_length=512, truncation=True))
    for cleaned in df['clean_text'].values
]

In [32]:
# Attach the per-tweet token counts as a column and preview the result.
df['token_len'] = token_len
df.head()

Unnamed: 0,sentiment,tweet,clean_text,text_len,token_len
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,aaplthe 10 best steve jobs emails ever,7,12
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,rt why aapl stock had a miniflash crash today ...,11,21
2,0.0,My cat only chews @apple cords. Such an #Apple...,my cat only chews cords such an applesnob,8,12
3,0.0,I agree with @jimcramer that the #IndividualIn...,i agree with that the individualinvestor shoul...,21,29
4,0.0,Nobody expects the Spanish Inquisition #AAPL,nobody expects the spanish inquisition aapl,6,13


In [34]:
# Order the tweets by token count, longest first, and show the top 10.
# This rebinds `df`, so every later cell sees the sorted row order.
df = df.sort_values(by='token_len', ascending=False)
df.head(10)

Unnamed: 0,sentiment,tweet,clean_text,text_len,token_len
6154,0.0,Japoneses acampan afuera de la tienda de @Appl...,japoneses acampan afuera de la tienda de en to...,20,51
5860,0.0,Que golazo el de @nokia_co frente al lanzamien...,que golazo el de frente al lanzamiento de los ...,20,48
5861,0.0,Sayin @apple inc telefonu ucuz yapicam diyip 5...,sayin inc telefonu ucuz yapicam diyip 550 dola...,17,45
5920,0.0,@myapple_pl no i tak ma zostac bo hejtuje tylk...,no i tak ma zostac bo hejtuje tylko ten co nig...,20,43
6189,0.0,@apple care protection is onzin voor veel geld...,care protection is onzin voor veel geld bij ma...,20,43
6251,0.0,Desaparecido el iPhone 5 de la Web de Apple (D...,desaparecido el iphone 5 de la web de apple de...,24,42
5896,0.0,"No, el 5S lo toi josiando! de apps que tengan ...",no el 5s lo toi josiando de apps que tengan mu...,22,41
5509,1.0,@Apple: You will be in my hands in just 8 days...,you will be in my hands in just 8 days iphone5...,16,41
5909,0.0,"Ik ga @HTC, @Samsung, @LG, @Sony, @Apple ontvo...",ik ga ontvolgen omdat ze verdommen een phone m...,19,40
5948,0.0,Si van a cambiar celu no lo hagan a iphone de ...,si van a cambiar celu no lo hagan a iphone de ...,24,40


In [35]:
# Number of tweets per sentiment label.
df['sentiment'].value_counts()

sentiment
 0.0    3676
-1.0    2235
 1.0     704
Name: count, dtype: int64

### Balancing the dataset

In [43]:
# Balance the classes by randomly duplicating minority-class tweets until
# every sentiment label has as many rows as the largest class.
# random_state is fixed so the resampled set is the same on every run.
# NOTE(review): the train/validation split is taken from this oversampled
# frame, so copies of the same tweet can end up on both sides of the split —
# consider splitting first and oversampling only the training part.
ros = RandomOverSampler(random_state=42)
train_x, train_y = ros.fit_resample(
    df[['clean_text']].to_numpy(),  # X must be 2-D: one column holding the text
    df['sentiment'].to_numpy(),     # y is passed 1-D, as fit_resample expects
)
train_os = pd.DataFrame({'clean_text': train_x[:, 0], 'sentiment': train_y})

In [45]:
# Number of rows per sentiment label in the oversampled frame.
train_os['sentiment'].value_counts()


sentiment
 0.0    3676
 1.0    3676
-1.0    3676
Name: count, dtype: int64

### Train-validation dataset split

In [47]:
# Features (cleaned text) and labels (sentiment) as NumPy arrays, taken from
# the oversampled frame.
X = train_os['clean_text'].values
y = train_os['sentiment'].values

In [48]:
# Hold out 10% of the rows for validation, stratified by label, with a fixed seed.
# NOTE(review): X/y come from the oversampled frame, so duplicated tweets may
# appear in both the training and the validation part — confirm this is acceptable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

### RoBERTa Sentiment Analysis

In [49]:
# Fixed sequence length passed to the tokenizer below. The longest cleaned
# tweet measured earlier in this notebook was 51 tokens.
MAX_LEN=128


In [50]:
def tokenize_roberta(data,max_len=MAX_LEN) :
    """Encode texts with `tokenizer_roberta` into fixed-length id/mask arrays.

    Parameters
    ----------
    data : sequence of str
        Texts to encode. They are iterated in order, so lists, NumPy arrays
        and pandas Series are all accepted.
    max_len : int, optional
        Length every encoded sequence is padded or truncated to. Defaults to
        the value MAX_LEN had when this function was defined.

    Returns
    -------
    tuple of numpy.ndarray
        (input_ids, attention_masks), one row per text.
    """
    input_ids = []
    attention_masks = []
    for text in data:
        encoded = tokenizer_roberta.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            # Without truncation a text longer than max_len keeps its full
            # length and the rows no longer form a rectangular array.
            truncation=True,
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

In [51]:
# Encode the training and validation texts into input-id and attention-mask arrays.
train_input_ids, train_attention_masks = tokenize_roberta(X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize_roberta(X_valid, MAX_LEN)
#test_input_ids, test_attention_masks = tokenize_roberta(X_test, MAX_LEN)


### Building Model

In [53]:
# Load the pretrained TensorFlow RoBERTa model for the 'roberta-base' checkpoint.
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

model.safetensors: 100%|██████████| 499M/499M [00:48<00:00, 10.2MB/s] 
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaModel: ['roberta.embeddings.position_ids', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing TFRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaModel were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this mo