In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#pre processing
import regex as re
import string
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Model

#
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel, TFAutoModelForSequenceClassification

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

2024-02-04 08:26:05.165849: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-04 08:26:05.576458: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-04 08:26:05.576579: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-04 08:26:05.618122: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-04 08:26:05.721650: I tensorflow/core/platform/cpu_feature_guar

In [3]:
#set style for plots
sns.set_style('white')
sns.despine()
#plt.style.use('seaborn-whitegrid')
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

<Figure size 640x480 with 0 Axes>

In [4]:
df= pd.read_csv('../dataset/final.csv')
df.head()

Unnamed: 0,sentiment,tweet
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,0.0,My cat only chews @apple cords. Such an #Apple...
3,0.0,I agree with @jimcramer that the #IndividualIn...
4,0.0,Nobody expects the Spanish Inquisition #AAPL


In [5]:
df.iloc[0].tweet

'#AAPL:The 10 best Steve Jobs emails ever...http://t.co/82G1kL94tx'

### Deep Data Cleaning

In [6]:
def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): # remove multiple spaces
    return re.sub("\s\s+" , " ", text)

In [7]:
texts_new = []
for t in df.tweet:
    texts_new.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(t)))))

In [8]:
df['clean_text']=texts_new
df.head()

Unnamed: 0,sentiment,tweet,clean_text
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,aaplthe 10 best steve jobs emails ever
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,rt why aapl stock had a miniflash crash today ...
2,0.0,My cat only chews @apple cords. Such an #Apple...,my cat only chews cords such an applesnob
3,0.0,I agree with @jimcramer that the #IndividualIn...,i agree with that the individualinvestor shoul...
4,0.0,Nobody expects the Spanish Inquisition #AAPL,nobody expects the spanish inquisition aapl


In [9]:
text_len = []
for text in df.clean_text:
    tweet_len = len(text.split())
    text_len.append(tweet_len)

In [10]:
df['text_len']=text_len
df[['sentiment', 'clean_text']].to_csv('../dataset/data.csv', index=False)

In [11]:
df.iloc[0].clean_text

'aaplthe 10 best steve jobs emails ever'

In [12]:
df.head()

Unnamed: 0,sentiment,tweet,clean_text,text_len
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,aaplthe 10 best steve jobs emails ever,7
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,rt why aapl stock had a miniflash crash today ...,11
2,0.0,My cat only chews @apple cords. Such an #Apple...,my cat only chews cords such an applesnob,8
3,0.0,I agree with @jimcramer that the #IndividualIn...,i agree with that the individualinvestor shoul...,21
4,0.0,Nobody expects the Spanish Inquisition #AAPL,nobody expects the spanish inquisition aapl,6


### Balancing the dataset

In [13]:
df['sentiment'].value_counts()

sentiment
 0.0    3676
-1.0    2235
 1.0     704
Name: count, dtype: int64

In [68]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['clean_text']).reshape(-1, 1), np.array(df['sentiment']).reshape(-1, 1))
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['clean_text', 'sentiment'])
train_os['sentiment'].value_counts()

sentiment
 0.0    3676
 1.0    3676
-1.0    3676
Name: count, dtype: int64

In [105]:
train_os = train_os[train_os.clean_text != '']
train_os.to_csv('../dataset/data.csv', index=None)

                                              clean_text  sentiment
0                 aaplthe 10 best steve jobs emails ever        0.0
1      rt why aapl stock had a miniflash crash today ...        0.0
2              my cat only chews cords such an applesnob        0.0
3      i agree with that the individualinvestor shoul...        0.0
4            nobody expects the spanish inquisition aapl        0.0
...                                                  ...        ...
11023  gotta love the genius bar thanks for your help...        1.0
11024                                       rt thank you        1.0
11025     iphone6 plus grabs 41 of us phablet sales aapl        1.0
11026  rt a3 yes and are volleying w quicker quicker ...        1.0
11027  aaplapple executives to take stand in antitrus...        1.0

[11009 rows x 2 columns]


### train-validation dataset split

In [113]:
from datasets import load_dataset

dataset = load_dataset("csv", split='train', data_files="../dataset/data.csv")
print(dataset['clean_text'][5758])

None


### Tokenizing

In [111]:
X_train, X_valid, y_train, y_valid = train_test_split(dataset['clean_text'], dataset['sentiment'], test_size=0.1, random_state=42)

print(X_train)



In [108]:
tokenizer= RobertaTokenizerFast.from_pretrained("roberta-base")

In [109]:
def tokenize(examples):
    token = tokenizer(examples['clean_text'], truncation=True, padding="max_length")
    return token

train_encoding = dataset.map(tokenize)

Map:  52%|█████▏    | 5758/11028 [00:00<00:00, 6404.39 examples/s]


ValueError: You need to specify either `text` or `text_target`.

### RoBERTa Sentiment Analysis

### Building Model

In [75]:
roberta_model = TFAutoModelForSequenceClassification.from_pretrained('cardiffnlp/twitter-roberta-base-sentiment-latest')

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## PPLX rocks


In [79]:

# Freeze pre-trained layers
for layer in roberta_model.layers:
    print(layer)
    layer.trainable = False

print(roberta_model.summary())
# Add new layers for your specific task

# print(model.summary())
# model.build(X_train.shape)

# print(model.summary())

# print(roberta_model.summary())
# Compile the model
# model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train the model
# model.fit(X_train, epochs=10, validation_data=X_valid)


<transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer object at 0x7982fb150190>
<transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead object at 0x7982e8112c10>
Model: "tf_roberta_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLaye  multiple                  124055040 
 r)                                                              
                                                                 
 classifier (TFRobertaClass  multiple                  592899    
 ificationHead)                                                  
                                                                 
Total params: 124647939 (475.49 MB)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 124647939 (475.49 MB)
_________________________________________________________________
None
