In [None]:
!pip install transformers

In [1]:
#IMPORT LIBS
import tensorflow as tf
import numpy as np
from transformers import TFAutoModel,RobertaTokenizer, TFRobertaModel, RobertaTokenizer, RobertaModel
from tensorflow.keras.callbacks import EarlyStopping
from tqdm import tqdm
from sklearn.metrics import f1_score,confusion_matrix,accuracy_score,recall_score,precision_score,classification_report
from keras.models import load_model
import re
import pandas as pd

In [2]:
#GET RAW DATA
df = pd.read_csv('data.csv')
df = df.fillna('')
print(df.head())

                                             Comment  ... HumanSentiment
0                    Good job everyone and happy 4th  ...       positive
1  Holding my CLOV call! Tendies for breakfast ar...  ...       positive
2  I went to ikea with my cousin today and holy s...  ...       negative
3  I’m a fucking moron and I’m never investing again  ...       negative
4  I can’t wait to go all in on TSLA calls for ne...  ...       positive

[5 rows x 3 columns]


In [3]:
#PARTITION AND SPLIT DATA
df2 = df[['ProcessedComments', 'HumanSentiment']]

test = df['HumanSentiment'].value_counts()

df2['sentiment_numeric']  = pd.factorize(df2["HumanSentiment"])[0] 

dictionary = pd.Series(df2["HumanSentiment"].values,index=df2['sentiment_numeric']).to_dict()
print(dictionary)
np.save('sentiments.npy', dictionary)

df2 = df2[['ProcessedComments', 'sentiment_numeric']]
print(df2.head())
print(len(df2))

from sklearn.model_selection import train_test_split
df_subset_train_model, df_subset_val_model = train_test_split(df2, test_size=0.1)


df2 = df_subset_train_model[['ProcessedComments','sentiment_numeric']]
print(len(df2))

{0: 'positive', 1: 'negative', 2: 'neutral'}
                                   ProcessedComments  sentiment_numeric
0                    Good job everyone and happy 4th                  0
1  Holding my CLOV call Tendies for breakfast are...                  0
2  I went to ikea with my cousin today and holy s...                  1
3    Im a fucking moron and Im never investing again                  1
4  I cant wait to go all in on TSLA calls for nex...                  0
2500
2250


In [4]:
#%ROBERTA SETUP
seq_len = 512
num_samples = len(df2)

Xids = np.zeros((num_samples, 512))
Xmask = np.zeros((num_samples, 512))

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

#from transformers import AutoTokenizer, TFAutoModel
#tokenizer = AutoTokenizer.from_pretrained("kamalkraj/deberta-base")

for i, phrase in enumerate(df2['ProcessedComments']):
    tokens = tokenizer.encode_plus(phrase, max_length=seq_len, truncation=True,
                                   padding='max_length', add_special_tokens=True,
                                   return_tensors='tf')
    Xids[i, :] = tf.cast(tokens['input_ids'],tf.float64)
    Xmask[i, :] = tf.cast(tokens['attention_mask'],tf.float64)
    
arr = df2['sentiment_numeric'].values
print(arr)

labels = np.zeros((num_samples, arr.max()+1))
print(labels.shape)

labels[np.arange(num_samples), arr] = 1


[0 2 2 ... 1 2 1]
(2250, 3)


In [5]:
#PREPARE DATA FOR TF MODEL
import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices((Xids,Xmask,labels))

def map_func(input_ids, masks, labels):                                    
    return {'input_ids': input_ids, 'attention_mask': masks}, labels

dataset = dataset.map(map_func)

batch_size = 13
dataset = dataset.shuffle(10000).batch(batch_size,drop_remainder=True)

split = 0.80
size = int((num_samples/batch_size) * split)

train_ds = dataset.take(size)
val_ds = dataset.skip(size)
print(train_ds)
print(val_ds)
del dataset

<TakeDataset shapes: ({input_ids: (13, 512), attention_mask: (13, 512)}, (13, 3)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>
<SkipDataset shapes: ({input_ids: (13, 512), attention_mask: (13, 512)}, (13, 3)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>


In [6]:
#%%RUN TF MODEL 100 EPOCHS

from transformers import TFAutoModel,RobertaTokenizer, TFRobertaModel
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, min_delta = 0.1, verbose = 1)

bert = TFRobertaModel.from_pretrained("roberta-base")
print(bert.summary())

input_ids = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

embeddings = bert.roberta(input_ids, attention_mask=mask)[1]

x = tf.keras.layers.Dense(1024,activation='relu')(embeddings)
y = tf.keras.layers.Dense(arr.max()+1,activation='softmax', name = 'outputs')(x)

model = tf.keras.Model(inputs=[input_ids,mask], outputs=y)
print(model.summary())

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-7,decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss = loss, metrics = [acc])

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs = 100#,
    #callbacks=[early_stopping]
)

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "tf_roberta_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
roberta (TFRobertaMainLayer) multiple                  124645632 
Total params: 124,645,632
Trainable params: 124,645,632
Non-trainable params: 0
_________________________________________________________________
None
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
roberta (TFRober

In [17]:
#SAVE MODEL
model.save('sentiment_model_teamsentiment')

!zip -r /content/sentiment_model_teamsentiment.zip /content/sentiment_model_teamsentiment

from google.colab import files
files.download("/content/sentiment_model_teamsentiment.zip")



INFO:tensorflow:Assets written to: sentiment_model_teamsentiment/assets


INFO:tensorflow:Assets written to: sentiment_model_teamsentiment/assets


  adding: content/sentiment_model_teamsentiment/ (stored 0%)
  adding: content/sentiment_model_teamsentiment/keras_metadata.pb (deflated 95%)
  adding: content/sentiment_model_teamsentiment/saved_model.pb (deflated 92%)
  adding: content/sentiment_model_teamsentiment/assets/ (stored 0%)
  adding: content/sentiment_model_teamsentiment/variables/ (stored 0%)
  adding: content/sentiment_model_teamsentiment/variables/variables.data-00000-of-00001 (deflated 27%)
  adding: content/sentiment_model_teamsentiment/variables/variables.index (deflated 80%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
#%% TEST MODEL

from sklearn.metrics import classification_report

def prep_data(text):
    tokens = tokenizer.encode_plus(text, max_length=512, truncation=True,
                                   padding='max_length', add_special_tokens=True,#return_token_type_id=False,
                                   return_tensors='tf')
    return{
        'input_ids': tf.cast(tokens['input_ids'], tf.float64),
        'attention_mask': tf.cast(tokens['attention_mask'], tf.float64)
        }

#test = prep_data('I hate all this')
#test = prep_data('GME to the Moon!')
#test = prep_data('I love this')
probs = model.predict(test)

np.argmax(probs[0])


1

In [14]:
# TEST MODEL ON VALIDATION DATA
df_subset_val_model = df_subset_val_model.reset_index(drop=True)
#del df_subset_val_model['index']
df_subset_val_model.head()

cor = df_subset_val_model['ProcessedComments'].apply(prep_data)


pred = [np.argmax((model.predict(cor[i]))[0]) for i in tqdm(range(len(cor)))]
pred2 = [model.predict(cor[i]) for i in tqdm(range(len(cor)))]
df_subset_val_model['predsentiment'] = pred
df_subset_val_model['predsentiment2'] = pred2

print(df_subset_val_model.head())

 96%|█████████▌| 240/250 [00:22<00:00, 10.67it/s][A
 97%|█████████▋| 242/250 [00:23<00:00, 10.57it/s][A
 98%|█████████▊| 244/250 [00:23<00:00, 10.47it/s][A
 98%|█████████▊| 246/250 [00:23<00:00, 10.49it/s][A
 99%|█████████▉| 248/250 [00:23<00:00, 10.36it/s][A
100%|██████████| 250/250 [00:23<00:00, 10.47it/s]

                                   ProcessedComments  ...                          predsentiment2
0  Stocks only go up Unless youre Chinese in whic...  ...   [[0.09437902, 0.5947556, 0.31086537]]
1  Seriously considering contacting the crazy ex ...  ...  [[0.04620931, 0.8963726, 0.057418104]]
2  Pltr is a long hold why is everyone in such a ...  ...   [[0.07441355, 0.7969895, 0.12859693]]
3  i just want to get rich enough to fuck AOC in ...  ...   [[0.32733282, 0.4882121, 0.18445505]]
4  Trading hack for you rookies Become a pattern ...  ...  [[0.16728131, 0.49700892, 0.33570972]]

[5 rows x 4 columns]





In [15]:
#%% CLASSIFICATION REPORT
print('\nclassification report:\n', classification_report(df_subset_val_model['sentiment_numeric'],df_subset_val_model['predsentiment']))

df_confusion = pd.crosstab(df_subset_val_model['sentiment_numeric'], df_subset_val_model['predsentiment'], rownames=['Actual'], colnames=['Predicted'], margins=True)
print(df_confusion)


classification report:
               precision    recall  f1-score   support

           0       0.67      0.71      0.69       107
           1       0.61      0.83      0.70        82
           2       0.54      0.21      0.31        61

    accuracy                           0.63       250
   macro avg       0.61      0.58      0.56       250
weighted avg       0.62      0.63      0.60       250

Predicted    0    1   2  All
Actual                      
0           76   25   6  107
1            9   68   5   82
2           29   19  13   61
All        114  112  24  250


In [16]:
df_subset_val_model.predsentiment2.apply(str).value_counts()

[[0.66461766 0.04147586 0.29390648]]    2
[[0.83657694 0.0675445  0.0958785 ]]    1
[[0.07257944 0.68512994 0.24229051]]    1
[[0.8139321  0.0926933  0.09337454]]    1
[[0.1391712 0.638525  0.2223038]]       1
                                       ..
[[0.03962601 0.9002433  0.06013076]]    1
[[0.78351676 0.07291437 0.14356884]]    1
[[0.1511761  0.7352238  0.11360016]]    1
[[0.64896655 0.20395236 0.14708109]]    1
[[0.43210772 0.31224355 0.25564873]]    1
Name: predsentiment2, Length: 249, dtype: int64

In [None]:
df_subset_train_model = df_subset_train_model.reset_index(drop=True)
#del df_subset_val_model['index']
df_subset_train_model.head()

cor = df_subset_train_model['ProcessedComments'].apply(prep_data)


pred = [np.argmax((model.predict(cor[i]))[0]) for i in tqdm(range(len(cor)))]
pred2 = [model.predict(cor[i]) for i in tqdm(range(len(cor)))]
df_subset_train_model['predsentiment'] = pred
df_subset_train_model['predsentiment2'] = pred2

print(df_subset_train_model.head())

#%%
print('\nclassification report:\n', classification_report(df_subset_train_model['sentiment_numeric'],df_subset_train_model['predsentiment']))

df_confusion = pd.crosstab(df_subset_train_model['sentiment_numeric'], df_subset_train_model['predsentiment'], rownames=['Actual'], colnames=['Predicted'], margins=True)
print(df_confusion)

print(df_subset_train_model.predsentiment2.apply(str).value_counts())