In [2]:
import warnings
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import pandas as pd
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Notebook-wide settings: silence library warnings and widen pandas' display
# limits so long tweet texts and large tables are shown without being cut off.
warnings.filterwarnings("ignore")
pd.set_option("display.max_colwidth", 100)
pd.set_option("display.max_rows", 2000)

In [3]:
# Load the tweets, then display the shape of the subset whose Ticker is
# missing (a row count of 0 means every tweet carries a ticker).
twitter_dataset = pd.read_csv('twitter_dataset.csv')
twitter_dataset.loc[twitter_dataset['Ticker'].isna()].shape

(0, 5)

In [4]:
#!g2.mig
import tensorflow as tf

# Abort early unless TensorFlow reports the first GPU as available.
device_name = tf.test.gpu_device_name()

if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


2023-01-18 16:21:02.736902: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-18 16:21:03.281493: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /device:GPU:0 with 2629 MB memory:  -> device: 0, name: GRID A100X-1-5C MIG 1g.5gb, pci bus id: 0000:8c:00.0, compute capability: 8.0


In [5]:
#!g2.mig
import torch

# Pick the torch device once: CUDA when available, CPU otherwise.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

if cuda_available:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: GRID A100X-1-5C MIG 1g.5gb


In [6]:
#!g2.mig
# Display (rows, columns) of the loaded frame.
twitter_dataset.shape

(870439, 5)

In [11]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Single checkpoint id shared by tokenizer and classifier so the two cannot
# drift apart when the model is swapped.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
#!g2.mig
import torch
from torch.utils.data import TensorDataset, SequentialSampler, DataLoader
def predict(sentences, device, finbert, tokenizer, batch_size=32):
    """Classify each sentence and return the predicted label names.

    Args:
        sentences: list of strings to classify.
        device: torch device the model and every batch are moved to.
        finbert: sequence-classification model. It must support ``.to()``,
            ``.eval()``, be callable with ``(input_ids, token_type_ids=...,
            attention_mask=...)`` returning an object with ``.logits``, and
            expose ``config.id2label``.
        tokenizer: callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors for a list of sentences.
        batch_size: number of sentences per forward pass (default 32).

    Returns:
        A list with one label string per input sentence, in input order.
    """
    # An empty batch has nothing to tokenize; return early instead of
    # handing an empty list to the tokenizer.
    if isinstance(sentences, (list, tuple)) and len(sentences) == 0:
        return []

    # truncation=True asks the tokenizer to cap inputs at its configured
    # maximum length (when it has one) so over-long texts do not overflow
    # the model's position embeddings.
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)

    prediction_data = TensorDataset(inputs['input_ids'], inputs['attention_mask'])
    prediction_dataloader = DataLoader(
        prediction_data,
        sampler=SequentialSampler(prediction_data),  # keep input order
        batch_size=batch_size,
    )

    # Use the model that was passed in, not a notebook-level global.
    finbert.to(device)
    finbert.eval()
    id2label = finbert.config.id2label

    predictions = []
    for batch in prediction_dataloader:
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        with torch.no_grad():
            outputs = finbert(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask)

        # One argmax over the class axis per batch instead of a Python-level
        # .item() call per row.
        class_ids = outputs.logits.argmax(dim=-1).tolist()
        predictions.extend(id2label[class_id] for class_id in class_ids)
    return predictions

In [16]:
#!g2.mig
# Number of headlines tokenized and classified per predict() call; bounds the
# size of the padded tensor built for each chunk.
CHUNK_SIZE = 1000

sentences = twitter_dataset.Headline.tolist()
predictions = []
# Slice-based chunking handles the final, shorter chunk for any dataset size
# (no hard-coded row counts).
for start in range(0, len(sentences), CHUNK_SIZE):
    chunk = sentences[start:start + CHUNK_SIZE]
    predictions.extend(predict(chunk, device, model, tokenizer))

# Every headline must have received exactly one label.
assert len(predictions) == len(sentences)

In [20]:
#!g2.mig
# Number of labels produced so far; one label is appended per classified headline.
len(predictions)

870439

In [22]:
#!g2.mig


In [24]:
#!g2.mig
def count_labels(label, labels=None):
    """Count how many entries equal ``label``.

    Args:
        label: value to look for (e.g. ``'positive'``).
        labels: iterable of predicted labels to search. When omitted, the
            notebook-level ``predictions`` list is used, which keeps existing
            one-argument calls working.

    Returns:
        Number of entries in ``labels`` that compare equal to ``label``.
    """
    if labels is None:
        labels = predictions
    return sum(1 for item in labels if item == label)

In [26]:
#!g2.mig
# Print the number of headlines per sentiment class, one line per class.
for title, label in (('Positive', 'positive'),
                     ('Negative', 'negative'),
                     ('Neutral', 'neutral')):
    print(title, count_labels(label))

Positive 389743
Negative 87879
Neutral 392817


In [31]:
#!g2.mig
# Attach the predicted label to each row; predictions was built sequentially
# from Headline.tolist(), so it is in row order.
twitter_dataset['Sentiment'] = predictions
# Constant tag written to every row.
twitter_dataset['Topic'] = 'Twitter news'

In [32]:
#!g2.mig
# Persist the labelled frame. index=False keeps the row index out of the file:
# this path is the same one read at the top of the notebook without an
# index_col, so writing the index would add another 'Unnamed: 0' column on
# every load/save round trip.
twitter_dataset.to_csv('twitter_dataset.csv', sep=',', index=False)

In [None]:
#!g2.mig
