In [None]:
!pip install transformers



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, pipeline, BertModel
import transformers
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import datetime
from datetime import timedelta, timezone
import sys
# Bare expression: evaluates the installed transformers version string. Because it is
# not the last statement of the cell, the notebook does not display it.
transformers.__version__

# Presumably stock ticker symbols (TODO confirm); not referenced anywhere else in the
# visible part of the notebook.
codes = ['AAPL','AMZN','C','GOOG','JPM','NFLX','PLTR']

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
from __future__ import absolute_import, division, print_function

import random

import pandas as pd
from torch.nn import MSELoss, CrossEntropyLoss
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
    TensorDataset)
from tqdm import tqdm_notebook as tqdm
from tqdm import trange
from nltk.tokenize import sent_tokenize
import numpy as np
import logging

from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer

# Module-level logger used by the classes and helpers defined in this cell.
logger = logging.getLogger(__name__)

# Tokenizer loaded from the 'yiyanghkust/finbert-tone' checkpoint. `predict` below uses this
# module-level instance, while `FinBert.prepare_model` loads its own copy of the same tokenizer.
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

class Config(object):
    """Plain container for the paths and hyper-parameters of a FinBert run.

    The constructor performs no validation and no I/O: every argument is stored
    unchanged as an attribute of the same name.
    """

    def __init__(self,
                 data_dir,
                 bert_model,
                 model_dir,
                 max_seq_length=64,
                 train_batch_size=32,
                 eval_batch_size=32,
                 learning_rate=5e-5,
                 num_train_epochs=10.0,
                 warm_up_proportion=0.1,
                 no_cuda=False,
                 do_lower_case=True,
                 seed=42,
                 local_rank=-1,
                 gradient_accumulation_steps=1,
                 fp16=False,
                 output_mode='classification',
                 discriminate=True,
                 gradual_unfreeze=True,
                 encoder_no=12,
                 base_model='bert-base-uncased'):
        """
        Parameters
        ----------
        data_dir: str
            Directory holding the tab-separated ``<phase>.csv`` datasets.
        bert_model: BertModel
            The model instance to fine-tune, e.g. BertForSequenceClassification.from_pretrained(...).
        model_dir: str
            Directory the trained weights and model config are written to.
        max_seq_length: int
            Length every tokenised example is truncated / padded to. Default is 64.
        train_batch_size: int
            Batch size for training. Default is 32.
        eval_batch_size: int
            Batch size for evaluation. Default is 32.
        learning_rate: float
            Peak learning rate. Default is 5e-5.
        num_train_epochs: int or float
            Number of training epochs. Default is 10.0.
        warm_up_proportion: float
            Fraction of the optimisation steps used for the linear learning-rate warm-up. Default is 0.1.
        no_cuda: bool
            If True the CPU is used even when CUDA is available. Default is False.
        do_lower_case: bool
            Lower-casing flag. Stored only; the visible code never reads it. Default is True.
        seed: int
            Seed passed to `random`, numpy and torch. Default is 42.
        local_rank: int
            CUDA device index for distributed training; -1 disables distributed training. Default is -1.
        gradient_accumulation_steps: int
            Number of batches whose gradients are accumulated per optimizer step. Default is 1.
        fp16: bool
            Half-precision flag; it is logged and switches on a manual learning-rate override in
            `FinBert.train`. Default is False.
        output_mode: 'classification' or 'regression'
            Task type; selects the loss function and the label dtype. Default is 'classification'.
        discriminate: bool
            Whether to use layer-wise (discriminative) learning rates. Default is True.
        gradual_unfreeze: bool
            Whether to freeze the BERT body at the start and unfreeze it progressively. Default is True.
        encoder_no: int
            Index bound used by the gradual-unfreezing logic in `FinBert.train`. Default is 12.
        base_model: str
            Name of the base checkpoint. Stored only; the visible code never reads it.
            Default is 'bert-base-uncased'.
        """
        # Paths and the model to fine-tune.
        self.data_dir = data_dir
        self.bert_model = bert_model
        self.model_dir = model_dir
        self.base_model = base_model

        # Input encoding.
        self.max_seq_length = max_seq_length
        self.do_lower_case = do_lower_case
        self.output_mode = output_mode

        # Optimisation schedule.
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.learning_rate = learning_rate
        self.num_train_epochs = num_train_epochs
        self.warm_up_proportion = warm_up_proportion
        self.gradient_accumulation_steps = gradient_accumulation_steps

        # Fine-tuning strategy.
        self.discriminate = discriminate
        self.gradual_unfreeze = gradual_unfreeze
        self.encoder_no = encoder_no

        # Runtime environment.
        self.no_cuda = no_cuda
        self.local_rank = local_rank
        self.fp16 = fp16
        self.seed = seed


class FinBert(object):
    """
    The main class for FinBERT.
    """

    def __init__(self,
                 config):
        """
        Stores the run configuration; nothing else is set up until `prepare_model` is called.
        Parameters
        ----------
        config: Config
            Object whose attributes (paths, batch sizes, learning rate, ...) are read by the other methods.
        """
        self.config = config

    def prepare_model(self, label_list):
        """
        Prepares everything that has to exist before data loading and training: the dataset processor, the
        compute device (optionally the distributed backend), the random seeds, the output directory, the label
        list and the tokenizer.
        Note that `config.train_batch_size` is divided in place by `config.gradient_accumulation_steps`, so the
        configuration object is modified by this call.
        Parameters
        ----------
        label_list: list
            The list of labels values in the dataset. For example: ['positive','negative','neutral']
        Raises
        ------
        ValueError
            If `gradient_accumulation_steps` is smaller than 1, or if the output directory already exists and
            is not empty.
        """
        cfg = self.config

        self.processors = {
            "finsent": FinSentProcessor
        }

        # Stored for reference only; nothing in the visible code reads this mapping.
        self.num_labels_task = {
            'finsent': 2
        }

        single_process = cfg.local_rank == -1 or cfg.no_cuda
        if not single_process:
            # Distributed run: bind this process to its own GPU.
            torch.cuda.set_device(cfg.local_rank)
            self.device = torch.device("cuda", cfg.local_rank)
            self.n_gpu = 1
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            torch.distributed.init_process_group(backend='nccl')
        else:
            cuda_usable = torch.cuda.is_available() and not cfg.no_cuda
            self.device = torch.device("cuda" if cuda_usable else "cpu")
            self.n_gpu = torch.cuda.device_count()
        logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            self.device, self.n_gpu, bool(cfg.local_rank != -1), cfg.fp16))

        accumulation = cfg.gradient_accumulation_steps
        if accumulation < 1:
            raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                accumulation))

        # Each forward pass only sees its share of the effective batch.
        cfg.train_batch_size = cfg.train_batch_size // accumulation

        # Seed every random number generator that is used.
        random.seed(cfg.seed)
        np.random.seed(cfg.seed)
        torch.manual_seed(cfg.seed)
        if self.n_gpu > 0:
            torch.cuda.manual_seed_all(cfg.seed)

        # Refuse to write into a directory that already contains files.
        output_dir = cfg.model_dir
        if os.path.exists(output_dir) and os.listdir(output_dir):
            raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        self.processor = self.processors['finsent']()
        self.label_list = label_list
        self.num_labels = len(label_list)

        self.tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

    def get_data(self, phase):
        """
        Gets the data for training or evaluation. It returns the data in the format that pytorch will process. In the
        data directory, there should be a .csv file with the name <phase>.csv
        Parameters
        ----------
        phase: str
            Name of the dataset that will be used in that phase. For example if there is a 'train.csv' in the data
            folder, it should be set to 'train'.
        Returns
        -------
        examples: list
            A list of InputExample's. Each InputExample is an object that includes the information for each example;
            text, id, label...
        """

        self.num_train_optimization_steps = None
        examples = None
        examples = self.processor.get_examples(self.config.data_dir, phase)
        self.num_train_optimization_steps = int(
            len(
                examples) / self.config.train_batch_size / self.config.gradient_accumulation_steps) * self.config.num_train_epochs

        if phase == 'train':
            train = pd.read_csv(os.path.join(self.config.data_dir, 'train.csv'), sep='\t', index_col=False)
            weights = list()
            labels = self.label_list

            class_weights = [train.shape[0] / train[train.label == label].shape[0] for label in labels]
            self.class_weights = torch.tensor(class_weights)

        return examples

    def create_the_model(self):
        """
        Creates the model. Sets the model to be trained, the optimizer (`self.optimizer`) and the learning-rate
        scheduler (`self.scheduler`).

        With `config.discriminate` enabled every parameter group gets its own learning rate: the pooler and the
        classifier use the full rate, encoder layer i of N uses lr / 1.2 ** (N - i) and the embeddings use
        lr / 1.2 ** (N + 1). N is read from the model instead of being fixed at 12, so models with a different
        number of encoder layers are covered completely. Parameters whose name contains 'bias',
        'LayerNorm.bias' or 'LayerNorm.weight' are excluded from weight decay.

        `get_data('train')` has to be called first because the warm-up length is derived from
        `self.num_train_optimization_steps`.

        Returns
        -------
        model: BertModel
            `config.bert_model`, moved to `self.device`.
        """

        model = self.config.bert_model

        model.to(self.device)

        # Prepare optimizer
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        lr = self.config.learning_rate
        dft_rate = 1.2

        def _param_groups(module, group_lr=None):
            # Split the parameters of `module` into a weight-decay and a no-weight-decay group.
            decay_params, no_decay_params = [], []
            for name, param in module.named_parameters():
                if any(nd in name for nd in no_decay):
                    no_decay_params.append(param)
                else:
                    decay_params.append(param)
            groups = [{'params': decay_params, 'weight_decay': 0.01},
                      {'params': no_decay_params, 'weight_decay': 0.0}]
            if group_lr is not None:
                for group in groups:
                    group['lr'] = group_lr
            return groups

        if self.config.discriminate:
            # apply the discriminative fine-tuning. discrimination rate is governed by dft_rate.
            encoder_layers = model.bert.encoder.layer
            num_layers = len(encoder_layers)

            # Group order is kept stable: embeddings, pooler, classifier, then the encoder layers bottom-up.
            optimizer_grouped_parameters = (
                _param_groups(model.bert.embeddings, lr / (dft_rate ** (num_layers + 1)))
                + _param_groups(model.bert.pooler, lr)
                + _param_groups(model.classifier, lr)
            )
            for i, layer in enumerate(encoder_layers):
                # Lower layers are updated more slowly than layers close to the classifier.
                optimizer_grouped_parameters.extend(_param_groups(layer, lr / (dft_rate ** (num_layers - i))))
        else:
            optimizer_grouped_parameters = _param_groups(model)

        self.num_warmup_steps = int(float(self.num_train_optimization_steps) * self.config.warm_up_proportion)

        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=self.config.learning_rate,
                               correct_bias=False)

        self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
                                                         num_warmup_steps=self.num_warmup_steps,
                                                         num_training_steps=self.num_train_optimization_steps)

        return model

    def get_loader(self, examples, phase):
        """
        Creates a data loader object for a dataset.
        Parameters
        ----------
        examples: list
            The list of InputExample's.
        phase: 'train' or 'eval'
            'train' uses random sampling and `config.train_batch_size`; 'eval' uses sequential sampling and
            `config.eval_batch_size`.
        Returns
        -------
        dataloader: DataLoader
            Yields batches of (input_ids, attention_mask, token_type_ids, label_ids, agree_ids).
        Raises
        ------
        ValueError
            If `phase` is neither 'train' nor 'eval', or `config.output_mode` is neither 'classification'
            nor 'regression'.
        """

        # Validate the phase before doing any work; it selects both the batch size and the sampler.
        if phase == 'train':
            batch_size = self.config.train_batch_size
        elif phase == 'eval':
            batch_size = self.config.eval_batch_size
        else:
            raise ValueError("The phase should either be train or eval. You entered: {}".format(phase))

        features = convert_examples_to_features(examples, self.label_list,
                                                self.config.max_seq_length,
                                                self.tokenizer,
                                                self.config.output_mode)

        # Log the necessary information
        logger.info("***** Loading data *****")
        logger.info("  Num examples = %d", len(examples))
        logger.info("  Batch size = %d", batch_size)
        logger.info("  Num steps = %d", self.num_train_optimization_steps)

        # Load the data, make it into TensorDataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

        if self.config.output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        elif self.config.output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
        else:
            raise ValueError("The mode should either be classification or regression. You entered: {}".format(
                self.config.output_mode))

        try:
            all_agree_ids = torch.tensor([f.agree for f in features], dtype=torch.long)
        except (AttributeError, TypeError, ValueError, RuntimeError):
            # Agreement levels that are missing or not numeric fall back to 0 for every example.
            all_agree_ids = torch.zeros(len(features), dtype=torch.long)

        data = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label_ids, all_agree_ids)

        if phase == 'train':
            my_sampler = RandomSampler(data)
        else:
            my_sampler = SequentialSampler(data)

        dataloader = DataLoader(data, sampler=my_sampler, batch_size=batch_size)
        return dataloader

    def train(self, train_examples, model):
        """
        Trains the model and keeps the weights of the epoch with the lowest validation loss.

        After every epoch the model is scored on `<data_dir>/validation.csv`. Whenever the validation loss is
        the lowest seen so far, a temporary checkpoint is written to `model_dir`. When training ends, the best
        checkpoint is loaded back, the weights and the model config are saved to `model_dir`, and the
        temporary file is deleted.

        Gradual unfreezing (`config.gradual_unfreeze`): before the first step all parameters of `model.bert`
        are frozen. A stage counter advances every `len(train_dataloader) // 3` steps (at least every step).
        While 1 < stage < `encoder_no`, the top `stage - 1` encoder layers are unfrozen; once
        stage > `encoder_no + 1`, the embeddings are unfrozen as well.

        Parameters
        ----------
        train_examples: list
            Contains the training data as a list of InputExample's
        model: BertModel
            The Bert model to be trained.
        Returns
        -------
        model: BertModel
            The trained model.
        """

        validation_examples = self.get_data('validation')

        global_step = 0

        self.validation_losses = []

        train_dataloader = self.get_loader(train_examples, 'train')
        # The validation set does not change between epochs, so its loader is built only once.
        validation_loader = self.get_loader(validation_examples, phase='eval')

        is_classification = self.config.output_mode == "classification"
        if is_classification:
            loss_fct = CrossEntropyLoss(weight=self.class_weights.to(self.device))
        elif self.config.output_mode == "regression":
            loss_fct = MSELoss()
        else:
            raise ValueError("The mode should either be classification or regression. You entered: {}".format(
                self.config.output_mode))

        def _compute_loss(logits, label_ids):
            # Same loss for training and validation; classification reshapes logits to (N, num_labels).
            if is_classification:
                return loss_fct(logits.view(-1, self.num_labels), label_ids.view(-1))
            return loss_fct(logits.view(-1), label_ids.view(-1))

        def _checkpoint_path(stage):
            # Temporary checkpoints are named after the unfreezing stage at the time they were written.
            return self.config.model_dir + '/' + ('temporary' + str(stage))

        model.train()

        step_number = len(train_dataloader)
        # Guard against loaders with fewer than three batches, where `step_number // 3` is zero.
        unfreeze_every = max(1, step_number // 3)

        i = 0
        best_model = None
        for _ in trange(int(self.config.num_train_epochs), desc="Epoch"):

            model.train()

            for step, batch in enumerate(tqdm(train_dataloader, desc='Iteration')):

                if (self.config.gradual_unfreeze and i == 0):
                    for param in model.bert.parameters():
                        param.requires_grad = False

                if (step % unfreeze_every) == 0:
                    i += 1

                if (self.config.gradual_unfreeze and i > 1 and i < self.config.encoder_no):

                    for k in range(i - 1):

                        try:
                            for param in model.bert.encoder.layer[self.config.encoder_no - 1 - k].parameters():
                                param.requires_grad = True
                        except IndexError:
                            # Best effort: `encoder_no` may exceed the number of layers of the model.
                            pass

                if (self.config.gradual_unfreeze and i > self.config.encoder_no + 1):
                    for param in model.bert.embeddings.parameters():
                        param.requires_grad = True

                batch = tuple(t.to(self.device) for t in batch)

                input_ids, attention_mask, token_type_ids, label_ids, agree_ids = batch

                logits = model(input_ids, attention_mask, token_type_ids)[0]
                loss = _compute_loss(logits, label_ids)

                if self.config.gradient_accumulation_steps > 1:
                    loss = loss / self.config.gradient_accumulation_steps
                # Gradients must be accumulated on every step, also when accumulation is enabled.
                loss.backward()

                if (step + 1) % self.config.gradient_accumulation_steps == 0:
                    if self.config.fp16:
                        # NOTE(review): `warmup_linear` is not defined in the visible part of this notebook,
                        # and this override replaces the per-group learning rates for the upcoming optimizer
                        # step -- confirm this branch is still wanted.
                        lr_this_step = self.config.learning_rate * warmup_linear(
                            global_step / self.num_train_optimization_steps, self.config.warm_up_proportion)
                        for param_group in self.optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    self.optimizer.step()
                    self.scheduler.step()
                    self.optimizer.zero_grad()
                    global_step += 1

            # Validation
            model.eval()

            valid_loss = 0
            nb_valid_steps = 0

            for input_ids, attention_mask, token_type_ids, label_ids, agree_ids in tqdm(validation_loader, desc="Validating"):
                input_ids = input_ids.to(self.device)
                attention_mask = attention_mask.to(self.device)
                token_type_ids = token_type_ids.to(self.device)
                label_ids = label_ids.to(self.device)

                with torch.no_grad():
                    logits = model(input_ids, attention_mask, token_type_ids)[0]
                    tmp_valid_loss = _compute_loss(logits, label_ids)

                valid_loss += tmp_valid_loss.mean().item()
                nb_valid_steps += 1

            valid_loss = valid_loss / nb_valid_steps

            self.validation_losses.append(valid_loss)
            print("Validation losses: {}".format(self.validation_losses))

            if valid_loss == min(self.validation_losses):
                # New best epoch: replace the previous temporary checkpoint, if there is one.
                if best_model is None:
                    print('No best model found')
                else:
                    try:
                        os.remove(_checkpoint_path(best_model))
                    except OSError:
                        print('No best model found')
                torch.save({'epoch': str(i), 'state_dict': model.state_dict()},
                           _checkpoint_path(i))
                best_model = i

        # Save a trained model and the associated configuration
        if best_model is not None:
            checkpoint = torch.load(_checkpoint_path(best_model))
            model.load_state_dict(checkpoint['state_dict'])
        else:
            # No epoch produced a usable checkpoint (e.g. zero epochs): keep the current weights.
            logger.warning("No best checkpoint was recorded; saving the current model weights.")
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(self.config.model_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(self.config.model_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        if best_model is not None:
            os.remove(_checkpoint_path(best_model))
        return model

    def evaluate(self, model, examples):
        """
        Runs the model over a dataset and collects its raw outputs.

        No loss is computed here: the previous loss value was never used, and computing it failed for
        examples without a label.

        Parameters
        ----------
        model: BertModel
            The model to be evaluated.
        examples: list
            Evaluation data as a list of InputExample's
        Returns
        -------
        evaluation_df: pd.DataFrame
            One row per example with the columns 'predictions' (the logits of the model as a numpy array, not
            probabilities), 'labels' (label ids) and 'agree_levels' (agreement ids).
        """

        eval_loader = self.get_loader(examples, phase='eval')

        logger.info("***** Running evaluation ***** ")
        logger.info("  Num examples = %d", len(examples))
        logger.info("  Batch size = %d", self.config.eval_batch_size)

        model.eval()

        predictions = []
        labels = []
        agree_levels = []

        for input_ids, attention_mask, token_type_ids, label_ids, agree_ids in tqdm(eval_loader, desc="Testing"):
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            token_type_ids = token_type_ids.to(self.device)

            with torch.no_grad():
                logits = model(input_ids, attention_mask, token_type_ids)[0]

            # One numpy row of logits per example; labels and agreement levels become plain Python numbers.
            predictions.extend(logits.cpu().numpy())
            labels.extend(label_ids.tolist())
            agree_levels.extend(agree_ids.tolist())

        evaluation_df = pd.DataFrame({'predictions': predictions, 'labels': labels, "agree_levels": agree_levels})

        return evaluation_df


def predict(text, model, write_to_csv=False, path=None, use_gpu=False, gpu_name='cuda:0', batch_size=5):
    """
    Predict sentiments of sentences in a given text. The text is split into sentences, the sentences are scored
    in batches and the results are returned as one DataFrame.

    Every sentence is encoded with the module-level `tokenizer` and a fixed maximum length of 64 tokens.

    Parameters
    ----------
    text: string
        text to be analyzed
    model: BertForSequenceClassification
        the classifier model; it is switched to eval mode and moved to the selected device
    write_to_csv (optional): bool
        whether to also write the result to `path`
    path (optional): string
        path of the csv file to write
    use_gpu: (optional): bool
        enables inference on GPU
    gpu_name: (optional): string
        multi-gpu support: allows specifying which gpu to use
    batch_size: (optional): int
        size of batching chunks

    Returns
    -------
    result: pd.DataFrame
        One row per sentence with the columns 'sentence', 'logit' (the softmax probabilities, despite the
        name), 'prediction' (label name) and 'sentiment_score' (probability of class 0 minus probability of
        class 1).
    """
    model.eval()

    sentences = sent_tokenize(text)

    device = gpu_name if use_gpu and torch.cuda.is_available() else "cpu"
    logging.info("Using device: %s " % device)
    label_list = ['positive', 'negative', 'neutral']
    # NOTE(review): this mapping assumes output index 0/1/2 = positive/negative/neutral. Confirm it matches
    # the label order of the checkpoint that is passed in -- a pretrained model may use a different order.
    label_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    columns = ['sentence', 'logit', 'prediction', 'sentiment_score']

    # Moving the model is loop-invariant, so it is done once instead of once per batch.
    model = model.to(device)

    batch_results = []
    for batch in chunks(sentences, batch_size):
        examples = [InputExample(str(i), sentence) for i, sentence in enumerate(batch)]

        features = convert_examples_to_features(examples, label_list, 64, tokenizer)

        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long).to(device)
        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long).to(device)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long).to(device)

        with torch.no_grad():
            logits = model(all_input_ids, all_attention_mask, all_token_type_ids)[0]

        logging.info(logits)
        probabilities = softmax(np.array(logits.cpu()))
        sentiment_score = pd.Series(probabilities[:, 0] - probabilities[:, 1])
        # argmax over axis 1 is always one-dimensional; squeezing it would turn a single-sentence batch
        # into a 0-d array.
        predictions = np.argmax(probabilities, axis=1)

        batch_results.append(pd.DataFrame({'sentence': batch,
                                           'logit': list(probabilities),
                                           'prediction': predictions,
                                           'sentiment_score': sentiment_score}))

    # Concatenate once instead of growing the frame inside the loop.
    if batch_results:
        result = pd.concat(batch_results, ignore_index=True)
    else:
        result = pd.DataFrame(columns=columns)

    result['prediction'] = result.prediction.apply(lambda x: label_dict[x])
    if write_to_csv:
        result.to_csv(path, sep=',', index=False)

    return result
# The classes used for data processing and convert_examples_to_features are very similar versions of the ones
# found in Hugging Face's scripts in the transformers library. For more BERT or similar language model
# implementation examples, we would highly recommend checking that library as well.


# No `from __future__` import here: future statements are only legal at the very top of a
# compilation unit, so repeating one in the middle of a cell is a SyntaxError. The identical
# import at the top of this cell already covers it.

import csv
import sys
import os
import torch

import numpy as np
import logging

# Send INFO-level records (timestamp - level - logger name - message) to the root handler.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)


# Classes regarding input and data handling

# File names used by FinBert.train when it writes the final weights and model config to model_dir.
WEIGHTS_NAME = 'pytorch_model.bin'
CONFIG_NAME = 'config.json'

class InputExample(object):
    """One raw example for sequence classification, before any tokenisation."""

    def __init__(self, guid, text, label=None, agree=None):
        """
        Builds an example from its raw fields; all values are stored unchanged.

        Parameters
        ----------
        guid: str
            Unique id of the example.
        text: str
            The text of the example.
        label: str, optional
            Label of the example; None when the example is unlabeled.
        agree: str, optional
            For FinBERT, the inter-annotator agreement level.
        """
        self.guid, self.text = guid, text
        self.label, self.agree = label, agree


class InputFeatures(object):
    """
    The model-ready encoding of one example: token ids, attention mask, token type ids,
    the label id and the agreement level.
    """

    def __init__(self, input_ids, attention_mask, token_type_ids, label_id, agree=None):
        # The three encoder inputs.
        self.input_ids, self.attention_mask, self.token_type_ids = input_ids, attention_mask, token_type_ids
        # Training target and annotation metadata.
        self.label_id, self.agree = label_id, agree


class DataProcessor(object):
    """Base class to read data files."""

    @classmethod
    def _read_tsv(cls, input_file):
        """
        Reads a tab separated value file.

        Parameters
        ----------
        input_file: str
            Path of a UTF-8 encoded, tab-delimited file.

        Returns
        -------
        lines: list
            One list of cell strings per row, including the header row.
        """
        # newline='' lets the csv module do its own newline handling, so line breaks inside quoted
        # cells are preserved exactly as written.
        with open(input_file, "r", encoding='utf-8', newline='') as f:
            return list(csv.reader(f, delimiter="\t"))


class FinSentProcessor(DataProcessor):
    """
    Data processor for FinBERT.
    """

    def get_examples(self, data_dir, phase):
        """
        Get examples from the data directory.

        Parameters
        ----------
        data_dir: str
            Path for the data directory.
        phase: str
            Name of the .csv file to be loaded, without the extension.

        Returns
        -------
        examples: list
            One InputExample per data row of `<data_dir>/<phase>.csv`.
        """
        return self._create_examples(self._read_tsv(os.path.join(data_dir, (phase + ".csv"))), phase)

    def get_labels(self):
        """Returns the label names used for sentiment classification."""
        return ["positive", "negative", "neutral"]

    def _create_examples(self, lines, set_type):
        """
        Creates examples for the training and dev sets.

        The first row is treated as a header and skipped. In every other row, column 1 is the text, column 2
        is the label and column 3, when present, is the agreement level; column 0 is not used.
        """
        examples = []
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            # The agreement column is optional, so check the row length instead of catching every exception.
            agree = line[3] if len(line) > 3 else None
            examples.append(
                InputExample(guid="%s-%s" % (set_type, str(i)), text=line[1], label=line[2], agree=agree))
        return examples


def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer, mode='classification'):
    """
    Converts InputExample's into InputFeatures that can be fed to the model. Text is tokenized, wrapped in
    [CLS] / [SEP], converted to ids and zero-padded to `max_seq_length`. Labels are mapped to integers
    (classification) or floats (regression).

    Texts with more than `max_seq_length - 2` tokens are shortened by keeping the first
    `max_seq_length // 4 - 1` tokens and the last `3 * max_seq_length // 4 - 1` tokens.

    Parameters
    ----------
    examples: list
        A list of InputExample's.
    label_list: list
        The list of labels.
    max_seq_length: int
        The maximum sequence length.
    tokenizer: BertTokenizer
        The tokenizer to be used.
    mode: str, optional
        The task type: 'classification' or 'regression'. Default is 'classification'

    Returns
    -------
    features: list
        A list of InputFeatures, one per example.

    Raises
    ------
    ValueError
        If `mode` is neither 'classification' nor 'regression'. The mode is checked before any example is
        processed.
    KeyError
        In classification mode, if an example carries a label that is not in `label_list`.
    """

    if mode not in ('classification', 'regression'):
        raise ValueError("The mode should either be classification or regression. You entered: " + mode)

    label_map = None
    if mode == 'classification':
        label_map = {label: i for i, label in enumerate(label_list)}
        # Examples without a label (label is None) receive this placeholder id.
        label_map[None] = 9090

    # Agreement level string -> integer id; unknown or missing levels map to 0.
    agree_map = {'0.5': 1, '0.66': 2, '0.75': 3, '1.0': 4}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens = tokenizer.tokenize(example.text)

        if len(tokens) > max_seq_length - 2:
            # Keep the beginning and the end of the text, dropping tokens from the middle.
            tokens = tokens[:(max_seq_length // 4) - 1] + tokens[
                                                          len(tokens) - (3 * max_seq_length // 4) + 1:]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        padding = [0] * (max_seq_length - len(input_ids))

        # Real tokens get mask 1 and padding gets 0; all tokens belong to segment 0.
        attention_mask = [1] * len(input_ids) + padding
        token_type_ids = [0] * len(tokens) + padding
        input_ids = input_ids + padding

        assert len(input_ids) == max_seq_length
        assert len(attention_mask) == max_seq_length
        assert len(token_type_ids) == max_seq_length

        if mode == 'classification':
            label_id = label_map[example.label]
        else:
            label_id = float(example.label)

        agree = agree_map.get(example.agree, 0)

        if ex_index < 1:
            # Log the first example only, as a sanity check of the encoding.
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info(
                "token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label_id=label_id,
                          agree=agree))
    return features


def accuracy(out, labels):
    """Return the number of rows of ``out`` whose arg-max matches ``labels``.

    Despite the name, the result is a raw count of correct predictions,
    not a ratio.
    """
    predicted_classes = np.argmax(out, axis=1)
    is_correct = predicted_classes == labels
    return np.sum(is_correct)


def softmax(x):
    """Compute the softmax of each row of scores in ``x``.

    The per-row maximum is subtracted before exponentiating so that large
    scores do not overflow; this does not change the result.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    exponentials = np.exp(x - row_max)
    return exponentials / np.sum(exponentials, axis=1, keepdims=True)


def get_metrics(df):
    """Computes accuracy and per-sentiment precision, recall and F1.

    ``df`` must have a ``predictions`` column (one score vector per row) and
    a ``labels`` column holding class ids, where 0 is positive, 1 is negative
    and 2 is neutral. Two columns are added to ``df`` in place: ``guess``
    (arg-max of the predictions) and ``accurate`` (whether the guess matches
    the label).

    Returns a dict with the overall 'Accuracy' plus one
    {'precision', 'recall', 'f1-score'} dict per sentiment.
    """
    df.loc[:, 'guess'] = df.predictions.apply(np.argmax)
    df.loc[:, 'accurate'] = df.apply(lambda row: row['guess'] == row['labels'], axis=1)

    report = {'Accuracy': df.accurate.sum() / df.shape[0]}

    # The position in this tuple is the class id used in `labels` / `guess`.
    for class_id, class_name in enumerate(('Positive', 'Negative', 'Neutral')):
        actual_rows = df[df['labels'] == class_id]
        guessed_rows = df[df['guess'] == class_id]

        recall = actual_rows.accurate.sum() / actual_rows.shape[0]
        precision = guessed_rows.accurate.sum() / guessed_rows.shape[0]
        f1score = 2 * (precision * recall) / (precision + recall)

        report[class_name] = {'precision': precision, 'recall': recall, 'f1-score': f1score}

    return report


def get_prediction(text, model, tokenizer, max_seq_length=64):
    """
    Get one prediction.

    Parameters
    ----------
    text: str
        The text to be analyzed.
    model: BertModel
        The model to be used.
    tokenizer: BertTokenizer
        The tokenizer to be used.
    max_seq_length: int, optional
        Length the encoded text is zero-padded up to (default 64, the value
        that used to be hard-coded here). Texts that encode to more tokens
        than this are passed through unpadded and are NOT truncated.

    Returns
    -------
    prediction: np.array
        An array that includes probabilities for each class.
    """

    tokens = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(tokens)
    token_type_ids = [0] * len(tokens)

    # A negative count yields an empty list, so over-long inputs get no padding.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    attention_mask += padding
    token_type_ids += padding

    features = [
        InputFeatures(input_ids=input_ids,
                      token_type_ids=token_type_ids,
                      attention_mask=attention_mask,
                      label_id=None)
    ]

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # NOTE(review): this assumes the model returns the logits tensor
        # directly; a model that returns an output object would need its
        # logits extracted first -- confirm against the model class in use.
        logits = model(all_input_ids, all_attention_mask, all_token_type_ids)
    prediction = softmax(logits.detach().numpy())
    return prediction


def chunks(l, n):
    """
    Lazily split a sequence into consecutive slices of ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.

    Parameters
    ----------
    l: list
        given list
    n: int
        length of each slice
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

In [None]:
from google.colab import drive

drive.mount("/content/drive")
path="/content/drive/MyDrive/Stock_Data/"

# Read one CSV of posts per ticker in `codes`, drop very short posts and cap
# the text at 512 characters.
stock_data = []
for code in codes:
    data = pd.read_csv(path+code+'.csv')
    # For a local (non-Colab) run, read from 'Stock_Data/'+code+'.csv' instead.
    data['Body'] = data['Body'].astype(str)
    # 'length' records the size of the post BEFORE it is capped below.
    data['length'] = data['Body'].str.len()
    # .copy() gives an independent frame, so the assignment below no longer
    # triggers SettingWithCopyWarning by writing into a filtered slice.
    data = data[data['length'] > 10].copy()
    data['Body'] = data['Body'].str[:512]
    stock_data.append(data)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Body'] = [x if len(x)<=512 else x[:512] for x in data.Body ]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Body'] = [x if len(x)<=512 else x[:512] for x in data.Body ]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Body'] = [x if len(x)<=512 else x[:512] for x in data.Body ]


In [None]:
# Stack the per-ticker frames into one DataFrame. Each frame keeps its own
# row labels, so index values repeat across tickers.
data = pd.concat(stock_data)

In [None]:
# Where 'Created At' is missing, take the value from the alternate
# 'Created  At' column (note the two spaces in its name).
# A single .loc assignment replaces the chained indexing that raised
# SettingWithCopyWarning; .to_numpy() assigns positionally, which stays
# correct even though `data` has repeated index labels after the concat.
missing_created_at = data['Created At'].isna()
data.loc[missing_created_at, 'Created At'] = data.loc[missing_created_at, 'Created  At'].to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Created At'][data['Created At'].isna()] = data[data['Created At'].isna()]['Created  At']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Created At'][data['Created At'].isna()] = data[data['Created At'].isna()]['Created  At']


In [None]:
# Order all posts by their 'Created At' value. The timestamps displayed
# elsewhere in this notebook are ISO-8601 strings, for which string order is
# chronological order -- assumes every row uses that format.
data = data.sort_values('Created At')

In [None]:
# Time-based split: posts before the cut-off go to `train`, posts at or after
# it go to `test`. (Both sides previously used `<`, which made `test` an exact
# copy of `train`.)
# Missing values are filled with the string 'Netural'; that spelling is what
# the following cells match on, so it is kept as is.
split_date = '2022-03-31T00:00:00Z'
train = data[data['Created At'] < split_date][['Body','Entities']].fillna('Netural')
test = data[data['Created At'] >= split_date][['Body','Entities']].fillna('Netural')

In [None]:
# For rows whose label was filled in as 'Netural', the post text is replaced
# by that same placeholder string.
# NOTE(review): this discards the real text of every unlabelled post --
# confirm that is intended.
# .loc performs the write in one step instead of chained indexing, which
# pandas may apply to a temporary copy.
train.loc[train['Entities'] == 'Netural', 'Body'] = 'Netural'
test.loc[test['Entities'] == 'Netural', 'Body'] = 'Netural'

In [None]:
# Inspect the training split.
train

Unnamed: 0,Body,Entities
252417,Netural,Netural
252416,Netural,Netural
252415,$SPY $aapl $qqq Never bet on bears in bull mar...,Bullish
252414,"$AAPL After seeing today&#39;s price action, I...",Bullish
252413,Netural,Netural
...,...,...
200648,Netural,Netural
200647,$SPY $TSLA $AAPL $AMD $NVDA \nBuy when they&#3...,Bullish
200646,Netural,Netural
283992,Netural,Netural


In [None]:
# Drop the old row labels and rename the columns to text / label.
# `test.column = [...]` only set an unused attribute (hence the pandas
# warning), so `test` kept its old column names; both frames are now renamed.
column_names = {'Body': 'text', 'Entities': 'label'}
train = train.reset_index(drop=True).rename(columns=column_names)
test = test.reset_index(drop=True).rename(columns=column_names)

  test.column=['text', 'label']


In [None]:
# Balance the training set by down-sampling every label to the size of the
# rarest one.
sample_size = train.groupby('label').agg('count').min().min()
# A fixed random_state makes the sampled rows reproducible across runs.
temp = [
    train[train['label'] == label].sample(sample_size, random_state=42)
    for label in train['label'].unique()
]

In [None]:
# Combine the equally sized per-label samples into the balanced training set.
train1 = pd.concat(temp)

In [None]:
# Sanity check: every label should now have the same number of rows.
train1.groupby('label').agg('count')

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
Bearish,23094
Bullish,23094
Netural,23094


In [None]:
# Write the balanced training set and the test set as tab-separated files.
for frame, filename in ((train1, 'train.csv'), (test, 'test.csv')):
    frame.to_csv(filename, sep="\t", encoding='utf-8')

# Tuning FinBERT

# Sentiment analysis

In [None]:
# Peek at the first rows of the first ticker's frame.
stock_data[0].head()

Unnamed: 0,User,Body,Created At,Number of Likes,Entities,length
0,2129386,$AAPL 🛑this is Barash heading into tomorrow. ...,2023-10-05T20:46:57Z,0,Bearish,309
1,6529169,"$AAPL ... See y&#39;all @ $150.00!, cause of t...",2023-10-05T20:40:12Z,0,Bearish,67
2,128933,Apple $AAPL Rally Should Fail for More Downsid...,2023-10-05T20:39:04Z,0,,130
3,128933,$AAPL Ended the 3 waves bounce early last mont...,2023-10-05T20:38:36Z,0,,121
4,128933,Apple $AAPL 3 Waves Corrective Rally in Progre...,2023-10-05T20:37:13Z,0,,145


In [None]:
### LOAD MODEL FROM HERE
###
from torch.utils.data import SequentialSampler
# Load the classification model saved on Drive, and the tokenizer published
# as 'ProsusAI/finbert'.
# NOTE(review): the tokenizer is not loaded from the model directory --
# confirm its vocabulary matches the one the saved model was trained with.
model = BertForSequenceClassification.from_pretrained("/content/drive/My Drive/Model/")
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

# Use a GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# As the last expression of the cell this also prints the full model structure.
model.to(device)

(…)nbert/resolve/main/tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

(…)/ProsusAI/finbert/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)ert/resolve/main/special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

(…)rosusAI/finbert/resolve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
#finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3).to('cuda')
#tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
#["Bearish", "Nay", "Neutral", "Bullish", "To the Moon!!"]

# Output position -> sentiment name; also the names of the columns added below.
labels = {0:'Bearish', 1:'Nay',2:'Neutral', 3: 'Bullish', 4: 'To the Moon!!'}
sentiment_columns = [labels[i] for i in sorted(labels)]

model.eval()
for data in tqdm(stock_data):
    sentences = list(data.Body)

    sentiment = []
    loader = DataLoader(sentences, batch_size=128, shuffle=False)
    with torch.no_grad():
        for sentence in loader:
            # Use `device` (which falls back to CPU) rather than a hard-coded
            # 'cuda', and truncate to the model's 512 positions so that an
            # unusually token-dense post cannot overflow the position table.
            inputs = tokenizer(sentence, return_tensors="pt", padding=True,
                               truncation=True, max_length=512).to(device)
            outputs = model(**inputs)[0]
            sentiment.extend(outputs.cpu().numpy())

    # One column per class holding the raw model output (no softmax applied).
    data[sentiment_columns] = np.array(sentiment)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for data in tqdm(stock_data):


  0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
import pickle

# Path to your Google Drive.
# This used to be '/content/drive/My Drive/Ouput_Data' with no trailing slash,
# so the file landed at '.../Ouput_DataStock_comments_w_5_labels', which is
# not where the loading cell below looks. Write to the folder it reads from.
drive_path = '/content/drive/My Drive/Model/'

# Serialize the data structures to a file in your Google Drive
with open(drive_path + 'Stock_comments_w_5_labels', 'wb') as file:
    pickle.dump(stock_data, file)

NameError: ignored

In [None]:
from google.colab import drive
import pickle
## Remember to mount Drive if you are starting from this step without running the notebook from the top.
drive.mount("/content/drive")

drive_path = '/content/drive/My Drive/Model/'

# Deserialize the list of per-ticker frames (with sentiment columns) from Drive.
# SECURITY: pickle.load can execute arbitrary code; only open files you wrote yourself.
with open(drive_path + 'Stock_comments_w_5_labels', 'rb') as file:
    stock_data = pickle.load(file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pickle

# Path to your Google Drive.
# NOTE(review): there is no trailing slash, so the file is created as
# '/content/drive/My Drive/Ouput_DataStock_comments_w_5_labels_raw'. The
# loading cell below builds the same string, so the round trip still works.
drive_path = '/content/drive/My Drive/Ouput_Data'

# Serialize `data` to a file in your Google Drive.
# NOTE(review): `data` is whatever frame was bound last in the session --
# confirm it is the frame you mean to save.
with open(drive_path + 'Stock_comments_w_5_labels_raw', 'wb') as file:
    pickle.dump((data), file)

In [None]:
from google.colab import drive
import pickle
## Remember to mount Drive if you are starting from this step without running the notebook from the top.
drive.mount("/content/drive")

# No trailing slash: the resulting path is '.../Ouput_DataStock_comments_w_5_labels_raw',
# matching the cell that wrote the file.
drive_path = '/content/drive/My Drive/Ouput_Data'

# Deserialize `data` from Drive.
# SECURITY: pickle.load can execute arbitrary code; only open files you wrote yourself.
with open(drive_path + 'Stock_comments_w_5_labels_raw', 'rb') as file:
    data = pickle.load(file)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Summary statistics for the first ticker's frame. The five sentiment columns
# hold the raw model outputs (no softmax was applied), so they are not
# probabilities and can be negative.
stock_data[0].describe()

Unnamed: 0,User,Number of Likes,length,Bearish,Nay,Neutral,Bullish,To the Moon!!
count,244745.0,244745.0,244745.0,244745.0,244745.0,244745.0,244745.0,244745.0
mean,3203420.0,1.677428,125.713371,-1.112857,-0.235525,2.257067,0.479372,-0.778786
std,2316213.0,2.875766,133.942812,3.188108,3.108835,4.506053,3.020472,3.155307
min,5.0,0.0,11.0,-5.228049,-5.439459,-4.48357,-3.78802,-4.777293
25%,938543.0,0.0,43.0,-2.995966,-2.356609,-1.641636,-1.249198,-2.349573
50%,3129132.0,1.0,85.0,-2.511418,-1.734178,0.428851,-0.79601,-1.637302
75%,5440282.0,2.0,155.0,-0.584754,0.9908,8.206936,1.095239,-1.214846
max,8396416.0,81.0,1825.0,8.087673,8.383187,8.720236,7.981191,8.229541


In [None]:
# Show `data`, i.e. the frame unpickled from Drive above.
data

Unnamed: 0.1,Unnamed: 0,User,Body,Created At,Number of Likes,Entities,length,Bearish,Nay,Neutral,Bullish,To the Moon!!
0,0,6835457,$PLTR this is bs manipulation,2023-10-19T16:36:44Z,0,,33,-1.607695,8.208226,-0.181158,-1.226183,-1.585690
1,1,5827747,$PLTR ahahah what a POS,2023-10-19T16:36:32Z,0,,27,-2.566195,-2.538597,8.475845,-0.906873,-1.630976
2,2,3918610,$PLTR Now what? Tired of watching this ...,2023-10-19T16:35:51Z,0,,54,-0.543568,7.125297,0.336588,-1.338916,-2.301708
3,3,3381181,$PLTR going under 17 today,2023-10-19T16:34:50Z,0,,30,-2.608138,-2.715280,8.394577,-1.027048,-1.509058
4,4,6300692,$PLTR THIS is not dovish,2023-10-19T16:30:46Z,0,,28,-2.632472,-2.539920,8.488290,-0.970468,-1.578984
...,...,...,...,...,...,...,...,...,...,...,...,...
224587,224587,4484586,"$PLTR \nThe short play here is over, th...",2021-12-31T23:31:24Z,4,,344,-4.126100,-1.415069,-2.233413,6.401759,1.443360
224588,224588,1214135,$PLTR Terrible investment. Insiders are a...,2021-12-31T23:29:19Z,2,Bearish,220,5.697704,2.047412,-3.512281,-1.720231,-3.308841
224589,224589,1050210,$PLTR tax loss selling,2021-12-31T23:23:03Z,3,,25,-1.914202,8.049525,0.150170,-1.073016,-1.610877
224590,224590,6147980,$PLTR picked some stock up after hours\n...,2021-12-31T23:14:54Z,2,,73,-2.746489,-2.433404,8.508695,-0.890457,-1.590274


In [None]:

### LOAD MODEL FROM HERE
###
from torch.utils.data import SequentialSampler
# Load the same saved weights as a bare BertModel (no classification head),
# so that the pooled output can be used as a document vector. This rebinds
# `model`, replacing the classifier loaded earlier.
model = BertModel.from_pretrained("/content/drive/My Drive/Model/")
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')

# Use a GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# As the last expression of the cell this also prints the full model structure.
model.to(device)

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
# Load the BERT model and tokenizer
#model = BertModel.from_pretrained('yiyanghkust/finbert-tone').to('cuda')

from tqdm.notebook import tqdm  # Instead of from tqdm import tqdm

# Encode every post on its own and store the model's pooled output as that
# post's document vector in a new 'doc_vec' column.
for data in stock_data:
    sentences = list(data.Body)
    vectors = []
    with torch.no_grad():
        for sentence in tqdm(sentences):
            # Truncate to the model's 512 positions so an unusually
            # token-dense post cannot overflow the position embeddings.
            token_ids = tokenizer.encode(sentence, add_special_tokens=True,
                                         truncation=True, max_length=512)
            inputs = torch.tensor(token_ids).unsqueeze(0).to(device)
            outputs = model(inputs)
            # The batch holds a single post, so row 0 is its vector.
            vectors.append(outputs.pooler_output[0].cpu().numpy())

    data['doc_vec'] = vectors


# # Convert the text into the input format BERT expects
# input_ids = torch.tensor(tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)

# # Extract text features with the BERT model
# outputs = model(input_ids)
# word_embeddings = outputs.last_hidden_state  # word vector of each token
# sentence_embedding = outputs.pooler_output  # sentence vector of the whole text

  0%|          | 0/244745 [00:00<?, ?it/s]

  0%|          | 0/328075 [00:00<?, ?it/s]

  0%|          | 0/21011 [00:00<?, ?it/s]

  0%|          | 0/66088 [00:00<?, ?it/s]

  0%|          | 0/44887 [00:00<?, ?it/s]

  0%|          | 0/162477 [00:00<?, ?it/s]

  0%|          | 0/211341 [00:00<?, ?it/s]

In [None]:
### The data is stored below so it can be retrieved without re-running the model, which takes about three hours.

import pickle

# Path to your Google Drive
drive_path = '/content/drive/My Drive/Model/'

# Both dumps are commented out, so running this cell writes nothing.
# NOTE(review): the first dump targets 'data_with_doc_vec_data', but the
# loading cell below opens 'data_with_doc_vec' -- confirm which name exists on Drive.

# Serialize the data structures to a file in your Google Drive
#with open(drive_path + 'data_with_doc_vec_data', 'wb') as file:
#    pickle.dump((data), file)

# Serialize the data structures to a file in your Google Drive
#with open(drive_path + 'data_with_doc_vec_stock_data', 'wb') as file:
#    pickle.dump((stock_data), file)

KeyboardInterrupt: ignored

In [None]:
#### Pull the data that already contains the doc_vec column from Drive.

from google.colab import drive
import pickle
## Remember to mount Drive if you are starting from this step without running the notebook from the top.
drive.mount("/content/drive")

drive_path = '/content/drive/My Drive/Model/'

# Deserialize the data structures from files in your Google Drive.
# SECURITY: pickle.load can execute arbitrary code; only open files you wrote yourself.
with open(drive_path + 'data_with_doc_vec', 'rb') as file:
    data = pickle.load(file)

with open(drive_path + 'data_with_doc_vec_stock_data', 'rb') as file:
    stock_data = pickle.load(file)

Mounted at /content/drive


In [None]:
# Show the frame loaded from 'data_with_doc_vec'.
data

Unnamed: 0.1,Unnamed: 0,User,Body,Created At,Number of Likes,Entities,length,Bearish,Nay,Neutral,Bullish,To the Moon!!,doc_vec
0,0,6835457,$PLTR this is bs manipulation,2023-10-19T16:36:44Z,0,,33,-1.607695,8.208226,-0.181158,-1.226183,-1.585690,"[0.8979086, 0.6092003, -0.9132888, 0.60078824,..."
1,1,5827747,$PLTR ahahah what a POS,2023-10-19T16:36:32Z,0,,27,-2.566195,-2.538597,8.475845,-0.906873,-1.630976,"[-0.8116878, -0.79980576, 0.21781458, -0.91667..."
2,2,3918610,$PLTR Now what? Tired of watching this ...,2023-10-19T16:35:51Z,0,,54,-0.543568,7.125297,0.336588,-1.338916,-2.301708,"[0.95454097, 0.12431386, -0.9915885, 0.2530193..."
3,3,3381181,$PLTR going under 17 today,2023-10-19T16:34:50Z,0,,30,-2.608138,-2.715280,8.394577,-1.027048,-1.509058,"[-0.85530406, -0.78949904, 0.26928952, -0.9093..."
4,4,6300692,$PLTR THIS is not dovish,2023-10-19T16:30:46Z,0,,28,-2.632472,-2.539920,8.488290,-0.970468,-1.578984,"[-0.82666266, -0.8019852, 0.20136659, -0.91470..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
224587,224587,4484586,"$PLTR \nThe short play here is over, th...",2021-12-31T23:31:24Z,4,,344,-4.126100,-1.415069,-2.233413,6.401759,1.443360,"[0.6090245, 0.5644702, -0.8492736, 0.35177147,..."
224588,224588,1214135,$PLTR Terrible investment. Insiders are a...,2021-12-31T23:29:19Z,2,Bearish,220,5.697704,2.047412,-3.512281,-1.720231,-3.308841,"[0.8653008, 0.312287, 0.99745303, 0.072810195,..."
224589,224589,1050210,$PLTR tax loss selling,2021-12-31T23:23:03Z,3,,25,-1.914202,8.049525,0.150170,-1.073016,-1.610877,"[0.8941115, 0.5423311, -0.9510221, 0.584764, 0..."
224590,224590,6147980,$PLTR picked some stock up after hours\n...,2021-12-31T23:14:54Z,2,,73,-2.746489,-2.433404,8.508695,-0.890457,-1.590274,"[-0.8308524, -0.78778523, 0.20721395, -0.91306..."


In [None]:
# Display the list of per-ticker frames loaded from Drive.
stock_data

[           User                                               Body  \
 0       2129386  $AAPL  🛑this is Barash heading into tomorrow. ...   
 1       6529169  $AAPL ... See y&#39;all @ $150.00!, cause of t...   
 2        128933  Apple $AAPL Rally Should Fail for More Downsid...   
 3        128933  $AAPL Ended the 3 waves bounce early last mont...   
 4        128933  Apple $AAPL 3 Waves Corrective Rally in Progre...   
 ...         ...                                                ...   
 252413  5286511  Unusual Option Alert on $AAPL $1,519,560 call ...   
 252414  2018701  $AAPL After seeing today&#39;s price action, I...   
 252415  2018701  $SPY $aapl $qqq Never bet on bears in bull mar...   
 252416   152351  For the die-hard option traders, a gr8 way to ...   
 252417  5831506                                    $SPY $IWM $AAPL   
 
                   Created At  Number of Likes Entities  length   Bearish  \
 0       2023-10-05T20:46:57Z                0  Bearish     309 -2.07

In [None]:

## my way of checking the output of the code

# Only the last frame is inspected. The earlier loop built the sentence list
# of every frame just to keep the final one; `data` is still rebound to that
# last frame, as the loop did.
data = stock_data[-1]
sentences = list(data.Body)

vectors = []
print(sentences[0])

inputs = torch.tensor(tokenizer.encode(sentences[0], add_special_tokens=True)).unsqueeze(0).to(device)
print(inputs)
# Inspection only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(inputs)
print(outputs)

sentence_embedding = outputs.pooler_output
vectors += (list(sentence_embedding.to('cpu').detach().numpy()))

$PLTR  this  is  bs  manipulation


NameError: ignored

In [None]:
# import pickle
# file = open('stock_data.bin','wb')
# pickle.dump(stock_data, file)
# file.close()

In [None]:
# import pickle
# file = open('stock_data.bin','rb')
# stock_data = pickle.load(file)
# file.close()

# Novelty

https://www.kaggle.com/code/juanlu19/fork-of-2-sigma-news-eda-relacional-2