## SEC Sentiment Analysis

In [1]:
%load_ext autoreload
%autoreload 2

### Grab 10-K filings from EDGAR

In [2]:
import os

import tqdm

from fetch import get_form_by_ticker

In [3]:
tickers = ['ehc', 'mrk','nke', 'msex', 'v', 'cvs', 'doc', 'smtc', 'cl', 
'ava', 'bc', 'f', 'lmt', 'cri', 'aig', 'rgld', 'apld', 'omcl', 
'mmm', 'bgs', 'dis','wetg', 'bj']

In [4]:
cwd = os.getcwd()
data_directory = os.path.join(cwd, "xbrl-forms")
if not os.path.exists(data_directory):
    os.mkdir(data_directory)

In [5]:
forms = []
for ticker in tqdm.tqdm(tickers): 
    form_text = get_form_by_ticker(
        ticker=ticker,
        form_type="10-K",
        company="Unstructured Technologies",
        email="support@unstructured.io"
    )
    
    filename = os.path.join(data_directory, f"{ticker}-10k.xbrl")
    with open(filename, "w") as f:
        f.write(form_text)

100%|██████████████████████████████████████████████████████████████████████| 23/23 [00:28<00:00,  1.24s/it]


### Extract the Risk Factors Narrative

In [6]:
import requests
import time

In [7]:
url = "https://api.unstructured.io/sec-filings/v0.1.0/section"

In [8]:
risk_factors = dict()
for ticker in tqdm.tqdm(tickers):
    response = requests.post(
        url,
        files={"file": open(f"./xbrl-forms/{ticker}-10k.xbrl", "rb")},
        data={"section": ["RISK_FACTORS"]},
    )
    response.raise_for_status()
    risk_factors[ticker] = response.json()["RISK_FACTORS"]
    time.sleep(1)

100%|██████████████████████████████████████████████████████████████████████| 23/23 [07:42<00:00, 20.12s/it]


### Stage for LabelStudio

In [10]:
from unstructured.staging.base import isd_to_elements

In [11]:
elements = []
for sections in risk_factors.values():
    elements.extend(isd_to_elements(sections))

In [12]:
from transformers import pipeline

In [13]:
model = "distilbert-base-uncased-finetuned-sst-2-english"
sentiment_pipeline = pipeline(model=model)

Downloading:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

In [19]:
from unstructured.staging.label_studio import (
    stage_for_label_studio,
    LabelStudioAnnotation,
    LabelStudioResult,
)

In [39]:
annotations = []
for element in tqdm.tqdm(elements):
    inference = sentiment_pipeline(element.text, truncation=True)
    result = [LabelStudioResult(
              type="choices",
              value={"choices": [inference[0]["label"].title()]},
              from_name="sentiment",
              to_name="text",
    )]
    annotations.append([LabelStudioAnnotation(result=result)])

100%|██████████████████████████████████████████████████████████████████| 2499/2499 [01:22<00:00, 30.30it/s]


In [40]:
label_studio_data = stage_for_label_studio(
    elements=elements,
    annotations=annotations,
)

In [41]:
import json

In [42]:
with open("sec-sentiment-analysis.json", "w") as f:
    json.dump(label_studio_data, f, indent=4)

### Train a Sentiment Model

In [47]:
with open("sec-sentiment-analysis-labeled.json", "r") as f:
    training_data = json.load(f)

In [77]:
datasets_data = Dataset.from_dict({
    "text": [item["text"] for item in training_data],
    "label": [0 if item["sentiment"] == "Negative" else 1 
             for item in training_data]
})

In [88]:
model_name = "distilbert-base-uncased"

In [89]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=2
)


loading configuration file config.json from cache at /Users/mrobinson/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}



Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /Users/mrobinson/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/pytorch_model.bin
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of

In [90]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file config.json from cache at /Users/mrobinson/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /Users/mrobinson/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/043235d6088ecd3dd5fb5ca3592b6913fd516027/vocab.txt
loading file tokenizer.json from cache at /Users/mrobinson/.cache/huggingface/hub/model

In [91]:
def preprocess_function(examples):
   return tokenizer(examples["text"], truncation=True)

In [92]:
tokenized_train = datasets_data.map(preprocess_function, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

In [93]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [94]:
from transformers import Trainer

In [95]:
trainer = Trainer(
   model=model,
   train_dataset=tokenized_train,
   tokenizer=tokenizer,
   data_collator=data_collator,
)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [96]:
trainer.train()

***** Running training *****
  Num examples = 2499
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 939
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.2336


Saving model checkpoint to tmp_trainer/checkpoint-500
Configuration saved in tmp_trainer/checkpoint-500/config.json
Model weights saved in tmp_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-500/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=939, training_loss=0.16437377360936045, metrics={'train_runtime': 1133.0021, 'train_samples_per_second': 6.617, 'train_steps_per_second': 0.829, 'total_flos': 490623281792892.0, 'train_loss': 0.16437377360936045, 'epoch': 3.0})

In [97]:
dir(trainer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_gather_and_numpify',
 '_get_collator_with_removed_columns',
 '_get_eval_sampler',
 '_get_learning_rate',
 '_get_train_sampler',
 '_globalstep_last_logged',
 '_hp_search_setup',
 '_inner_training_loop',
 '_load_best_model',
 '_load_from_checkpoint',
 '_load_optimizer_and_scheduler',
 '_load_rng_state',
 '_loggers_initialized',
 '_maybe_log_save_evaluate',
 '_memory_tracker',
 '_move_model_to_device',
 '_nested_gather',
 '_pad_across_processes',
 '_prepare_input',
 '_prepare_inputs',
 '_push_from_checkpoint',
 '_remove_unused_columns',
 '_report_to_hp_search',
 '_rotate_checkpoints',
 '_save',
 '_save_checkpoint',
 '_save

In [98]:
trainer.save_model("sec-sentiment-model")

Saving model checkpoint to sec-sentiment-model
Configuration saved in sec-sentiment-model/config.json
Model weights saved in sec-sentiment-model/pytorch_model.bin
tokenizer config file saved in sec-sentiment-model/tokenizer_config.json
Special tokens file saved in sec-sentiment-model/special_tokens_map.json


In [110]:
sec_sentiment_model = pipeline(
task="sentiment-analysis",
model="./sec-sentiment-model",
)

loading configuration file ./sec-sentiment-model/config.json
Model config DistilBertConfig {
  "_name_or_path": "./sec-sentiment-model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading configuration file ./sec-sentiment-model/config.json
Model config DistilBertConfig {
  "_name_or_path": "./sec-sentiment-model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hid

In [111]:
sec_sentiment_model(elements[0].text)

[{'label': 'LABEL_0', 'score': 0.9984645843505859}]