**We use the ProsusAI/finbert model from the transformers library to perform sentiment analysis on the 8-K text**

In [1]:
!pip install transformers

from transformers import AutoTokenizer, AutoModelForSequenceClassification
  
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 46.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 34.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 3.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 16.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

In [2]:
import torch
import pandas as pd
import json

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')
!ln -s /content/gdrive/My\ Drive/ /mydrive  #This is to make a short form for the mydrive location 

Mounted at /content/gdrive


In [4]:
def findSentiment(sentence):
  """Score one sentence with the module-level FinBERT `tokenizer` and `model`.

  Args:
    sentence: Text to classify. It is passed to `tokenizer` with
      `truncation=True`, so over-long input is cut by the tokenizer.

  Returns:
    A 3-tuple of Python floats: the softmax probabilities at indices 0, 1
    and 2 of the first row of the logits, which this notebook reads as
    (positive, negative, neutral).
  """
  encoded = tokenizer(sentence, padding = True, truncation = True,  return_tensors='pt')
  # Inference only: disable autograd so no graph is built for each call.
  with torch.no_grad():
    outputs = model(**encoded)
  prediction = torch.nn.functional.softmax(outputs.logits, dim=-1) # [positive,negative,neutral]
  return prediction[0][0].item() , prediction[0][1].item() , prediction[0][2].item()


In [5]:
# Quick sanity check of the classifier on a single example sentence.
sentence = (
    "Career education expects first-quarter revenue to rise 44%. "
    "graduate program segment revenue expected to grow by 14% to $118.5 million"
)
positive, negative, neutral = findSentiment(sentence)
print(positive, negative, neutral)

0.9514992833137512 0.016624964773654938 0.031875722110271454


In [6]:
# Filing years to process.
years = [2020, 2021]

# Company list; the "CIK" column is cast to int and used to build the
# per-company sentence-file names.
companies = pd.read_csv("/content/GoodCom.csv")
cik_column = companies["CIK"]
ciks = cik_column.astype(int).tolist()

# Keyword lists, one per cluster. The list index is the cluster id. Keywords
# are matched as lowercase substrings of each sentence.
cluster_features = [
['investors', 'stock', 'outstanding', 'demand', 'tribute'],
['acquire', 'secure', 'acquired', 'secured', 'agreement'], 
# 'ebitda' was previously misspelled 'ebidta' and therefore never matched.
['revenue' ,'growth', 'profit', 'profitability', 'generating', 'generation', 'ebitda', 'earning', 'earnings', 'pay back','loss','debt'], 
['statement', 'statements', 'filed', 'report', 'reports', 'result', 'results'],
['dollar','hundred','thousand','million','billion','%','$'] ]

# We use the cluster features to group the sentences into 5 clusters.
# Each cluster represents a particular type of financial statement.

# Result layout: clustered_8k[cik][year][cluster_index] is a list of
# (sentence, sentiment_label) pairs.
clustered_8k = dict()

for cik in ciks:

    year_dict = dict()

    for year in years:

        try:
            cluster_dict = {i: [] for i in range(len(cluster_features))}

            # Normalised sentences already handled for this file. Checking
            # here, before the model call, avoids scoring duplicates and
            # keeps the output in a deterministic first-seen order.
            seen_lines = set()

            # `with` closes the file even when scoring raises part-way through.
            with open(f'/mydrive/8K_Sentence_Files/{cik}_{year}.txt',encoding='utf-8') as fhand:
                for line in fhand:
                    line = line.strip().lower()

                    # Skip any sentence that contains all five of these words.
                    if all(word in line for word in ('common', 'stock', 'par', 'value', 'share')):
                        continue

                    if line in seen_lines:
                        continue
                    seen_lines.add(line)

                    # First cluster, in list order, with any keyword that
                    # occurs as a substring of the sentence.
                    cluster_index = next(
                        (i for i, features in enumerate(cluster_features)
                         if any(feature in line for feature in features)),
                        None,
                    )
                    if cluster_index is None:
                        continue

                    line = line.capitalize()
                    # For each sentence, we find its sentiment using the finbert model.
                    positive , negative , neutral = findSentiment(line)
                    if positive>=0.7:
                        sentiment = "Positive"
                    elif negative>=0.7:
                        sentiment = "Negative"
                    else:
                        sentiment = "Neutral"

                    cluster_dict[cluster_index].append((line,sentiment))

            year_dict[year] = cluster_dict
        except Exception as e:
            # Best effort: report the failing (cik, year) and move on. That
            # year is simply absent from the output.
            print(cik," ",year," ",e)
            continue

    clustered_8k[cik] = year_dict

    
# Persist the nested results as pretty-printed JSON in the working directory.
with open("bulk_res.json", 'w') as out_file:
    json.dump(clustered_8k, out_file, indent=4)