## Python dictionary workflow

This workflow scores sentiment with a dictionary-based approach, using the Loughran-McDonald Master Dictionary with Sentiment Word Lists:


https://sraf.nd.edu/loughranmcdonald-master-dictionary/

In [39]:
import pandas as pd
import numpy as np

In [40]:
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from nltk.tokenize import sent_tokenize, word_tokenize

In [41]:
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [42]:
import matplotlib.pyplot as plt
import seaborn as sns

In [43]:
# Build the stop-word set used when cleaning text.

# Fetch the NLTK resources requested by this notebook: the stop-word corpus
# and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')


# `stopword` is the set that data_cleaner() filters tokens against. To tune it
# for a specific corpus, add extra words to the (currently empty) list below.
stopword=set(stopwords.words('english') + []) # append custom stop words to the empty list

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peterhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/peterhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [44]:
def data_cleaner(text, return_tokens = False):
    '''
    Normalise raw text into lower-cased, stop-word-free, Porter-stemmed tokens.

    Steps, in order: lower-case; blank out [bracketed] spans, URLs and
    <tag>-like spans; blank out ASCII punctuation; blank out carriage returns
    and newlines; blank out every token that contains a digit; blank out a few
    leftover non-ASCII symbols; split on whitespace; drop words present in the
    module-level ``stopword`` set; stem the remaining words.

    Parameters
    ----------
    text : any
        Input text. It is passed through ``str()`` first, so non-string values
        are cleaned as their string representation.
    return_tokens : bool, default False
        If True, return the list of stemmed tokens; otherwise return the
        tokens joined by single spaces.

    Returns
    -------
    list of str or str
        Stemmed tokens, or the space-joined string of those tokens.
    '''
    # Every regex below is a raw string: sequences such as '\S' or '\w' inside
    # a normal string literal are invalid escapes and raise a warning on
    # recent Python versions. The compiled patterns are unchanged.
    cleaned = str(text).lower()
    cleaned = re.sub(r'\[.*?\]', ' ', cleaned)  # drop [bracketed] spans
    cleaned = re.sub(r'https?://\S+|www\.\S+', ' ', cleaned)  # drop URLs
    cleaned = re.sub(r'<.*?>+', ' ', cleaned)  # drop <tag>-like spans
    cleaned = re.sub('[%s]' % re.escape(string.punctuation), ' ', cleaned)  # drop ASCII punctuation
    cleaned = re.sub(r'\r', ' ', cleaned)  # drop carriage returns
    cleaned = re.sub(r'\n', ' ', cleaned)  # drop newlines
    cleaned = re.sub(r'\w*\d\w*', ' ', cleaned)  # drop any token containing a digit
    cleaned = re.sub('[–£…»]', ' ', cleaned)  # drop symbols string.punctuation does not cover

    # Stop-word filtering happens before stemming, so the comparison is made
    # against the unstemmed word.
    words = [word for word in cleaned.split() if word not in stopword]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in words]

    if return_tokens:
        return tokens

    return ' '.join(tokens)

In [45]:
# Load the Loughran-McDonald master dictionary: one row per word, with one
# column per word-list category (Negative, Positive, Uncertainty, ...).
sentiment_words_df = pd.read_csv("sentiment_dictionary/Loughran-McDonald_MasterDictionary_1993-2023.csv")
sentiment_words_df.head()

Unnamed: 0,Word,Seq_num,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Strong_Modal,Weak_Modal,Constraining,Complexity,Syllables,Source
0,AARDVARK,1,664,2.69e-08,1.86e-08,4.05e-06,131,0,0,0,0,0,0,0,0,2,12of12inf
1,AARDVARKS,2,3,1.21e-10,8.23e-12,9.02e-09,1,0,0,0,0,0,0,0,0,2,12of12inf
2,ABACI,3,9,3.64e-10,1.11e-10,5.16e-08,7,0,0,0,0,0,0,0,0,3,12of12inf
3,ABACK,4,29,1.17e-09,6.33e-10,1.56e-07,28,0,0,0,0,0,0,0,0,2,12of12inf
4,ABACUS,5,9349,3.79e-07,3.83e-07,3.46e-05,1239,0,0,0,0,0,0,0,0,3,12of12inf


In [74]:
sentiment_words_df.columns

Index(['Word', 'Seq_num', 'Word Count', 'Word Proportion',
       'Average Proportion', 'Std Dev', 'Doc Count', 'Negative', 'Positive',
       'Uncertainty', 'Litigious', 'Strong_Modal', 'Weak_Modal',
       'Constraining', 'Complexity', 'Syllables', 'Source'],
      dtype='object')

In [46]:
# extract positive and negative words for workflow

In [54]:
# sentiment_words_df[sentiment_words_df['Positive'] > 0]['Word'].apply(data_cleaner, return_tokens=False)


125             abl
336           abund
338           abund
438         acclaim
477      accomplish
            ...    
85305           win
85427        winner
85428        winner
85429           win
85964        worthi
Name: Word, Length: 347, dtype: object

In [55]:
# Build the positive and negative lookup sets. Rows flagged for a category
# (column value > 0) are selected and each word is run through data_cleaner,
# so the set entries are normalised the same way as the document tokens.
positive_words = {
    data_cleaner(entry, return_tokens=False)
    for entry in sentiment_words_df.loc[sentiment_words_df['Positive'] > 0, 'Word']
}

negative_words = {
    data_cleaner(entry, return_tokens=False)
    for entry in sentiment_words_df.loc[sentiment_words_df['Negative'] > 0, 'Word']
}

In [48]:
positive_words

125             [abl]
336           [abund]
338           [abund]
438         [acclaim]
477      [accomplish]
             ...     
85305           [win]
85427        [winner]
85428        [winner]
85429           [win]
85964        [worthi]
Name: Word, Length: 347, dtype: object

In [56]:
def preprocess_document(doc):
    """Lower-case ``doc``, strip every character that is not a-z or whitespace,
    and return the remaining whitespace-separated words as a list."""
    lowered = doc.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return letters_only.split()

To improve this, preprocess the documents and the dictionary words in the same way, so that the document tokens can be matched against the word lists used below.

Use the same preprocessing as in the LDA workflow on both to compute the sentiment scores shown below.

In [59]:
def calculate_sentiment(doc_tokens, positive_words, negative_words):
    """Net sentiment of an already-tokenised document.

    Counts the tokens found in ``positive_words`` and in ``negative_words``
    and returns (positives - negatives) / number of tokens. An empty token
    sequence yields 0.
    """
    hits_positive = 0
    hits_negative = 0
    for token in doc_tokens:
        if token in positive_words:
            hits_positive += 1
        if token in negative_words:
            hits_negative += 1

    n_tokens = len(doc_tokens)
    if not n_tokens:
        return 0
    return (hits_positive - hits_negative) / n_tokens

In [61]:
# Two hand-written example sentences used as a sanity check of the scorer.
documents = [
    "The company had a great quarter with significant growth.",
    "There were many challenges and losses in the last quarter."
]

# Calculate and print sentiment scores
for doc in documents:

    # Tokenise with data_cleaner so the tokens are normalised the same way as
    # the entries of positive_words / negative_words.
    preprocessed_doc = data_cleaner(doc, return_tokens=True)

    score = calculate_sentiment(preprocessed_doc, positive_words, negative_words)
    print(f'Document: {doc}\nSentiment Score: {score}\n')

Document: The company had a great quarter with significant growth.
Sentiment Score: 0.2

Document: There were many challenges and losses in the last quarter.
Sentiment Score: -0.4



The preprocessing and the code above seem to make sense.

So we can run this dictionary-based sentiment scoring, and we can also try the other code workflows.

In [None]:
# use this to count the number of words

In [63]:
# Load the filings file (JSON Lines: one record per line) into a DataFrame.
annual_report_df = pd.read_json("raw_data/sec_us_phrama_all_company_filing_meta_with_text_w_7_7A_2011_2023.jsonl", lines = True)

In [64]:
annual_report_df.columns

Index(['ticker', 'formType', 'accessionNo', 'cik', 'companyNameLong',
       'companyName', 'linkToFilingDetails', 'description', 'linkToTxt',
       'filedAt', 'documentFormatFiles', 'periodOfReport', 'entities', 'id',
       'seriesAndClassesContractsInformation', 'linkToHtml', 'linkToXbrl',
       'dataFiles', 'effectivenessDate', 'Text_7', 'Text_7A'],
      dtype='object')

In [65]:
# Clean every Text_7 entry with data_cleaner via pandarallel's parallel_apply;
# return_tokens=True gives one list of stemmed tokens per row.
section7_text_cleaned = annual_report_df["Text_7"].parallel_apply(data_cleaner, return_tokens=True)

In [73]:
annual_report_df["Text_7"]

# it is right in the below, as you've got the same result so make sure you do this!!

0        Item 7. MANAGEMENT&#8217;S DISCUSSION AND ANA...
1        Item 7. MANAGEMENT&#8217;S DISCUSSION AND ANA...
2        Item 7. MANAGEMENT&#8217;S DISCUSSION AND ANA...
3        Item 7. MANAGEMENT&#8217;S DISCUSSION AND ANA...
4                                                        
                              ...                        
8376     Item 7. Management&#8217;s Discussion and Ana...
8377     Item 7. Management&#8217;s Discussion and Ana...
8378     Item 7. Management&#8217;s Discussion and Ana...
8379     Item 7. Management&#8217;s Discussion and Ana...
8380     Item 7. Management&#8217;s Discussion and Ana...
Name: Text_7, Length: 8381, dtype: object

In [66]:
# Store the token lists on the frame so the scoring step can read them per row.
annual_report_df["section7_cleaned"] = section7_text_cleaned

In [67]:
annual_report_df["section7_cleaned"]

0       [item, manag, discuss, analysi, financi, condi...
1       [item, manag, discuss, analysi, financi, condi...
2       [item, manag, discuss, analysi, financi, condi...
3       [item, manag, discuss, analysi, financi, condi...
4                                                      []
                              ...                        
8376    [item, manag, discuss, analysi, financi, condi...
8377    [item, manag, discuss, analysi, financi, condi...
8378    [item, manag, discuss, analysi, financi, condi...
8379    [item, manag, discuss, analysi, financi, condi...
8380    [item, manag, discuss, analysi, financi, condi...
Name: section7_cleaned, Length: 8381, dtype: object

In [70]:
# Score each row's token list with calculate_sentiment; the two word sets are
# forwarded as keyword arguments to every call.
annual_report_df["section7_cleaned_sentiment"] = annual_report_df["section7_cleaned"].parallel_apply(calculate_sentiment, positive_words = positive_words, negative_words = negative_words)

In [71]:
# Number of rows whose net sentiment score is above zero
positive_count = (annual_report_df["section7_cleaned_sentiment"] > 0).sum()

# Number of rows whose net sentiment score is below zero
negative_count = (annual_report_df["section7_cleaned_sentiment"]  < 0).sum()

# Number of rows scoring exactly zero. Caveat: calculate_sentiment returns 0
# for an empty token list, so rows with no cleaned tokens are counted here
# together with rows whose positive and negative hits cancel out.
zero_count = (annual_report_df["section7_cleaned_sentiment"]  == 0).sum()

In [72]:
positive_count, negative_count, zero_count

(np.int64(956), np.int64(5496), np.int64(1929))

### Deal with uncertainty as required

These are the words that G. used previously.

In [78]:
# Build the uncertainty lookup set: take the dictionary rows whose
# 'Uncertainty' column is > 0 and normalise each word with data_cleaner.
uncertainty_words = {
    data_cleaner(entry, return_tokens=False)
    for entry in sentiment_words_df.loc[sentiment_words_df['Uncertainty'] > 0, 'Word']
}

In [79]:
def count_uncertainty_words(words, uncertainty_words):
    """Return how many entries of ``words`` are members of ``uncertainty_words``.

    Repeated words are counted once per occurrence.
    """
    matches = 0
    for token in words:
        if token in uncertainty_words:
            matches += 1
    return matches

In [80]:
def calculate_uncertainty_score(text, uncertainty_words):
    """Share of cleaned tokens in ``text`` that are uncertainty words.

    ``text`` is tokenised with data_cleaner; the number of tokens found in
    ``uncertainty_words`` is divided by the total token count. If cleaning
    leaves no tokens the score is 0.
    """
    tokens = data_cleaner(text, return_tokens=True)
    hits = count_uncertainty_words(tokens, uncertainty_words)
    if not tokens:
        return 0
    # Normalise by the number of cleaned tokens.
    return hits / len(tokens)

In [81]:
# Sanity check of the uncertainty score on a single hand-written sentence.
text = "I am very uncertain about the my future financial prospects"
uncertainty_score = calculate_uncertainty_score(text, uncertainty_words)
print(f'Uncertainty Score: {uncertainty_score}')

Uncertainty Score: 0.25


In [None]:
# kind of works - just use basic score words and this could work well.

# expand and do it

Next step: try different score measures for this procedure.


Try other drafts of the paper for later submission.