<a href="https://colab.research.google.com/github/Volyzte/FinancialTextSentimentTools/blob/main/FinancialTextSentimentTools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import string
import nltk
nltk.download('punkt')

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Load the pretrained 'yiyanghkust/finbert-tone' checkpoint as a 3-label
# sequence classifier, together with its matching tokenizer.
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Sentiment-analysis pipeline shared by the rest of the notebook;
# ProspectusContents.finbert_sentiment_count() calls it once per sentence and
# compares the returned label against 'Positive' / 'Negative'.
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
class LMAnalysisTools:
  """
  Word-list based sentiment tagging using the Loughran-McDonald (LM) Master
  Dictionary.

  The dictionary CSV is expected to have a "Word" column plus "Positive" and
  "Negative" columns in which the value "0" means "not flagged". Flagged words
  are kept in two sorted lists (positive_words_dict / negative_words_dict) so
  they can be looked up with binary_search().
  """

  # CSV file used when no explicit path is given; a relative path is resolved
  # against the current working directory.
  DEFAULT_DICTIONARY_PATH = 'Loughran-McDonald_MasterDictionary_1993-2023.csv'

  # Class-level fallback so the loaders also work on an instance whose
  # __init__ has not run.
  dictionary_path = DEFAULT_DICTIONARY_PATH

  def __init__(self, dictionary_path=DEFAULT_DICTIONARY_PATH):
    """
    Loads both word lists from the dictionary CSV.
    Args:
      dictionary_path: path of the LM Master Dictionary CSV file
    Raises:
      OSError: if the CSV file cannot be opened
      KeyError: if an expected column is missing from the CSV
    """
    self.dictionary_path = dictionary_path
    self.positive_words_dict = self.load_positive_words()
    self.negative_words_dict = self.load_negative_words()

  def _load_flagged_words(self, column):
    """
    Returns the sorted list of words whose value in `column` is not "0"
    Args:
      column: name of the CSV column to test ("Positive" or "Negative")
    """
    flagged = []
    # newline='' is what the csv module asks for; 'utf-8-sig' also accepts a
    # file saved with a byte-order mark, which would otherwise corrupt the
    # first header name and make row["Word"] fail.
    with open(self.dictionary_path, mode='r', newline='', encoding='utf-8-sig') as csv_file:
      for row in csv.DictReader(csv_file):
        if row[column] != "0":
          flagged.append(row["Word"])

    # binary_search() is only correct on sorted input, so enforce the order
    # here instead of trusting the row order of the file.
    flagged.sort()
    return flagged

  def load_positive_words(self):
    """
    Loads positive word list from LM Master Dictionary
    Returns:
      a sorted list of the words flagged in the "Positive" column
    """
    return self._load_flagged_words("Positive")

  def load_negative_words(self):
    """
    Loads negative word list from LM Master Dictionary
    Returns:
      a sorted list of the words flagged in the "Negative" column
    """
    return self._load_flagged_words("Negative")

  def binary_search(self, word_list, target_word):
    """
    Searches for a word in a sorted list
    Args:
      word_list: a list of words sorted in ascending order
      target_word: the word that is searched on the word_list
    Returns:
      the index of target_word in word_list (the leftmost one if the word is
      repeated), or None when the word is not present
    """
    import bisect  # function-scope import: the file header does not import it

    index = bisect.bisect_left(word_list, target_word)
    if index < len(word_list) and word_list[index] == target_word:
      return index  # Word found, return its index

    return None  # Word not found

  def process_sentence(self, sentence):
    """
    Separates the words of a sentence and puts them in a list in uppercase format
    Args:
      sentence: the text to split
    Returns:
      the whitespace-separated words of the sentence, with all ASCII
      punctuation removed and converted to uppercase
    """
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    cleaned = sentence.translate(translator)

    # Split the sentence into words and convert them to uppercase
    return [word.upper() for word in cleaned.split()]

  def lm_sentiment_analysis(self, word_array):
    """
    Outputs a string (positive, negative, or neutral) based on the LM sentiment analysis of the inputted words
    Args:
      word_array: list of uppercase words, e.g. the output of process_sentence
    Returns:
      "negative" if at least one word counts as negative, otherwise
      "positive" if at least one word counts as positive, otherwise "neutral"
    """
    positive_hits = 0
    negative_hits = 0

    for word in word_array:
      # A word found in the positive list is never tested against the
      # negative list, so it is counted at most once.
      if self.binary_search(self.positive_words_dict, word) is not None:
        positive_hits += 1
      elif self.binary_search(self.negative_words_dict, word) is not None:
        negative_hits += 1

    # Any negative word outweighs every positive word.
    if negative_hits:
      return "negative"
    if positive_hits:
      return "positive"
    return "neutral"

In [None]:
# Shared LMAnalysisTools instance; constructing it loads the positive and
# negative word lists. ProspectusContents.lm_sentiment_count() reads this
# global, so this cell has to run before that method is called.
lmTools = LMAnalysisTools()

In [None]:
class ProspectusContents:
  """
  Text of one prospectus file together with simple statistics about it.

  Attributes set by the constructor:
    contents: path of the file that was read
    contents_as_string: the whole file as one string
    sentences: list of sentences produced by nltk.sent_tokenize
    letters: number of alphabetic characters
    words: number of whitespace-separated tokens
    sentence_count: number of '.', '!' and '?' characters
    readability: Coleman-Liau index computed from the three counts above
  """

  # Encoding handed to open(); None keeps the platform default. Kept at class
  # level so read_contents_as_string() also works if __init__ has not run.
  encoding = None

  def __init__(self, fileName, encoding=None):
    """
    Reads the file and computes all statistics.
    Args:
      fileName: path of the text file to analyse
      encoding: text encoding of the file; None uses the platform default
    Raises:
      OSError: if the file cannot be read (FileNotFoundError when missing)
    """
    self.contents = fileName
    self.encoding = encoding
    self.contents_as_string = self.read_contents_as_string()
    self.sentences = self.separate_sentences()
    self.letters = self.count_letters()
    self.words = self.count_words()
    self.sentence_count = self.count_sentences()
    self.readability = self.calculate_coleman_liau_index()

  def read_contents_as_string(self):
    """
    Returns the full text of the file at self.contents.
    Raises:
      OSError: if the file cannot be opened or read
    """
    # Errors are deliberately not converted into a return value: an error
    # message returned from here would be analysed as if it were the
    # prospectus text.
    with open(self.contents, 'r', encoding=self.encoding) as file:
      return file.read()

  def separate_sentences(self):
    """
    Returns self.contents_as_string split into a list of sentences.
    Any exception raised by nltk.sent_tokenize is propagated.
    """
    # Errors are not swallowed: returning a message string here would make
    # the sentiment counters iterate over its individual characters.
    return nltk.sent_tokenize(self.contents_as_string)

  def calculate_coleman_liau_index(self):
    """
    Returns the Coleman-Liau index 0.0588 * L - 0.296 * S - 15.8, where L is
    the number of letters and S the number of sentences per 100 words.
    Returns 0.0 when the text has no words, because the index is undefined
    for it.
    """
    if not self.words:
      return 0.0

    letters_per_100_words = (self.letters / self.words) * 100
    sentences_per_100_words = (self.sentence_count / self.words) * 100
    return 0.0588 * letters_per_100_words - 0.296 * sentences_per_100_words - 15.8

  def count_letters(self):
    """
    Returns the number of alphabetic characters in the text.
    """
    return sum(1 for char in self.contents_as_string if char.isalpha())

  def count_words(self):
    """
    Returns the number of whitespace-separated tokens in the text.
    """
    return len(self.contents_as_string.split())

  def count_sentences(self):
    """
    Returns the number of '.', '!' and '?' characters in the text.
    This is a character count and is independent of self.sentences.
    """
    sentence_endings = ('.', '!', '?')
    return sum(1 for char in self.contents_as_string if char in sentence_endings)

  def _print_sentiment_counts(self, source, labels, positive_label, negative_label):
    """
    Tallies the labels and prints one line per sentiment class.
    Args:
      source: name of the method shown in the printed lines
      labels: iterable with one label per sentence
      positive_label: label value counted as positive
      negative_label: label value counted as negative; every other value is
        counted as neutral
    """
    positive_count = 0
    negative_count = 0
    neutral_count = 0

    for label in labels:
      if label == positive_label:
        positive_count += 1
      elif label == negative_label:
        negative_count += 1
      else:
        neutral_count += 1

    print(f"Count of {source} Positive sentences: {positive_count}")
    print(f"Count of {source} Negative sentences: {negative_count}")
    print(f"Count of {source} Neutral sentences: {neutral_count}")

  def finbert_sentiment_count(self):
    """
    Prints the count of positive, negative, and neutral sentences according
    to the module-level FinBERT pipeline `nlp`.
    """
    # Truncate over-long "sentences" (e.g. tables the sentence splitter could
    # not break up) to BERT's 512-token input limit instead of failing on them.
    labels = (
      nlp(sentence, truncation=True, max_length=512)[0]['label']
      for sentence in self.sentences
    )
    self._print_sentiment_counts("FinBERT", labels, 'Positive', 'Negative')

  def lm_sentiment_count(self):
    """
    Prints the count of positive, negative, and neutral sentences according
    to the LM word lists held by the module-level `lmTools` instance.
    """
    labels = (
      lmTools.lm_sentiment_analysis(lmTools.process_sentence(sentence))
      for sentence in self.sentences
    )
    self._print_sentiment_counts("LM", labels, "positive", "negative")
