In [1]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import numpy as np
from datetime import datetime
import yfinance as yf
import os
import glob
import regex as re
import csv
import statistics

In [2]:
def get_transcript(path):
    """Load one transcript from a CSV file and return it as a list of lines.

    The transcript text is taken from the first column of the third data row
    (``iloc[2]``) of the CSV at *path*.  It is then cleaned and split on line
    breaks.

    Args:
        path: Location of the CSV file, passed straight to ``pandas.read_csv``.

    Returns:
        list[str]: The cleaned transcript, one entry per line of text.
    """
    mytranscript = pd.read_csv(path).iloc[[2]].values[0][0]
    # Keep only letters, digits, the punctuation . , : ! ' plus newlines and spaces.
    mytranscript = re.sub(r'[^A-Za-z0-9.,:!\'\n ]', '', mytranscript)
    # Force a space after every full stop so glued sentences ("end.Next") separate.
    mytranscript = mytranscript.replace(".", ". ")
    # Collapse runs of whitespace into a single space without touching "\n".
    # Raw string: the previous non-raw literal relied on the invalid escape "\S".
    mytranscript = re.sub(r'[^\S\n]+', ' ', mytranscript)
    return mytranscript.splitlines()

def split_transcript(mytranscript):
    """Split a transcript into the prepared remarks and the Q&A section.

    The lines are scanned from the third one onwards (the first two are never
    considered) for the operator line that opens the Q&A.

    Args:
        mytranscript: Sequence of transcript lines.

    Returns:
        tuple: ``(transcript_safe_harbour, transcript_questions)`` as two
        slices of *mytranscript*, or ``("", "")`` when no split point is found.
    """
    for position, line in enumerate(mytranscript):
        # Lower-case and blank out all punctuation except ":" so the keyword
        # checks below are not disturbed by brackets, commas, etc.
        normalized = re.sub(r'[^\w\s:]', ' ', line.lower())
        if position < 2:
            continue

        operator_tag = "operator:" in normalized
        mentions_question = "question" in normalized

        if operator_tag and (
            mentions_question
            or "go ahead" in normalized
            or "operator instructions" in normalized
        ):
            # The operator line itself opens the Q&A section.
            cut = position
        elif mentions_question and "operator" in normalized:
            # The operator is only mentioned (no "operator:" tag): this line
            # still belongs to the prepared remarks.
            cut = position + 1
        elif operator_tag and "first" in normalized:
            cut = position
        else:
            continue

        return mytranscript[:cut], mytranscript[cut:]

    return "", ""

def get_file_speaker_names(sector, stock):
    """Read the speaker names stored for one stock.

    The names are loaded from ``sectors/<sector>/<stock>/speaker names.csv``
    (relative to the current working directory), one tab-delimited record per
    line.

    Args:
        sector: Name of the sector directory.
        stock: Name of the stock directory inside the sector.

    Returns:
        numpy.ndarray: String array of speaker names.  ``ndmin=1`` guarantees
        at least one dimension, so a file holding a single name still yields
        an iterable array instead of a 0-d scalar array.
    """
    read_path = "sectors/" + sector + "/" + stock + "/" + "speaker names.csv"
    speaker_names = np.loadtxt(read_path, delimiter='\t', dtype=str, ndmin=1)
    return speaker_names
    
def find_analyst_names(speaker_names, transcript_questions):
    """Find the analyst names for a single transcript.

    A speaker is taken to be an analyst when an operator line mentions one of
    the words of the speaker's name and the very next line contains that word
    too (i.e. the operator introduces the person who speaks next).  Names
    containing the word "unidentified" are added whenever an operator line is
    seen.

    Args:
        speaker_names: Iterable of speaker names (strings).
        transcript_questions: Lines of the Q&A section.

    Returns:
        list[str]: Analyst names without duplicates, in arbitrary order.
    """
    non_analyst_markers = ("representative", "corporate", "company")
    collected = []

    # The last line is never inspected because each check looks one line ahead.
    for position in range(len(transcript_questions) - 1):
        # Lower-case and replace all punctuation except ":" with a space.
        cleaned = re.sub(r'[^\w\s:]', ' ', transcript_questions[position].lower())
        if not ("operator:" in cleaned or "operator :" in cleaned):
            continue

        # Operator lines that close the call or hand it over do not introduce
        # an analyst.
        wraps_up = (
            ("end" in cleaned)
            or ("closing" in cleaned)
            or ("turn" in cleaned and "over" in cleaned)
        )

        for name in speaker_names:
            lowered_name = name.lower()
            if lowered_name == "operator" or any(
                marker in lowered_name for marker in non_analyst_markers
            ):
                continue

            for token in name.split():
                token = token.lower()
                # len(token) > 2 keeps initials such as the "A" in
                # "A Gayn Erickson" from matching ordinary words in the line.
                if (
                    len(token) > 2
                    and not wraps_up
                    and " " + token + " " in cleaned
                    and token in transcript_questions[position + 1].lower()
                ):
                    collected.append(name)

            # Catches names such as "Unidentified Analyst".
            if "unidentified" in lowered_name.split():
                collected.append(name)

    return list(set(collected))

def get_analyst_management_sentences(analyst_names, transcript_questions):
    """Separate the Q&A lines into analyst lines and management lines.

    Analyst lines are looked for from the fifth line up to (not including) the
    last line.  A line counts as an analyst line when its speaker (the text
    before the first ":") is a known analyst name, or when the speaker label
    contains the first or last word of a known analyst name and the previous
    line was spoken by the operator.  In the second case the speaker label is
    appended to *analyst_names* (the caller's list is modified in place), e.g.

        Operator: Our first question is from Jeffrey Van Sinderen ...
        JeffreySinderen: Good morning, everyone ...

    registers "JeffreySinderen" as an analyst name.

    Management lines are all lines except the last two that are neither
    spoken by the operator nor already collected as analyst lines.

    Args:
        analyst_names: List of analyst names; may be extended by this call.
        transcript_questions: Lines of the Q&A section.

    Returns:
        tuple[list, list]: ``(analyst_sentences, management_sentences)``.
    """

    def speaker_of(bubble):
        # Everything before the first colon is the speaker label.
        return bubble[:bubble.find(":")]

    def is_operator(speaker):
        return speaker.lower() in ("operator", "operator ")

    def spoken_by_operator(bubble):
        lowered = bubble.lower()
        return ("operator:" in lowered) or ("operator :" in lowered)

    analyst_sentences = []
    for position in range(4, len(transcript_questions) - 1):
        bubble = transcript_questions[position]
        speaker = speaker_of(bubble)

        for known_name in analyst_names:
            if bubble in analyst_sentences:
                # Already collected; nothing else can change for this line.
                break
            if speaker in analyst_names:
                analyst_sentences.append(bubble)
            elif is_operator(speaker):
                # Operator lines are never analyst lines, whatever the name.
                break
            else:
                parts = known_name.split()
                if (parts[0] in speaker or parts[-1] in speaker) and spoken_by_operator(
                    transcript_questions[position - 1]
                ):
                    analyst_names.append(speaker)
                    analyst_sentences.append(bubble)

    management_sentences = [
        transcript_questions[position]
        for position in range(len(transcript_questions) - 2)
        if not is_operator(speaker_of(transcript_questions[position]))
        and transcript_questions[position] not in analyst_sentences
    ]
    return analyst_sentences, management_sentences


In [3]:
class TrieNode:
    """One state of the keyword trie used by ``AhoCorasick``."""

    def __init__(self):
        # Outgoing edges: character -> TrieNode.
        self.children = {}
        # True when a keyword ends at this state, either the state's own
        # keyword or one reached through the chain of fail links.
        self.is_word = False
        # Failure link (state for the longest proper suffix that is also in
        # the trie); stays None for the root.
        self.fail = None
        # The keyword that ends exactly at this state, or None.
        self.word = None


class AhoCorasick:
    """Aho-Corasick automaton that deletes a fixed set of keywords from text.

    Args:
        words: Iterable of keyword strings to remove.
    """

    def __init__(self, words):
        self.root = TrieNode()
        self.build_trie(words)
        self.build_ac_automata()

    def build_trie(self, words):
        """Insert every keyword of *words* into the trie."""
        for word in words:
            node = self.root
            for char in word:
                if char not in node.children:
                    node.children[char] = TrieNode()
                node = node.children[char]
            node.is_word = True
            node.word = word

    def build_ac_automata(self):
        """Compute the fail links breadth-first and propagate ``is_word``."""
        # Local import keeps this block self-contained; deque gives O(1)
        # pops from the left, unlike list.pop(0).
        from collections import deque

        queue = deque()
        for node in self.root.children.values():
            node.fail = self.root
            queue.append(node)

        while queue:
            node = queue.popleft()
            for char, child in node.children.items():
                queue.append(child)
                fail_node = node.fail
                while fail_node is not None and char not in fail_node.children:
                    fail_node = fail_node.fail
                if fail_node is None:
                    child.fail = self.root
                else:
                    child.fail = fail_node.children[char]
                child.is_word |= child.fail.is_word

    def remove_words(self, text):
        """Return *text* with every keyword that occurs in it removed.

        The text is scanned once.  Each time a keyword is detected, all
        occurrences of that keyword are deleted from the result built so far
        (``str.replace``).

        Args:
            text: String to clean.

        Returns:
            str: The text without the detected keywords.
        """
        node = self.root
        new_text = text
        for char in text:
            while node is not None and char not in node.children:
                node = node.fail
            if node is None:
                node = self.root
                continue
            node = node.children[char]
            if node.is_word:
                # ``is_word`` may have been inherited through a fail link, in
                # which case ``node.word`` is None.  Walk the fail chain and
                # remove every keyword that really ends at this position.
                hit = node
                while hit is not None:
                    if hit.word:
                        new_text = new_text.replace(hit.word, '')
                    hit = hit.fail
        return new_text


# Usage
list1 = ['string1', 'string2', 'string3']
sentence = "string1 is string2 is string is string3 is string"
ac = AhoCorasick(list1)
new_sentence = ac.remove_words(sentence)  # deletes every string of list1 that is present in the sentence
# Collapse the gaps left by the removed words into single spaces (newlines are kept).
# Raw string: the non-raw literal relied on the invalid escape sequence "\S".
new_sentence = re.sub(r'[^\S\n]+', ' ', new_sentence)
new_sentence

' is is string is is string'

In [None]:
!pip install --upgrade "protobuf<=3.20.1"

In [4]:
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
import numpy as np
import textstat

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load the pretrained 'yiyanghkust/finbert-tone' checkpoint as a 3-label
# sequence classifier, together with its matching tokenizer.
sentiment_finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
sentiment_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Text-classification pipeline used for all sentiment scoring in this file.
sentiment_nlp = pipeline("text-classification", model=sentiment_finbert, tokenizer=sentiment_tokenizer)


In [None]:
# Quick manual check: run the sentiment pipeline on one sample paragraph and
# display the raw pipeline output.
mystr = "I mean, I can comment a little bit about it. I mean, the corridor that we did very well in with Cuba and there is a I don't know how else to explain it, but there's a black market currency and a regular currency. And people are basically choosing to do business in cash in Cuba because they can buy way more on the black market versus paying for things here, where we have to obviously not do that and that's really the situation. And it's and again, it's not just for us, it's for all of our competitors as well. They are all seeing the same deterioration."
result = sentiment_nlp(mystr)
result

In [5]:
import nltk
nltk.download('punkt')

def map_sentiments(sentiment_result):
    """Turn a sentiment pipeline result into a signed score and a label.

    Only the first entry of *sentiment_result* is used.

    Args:
        sentiment_result: Sequence whose first item is a dict with the keys
            ``'label'`` and ``'score'``.

    Returns:
        tuple | None: ``(score, "positive")`` for a ``'Positive'`` label,
        ``(-score, "negative")`` for ``'Negative'``, ``(0, "neutral")`` for
        ``'Neutral'``; ``None`` for any other label.
    """
    top = sentiment_result[0]
    label = top['label']

    if label == 'Positive':
        return top['score'], "positive"
    if label == 'Negative':
        return -1 * top['score'], "negative"
    if label == 'Neutral':
        return 0, "neutral"
    return None


def split_paragraph_into_sentences(temp):
    """Return the sentences of the paragraph *temp* as produced by ``nltk.sent_tokenize``."""
    return nltk.sent_tokenize(temp)

def get_NLP_values(liststr):
    """Compute sentiment and readability values for a piece of text.

    Args:
        liststr: Text to analyse; an empty input short-circuits to zeros.

    Returns:
        tuple: ``(sentiment, flesch_score, gunning_fog_score)``.  For empty
        input this is ``(0, 0, 0)``.  Otherwise ``sentiment`` is whatever
        ``map_sentiments`` returns for the pipeline output, i.e. a
        ``(score, label)`` pair rather than a bare number.
    """
    if len(liststr) != 0:
        # Sentiment: pipeline output reduced by map_sentiments.
        sentiment = map_sentiments(sentiment_nlp(liststr))

        # Word complexity: Flesch reading ease and Gunning fog index.
        reading_ease = textstat.flesch_reading_ease(liststr)
        fog_index = textstat.gunning_fog(liststr)

        return sentiment, reading_ease, fog_index

    return 0, 0, 0
        
# FLS classification
# Load the pretrained 'yiyanghkust/finbert-fls' checkpoint as a 3-label
# sequence classifier with its tokenizer, and wrap both in a
# text-classification pipeline.  The feature functions in this file read the
# labels 'Specific FLS', 'Non-specific FLS' and 'Not FLS' from its output.
fls_finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)
fls_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
fls_nlp = pipeline("text-classification", model=fls_finbert, tokenizer=fls_tokenizer)

[nltk_data] Downloading package punkt to /Users/victor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Pre release:
# 5. Whole pre-release - net sentiment
# 6. Whole pre-release - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 7. Whole pre-release - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 8. Whole pre-release - net word complexity
#
# 9. Specific forward looking statement - sentiment
# 10. Specific forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 11. Specific forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 12. Specific forward looking statement - word complexity
#
# 13. Non Specific Forward looking statement - sentiment 
# 14. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 15. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 16. Non Specific Forward looking statement - word complexity
#
# 17. Not Forward looking statement - sentiment
# 18. Not Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 19. Not Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 20. Not Forward looking statement - word complexity
#
# 21: #of_specific/#of_non_specific+#of_not_fls+#of_specific
# 22: #of_non_specific/#of_non_specific+#of_not_fls+#of_specific

def getFeature5to22(pre_release, speaker_names):
    """Compute features 5 to 22 for the prepared-remarks part of a call.

    Every speech bubble is stripped of speaker prefixes, scored for Flesch
    reading ease and split into sentences.  Each sentence is classified by
    ``fls_nlp`` as 'Specific FLS', 'Non-specific FLS' or 'Not FLS', and
    ``get_fls_features`` is run on each of the three groups.

    The computation is best-effort: a bubble that cannot be processed is
    skipped, and any value that cannot be computed is reported as 0.

    Args:
        pre_release: Iterable of speech bubbles (strings).
        speaker_names: Iterable of speaker names; ``name + ':'`` prefixes are
            removed from every bubble.

    Returns:
        list: 18 values in this order:
            5     mean sentiment over all classified sentences
            6, 7  share of positive / negative sentences
            8     mean Flesch reading ease over the speech bubbles
            9-12  ``get_fls_features`` values for 'Specific FLS'
            13-16 the same for 'Non-specific FLS'
            17-20 the same for 'Not FLS'
            21    share of 'Specific FLS' sentences
            22    share of 'Non-specific FLS' sentences
        Features 5-7 are 0 if ``get_fls_features`` failed for any group.
    """
    new_speaker_names = [word + ':' for word in speaker_names]
    ac = AhoCorasick(new_speaker_names)

    # Classifier labels, in the order their feature groups appear in the output.
    fls_labels = ('Specific FLS', 'Non-specific FLS', 'Not FLS')
    sentences_by_label = {label: [] for label in fls_labels}
    flesch_list = []

    try:
        for speech_bubble in pre_release:
            try:
                new_speech_bubble = ac.remove_words(speech_bubble)
                # Collapse runs of whitespace (newlines excepted) to one space.
                new_speech_bubble = re.sub(r'[^\S\n]+', ' ', new_speech_bubble)

                # Drop the leading space left behind by a removed speaker
                # name.  An empty bubble raises IndexError and is skipped.
                if new_speech_bubble[0] == " ":
                    new_speech_bubble = new_speech_bubble[1:]

                # Text complexity of the whole bubble.
                flesch_list.append(textstat.flesch_reading_ease(new_speech_bubble))

                sentences = split_paragraph_into_sentences(new_speech_bubble)
                fls_results = fls_nlp(sentences)
                for sentence, fls_result in zip(sentences, fls_results):
                    bucket = sentences_by_label.get(fls_result['label'])
                    if bucket is not None:
                        bucket.append(sentence)
            except Exception:
                # Best effort: a bubble that cannot be processed is skipped.
                continue

        # One get_fls_features result per class; None marks a failed class.
        class_results = []
        for label in fls_labels:
            try:
                class_results.append(get_fls_features(sentences_by_label[label]))
            except Exception:
                class_results.append(None)

        # Features 9-20: four values per class, zeros for a failed class.
        per_class_features = []
        for class_result in class_results:
            if class_result is None:
                per_class_features.extend((0, 0, 0, 0))
            else:
                per_class_features.extend(class_result[:4])

        # Features 21-22: share of specific / non-specific FLS sentences.
        counts = [len(sentences_by_label[label]) for label in fls_labels]
        total = sum(counts)
        feature_extract_21 = counts[0] / total if total else 0
        feature_extract_22 = counts[1] / total if total else 0

        # Features 5-7 aggregate the three classes and are only filled in
        # when every class was scored successfully.
        net_sentiment_list = []
        net_positive = net_negative = net_neutral = 0
        if all(class_result is not None for class_result in class_results):
            for class_result in class_results:
                net_sentiment_list.extend(class_result[4])
                net_positive += class_result[5]
                net_negative += class_result[6]
                net_neutral += class_result[7]

        feature_extract_5 = statistics.mean(net_sentiment_list) if net_sentiment_list else 0
        net_total = net_negative + net_positive + net_neutral
        feature_extract_6 = net_positive / net_total if net_total else 0
        feature_extract_7 = net_negative / net_total if net_total else 0
        feature_extract_8 = statistics.mean(flesch_list) if flesch_list else 0

        fea_ext_list5to22 = (
            [feature_extract_5, feature_extract_6, feature_extract_7, feature_extract_8]
            + per_class_features
            + [feature_extract_21, feature_extract_22]
        )
    except Exception:
        fea_ext_list5to22 = [0] * 18

    return fea_ext_list5to22


def get_fls_features(flslist):
    """Compute sentiment and complexity features for one group of sentences.

    Each sentence is scored with ``sentiment_nlp`` and ``map_sentiments``.
    Errors raised by that scoring are not caught here.

    Args:
        flslist: List of sentences (strings).

    Returns:
        tuple: ``(mean_sentiment, positive_share, negative_share,
        flesch_reading_ease, sentiment_scores, n_positive, n_negative,
        n_neutral)``.  The mean and the shares are 0 for an empty list; the
        reading ease is 0 if it cannot be computed.
    """
    fls_sentiment_list = []
    net_positive = 0
    net_negative = 0
    net_neutral = 0

    for each_fls in flslist:
        sentiment_score, positivity_value = map_sentiments(sentiment_nlp(each_fls))
        fls_sentiment_list.append(sentiment_score)

        if positivity_value == "positive":
            net_positive += 1
        elif positivity_value == "negative":
            net_negative += 1
        else:
            net_neutral += 1

    # Explicit guards replace the former bare ``except:`` blocks, which were
    # only there to absorb the empty-list and zero-denominator cases.
    total = net_positive + net_negative + net_neutral
    feature_extract_1 = statistics.mean(fls_sentiment_list) if fls_sentiment_list else 0
    feature_extract_2 = net_positive / total if total else 0
    feature_extract_3 = net_negative / total if total else 0

    try:
        # Complexity of the group, measured on the sentences joined together.
        feature_extract_4 = textstat.flesch_reading_ease(' '.join(flslist))
    except Exception:
        feature_extract_4 = 0

    return feature_extract_1, feature_extract_2, feature_extract_3, feature_extract_4, fls_sentiment_list, net_positive, net_negative, net_neutral


In [8]:
# Questions & Answers:
# 23. Whole Q&A - net sentiment "net_sentiment_list"
# 24. Whole Q&A – #of_NETpositive(sentiment)/#of_NETnegative+NETpositive+NETneutral(sentiment)
# 25. Whole Q&A – #of_NETnegative(sentiment)/#of_NETnegative+NETpositive+NETneutral(sentiment)
# 26. Whole Q&A - net word complexity "net_text_complex_list"
# 
# 27. all question (aggregate) - sentiment "question_sentiment_list"
# 28. all question (aggregate) – #of_Qpositive(sentiment)/#of_Qnegative+Qpositive+Qneutral(sentiment)
# 29. all question (aggregate) – #of_Qnegative(sentiment)/#of_Qnegative+Qpositive+Qneutral(sentiment)
# 30. all question (aggregate) - net word complexity "questions_complex_list"
#
# 31. all reply (aggregate) - sentiment "reply_sentiment_list"
# 32. all reply (aggregate) – #of_Rpositive(sentiment)/#of_Rnegative+Rpositive+Rneutral(sentiment)
# 33. all reply (aggregate) – #of_Rnegative(sentiment)/#of_Rnegative+Rpositive+Rneutral(sentiment)
# 34. all reply (aggregate) - net word complexity "reply_complex_list"


# For all replies (aggregate):
# 35. Specific forward looking statement - sentiment
# 36. Specific forward looking statement - #of_SFLSpositive(sentiment)/#of_SFLSnegative+SFLSpositive+SFLSneutral(sentiment)
# 37. Specific forward looking statement - #of_SFLSnegative(sentiment)/#of_SFLSnegative+SFLSpositive+SFLSneutral(sentiment)
#
# 38. Non Specific Forward looking statement - sentiment 
# 39. Non Specific Forward looking statement - #of_NSFLSpositive(sentiment)/#of_NSFLSnegative+NSFLSpositive+NSFLSneutral(sentiment)
# 40. Non Specific Forward looking statement - #of_NSFLSnegative(sentiment)/#of_NSFLSnegative+NSFLSpositive+NSFLSneutral(sentiment)
#
# 41. Not Forward looking statement - sentiment
# 42. Not Forward looking statement - #of_NFLSpositive(sentiment)/#of_NFLSnegative+NFLSpositive+NFLSneutral(sentiment)
# 43. Not Forward looking statement - #of_NFLSnegative(sentiment)/#of_NFLSnegative+NFLSpositive+NFLSneutral(sentiment)


# 1. get rid of the speaker names from each "speech bubble" (i.e. analyst_speech, management_speech)

# 2. FOR loop of each speech bubble
    # 2a. Parse them to sentences:
    # 2b. For each in sentences:
        #- classify the sentences (S-FLS, NS-FLS, N-FLS)
        #- Creates x3 FLS lists 
    # 2c. finds text complexity of each speech bubble
    # 2d. adds text complexity value of each speech bubble to net_text_complex_list 

# 3. Goes through each FLS list and find sentiment of each with the following conditions:
    # 3a. if sentence is s_FLS
        #- sentiment value is appended to "reply_sentiment_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #- SFLSnet_positive, SFLSnet_negative, or SFLSnet_neutral += 1 (total number of sentences with positive sentiments)
        #- Sentiment value is appended to "s_fls_sentiment_list"

    # 3b. if sentence is ns_FLS
        #- sentiment value is appended to "reply_sentiment_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #- NSFLSnet_positive, NSFLSnet_negative, or NSFLSnet_neutral += 1 (total number of sentences with positive sentiments)
        #- Sentiment value is appended to "ns_fls_sentiment_list"

    # 3c. if sentence is n_FLS
        #- sentiment value is appended to "reply_sentiment_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #- NFLSnet_positive, NFLSnet_negative, or NFLSnet_neutral += 1 (total number of sentences with positive sentiments)
        #- Sentiment value is appended to "n_fls_sentiment_list"

# 4. finds text complexity of s_FLS, ns_FLS, n_FLS

# 5. get features based on the mean of relevant lists

def getFeature23to43(analyst_speech, management_speech, speaker_names):
    """Compute features 23 to 43 for the Q&A part of a call.

    Analyst bubbles (questions) are scored sentence by sentence for sentiment.
    Management bubbles (replies) are split into sentences, classified by
    ``fls_nlp`` as 'Specific FLS', 'Non-specific FLS' or 'Not FLS', and each
    group is then scored through ``get_SentimentLists_from_FLS``.  Every
    bubble also contributes a Flesch reading-ease value.

    The computation is best-effort: a bubble that cannot be processed is
    skipped and an aggregate that cannot be computed is reported as 0.  If
    scoring the reply groups fails, all 21 features are 0.

    Args:
        analyst_speech: Iterable of analyst speech bubbles (strings).
        management_speech: Iterable of management speech bubbles (strings).
        speaker_names: Iterable of speaker names; ``name + ':'`` prefixes are
            removed from every bubble.

    Returns:
        list: 21 values in this order:
            23-26 whole Q&A: mean sentiment, positive share, negative share,
                  mean reading ease
            27-30 the same four values for the questions
            31-34 the same four values for the replies
            35-37 'Specific FLS' replies: mean sentiment, positive share,
                  negative share
            38-40 the same for 'Non-specific FLS'
            41-43 the same for 'Not FLS'
    """
    new_speaker_names = [word + ':' for word in speaker_names]
    ac = AhoCorasick(new_speaker_names)

    def _strip_speaker_names(speech_bubble):
        """Return the bubble without speaker prefixes, whitespace collapsed."""
        text = ac.remove_words(speech_bubble)
        text = re.sub(r'[^\S\n]+', ' ', text)
        # An empty result raises IndexError here; callers skip such bubbles.
        return text[1:] if text[0] == " " else text

    def _mean_or_zero(values):
        """Mean of *values*, or 0 when there is nothing to average."""
        return statistics.mean(values) if values else 0

    def _share(count, total):
        """count / total, or 0 when the total is zero."""
        return count / total if total else 0

    # Reply sentences grouped by the label assigned by the FLS classifier.
    n_flslist, s_flslist, ns_flslist = [], [], []
    fls_buckets = {
        'Not FLS': n_flslist,
        'Specific FLS': s_flslist,
        'Non-specific FLS': ns_flslist,
    }

    # Reading-ease values per bubble.
    questions_complex_list = []
    reply_complex_list = []
    net_text_complex_list = []

    # Sentiment scores per FLS class, for questions, replies and the whole Q&A.
    s_fls_sentiment_list = []
    n_fls_sentiment_list = []
    ns_fls_sentiment_list = []
    question_sentiment_list = []
    reply_sentiment_list = []
    net_sentiment_list = []

    net_positive = net_negative = net_neutral = 0
    Qpositive = Qnegative = Qneutral = 0
    Rpositive = Rnegative = Rneutral = 0
    SFLSpositive = SFLSnegative = SFLSneutral = 0
    NSFLSpositive = NSFLSnegative = NSFLSneutral = 0
    NFLSpositive = NFLSnegative = NFLSneutral = 0

    try:
        for speech_bubble in analyst_speech:
            try:
                text = _strip_speaker_names(speech_bubble)

                flesch_score = textstat.flesch_reading_ease(text)
                questions_complex_list.append(flesch_score)
                net_text_complex_list.append(flesch_score)

                for sentence in split_paragraph_into_sentences(text):
                    sentiment_score, positivity_value = map_sentiments(sentiment_nlp(sentence))
                    question_sentiment_list.append(sentiment_score)
                    net_sentiment_list.append(sentiment_score)

                    if positivity_value == "positive":
                        net_positive += 1
                        Qpositive += 1
                    elif positivity_value == "negative":
                        net_negative += 1
                        Qnegative += 1
                    else:
                        net_neutral += 1
                        Qneutral += 1
            except Exception:
                # Best effort: a bubble that cannot be processed is skipped.
                continue

        for speech_bubble in management_speech:
            try:
                text = _strip_speaker_names(speech_bubble)

                flesch_score = textstat.flesch_reading_ease(text)
                reply_complex_list.append(flesch_score)
                net_text_complex_list.append(flesch_score)

                sentences = split_paragraph_into_sentences(text)
                for sentence, fls_result in zip(sentences, fls_nlp(sentences)):
                    bucket = fls_buckets.get(fls_result['label'])
                    if bucket is not None:
                        bucket.append(sentence)
            except Exception:
                continue

        # for "n_flslist":
        n_fls_sentiment_list, reply_sentiment_list, Rpositive, Rnegative, Rneutral, NFLSpositive, NFLSnegative, NFLSneutral, net_positive, net_negative, net_neutral = get_SentimentLists_from_FLS(
            net_sentiment_list, n_flslist, n_fls_sentiment_list, reply_sentiment_list,
            Rpositive, Rnegative, Rneutral, NFLSpositive, NFLSnegative, NFLSneutral,
            net_positive, net_negative, net_neutral)

        # for "s_flslist":
        s_fls_sentiment_list, reply_sentiment_list, Rpositive, Rnegative, Rneutral, SFLSpositive, SFLSnegative, SFLSneutral, net_positive, net_negative, net_neutral = get_SentimentLists_from_FLS(
            net_sentiment_list, s_flslist, s_fls_sentiment_list, reply_sentiment_list,
            Rpositive, Rnegative, Rneutral, SFLSpositive, SFLSnegative, SFLSneutral,
            net_positive, net_negative, net_neutral)

        # for "ns_flslist":
        ns_fls_sentiment_list, reply_sentiment_list, Rpositive, Rnegative, Rneutral, NSFLSpositive, NSFLSnegative, NSFLSneutral, net_positive, net_negative, net_neutral = get_SentimentLists_from_FLS(
            net_sentiment_list, ns_flslist, ns_fls_sentiment_list, reply_sentiment_list,
            Rpositive, Rnegative, Rneutral, NSFLSpositive, NSFLSnegative, NSFLSneutral,
            net_positive, net_negative, net_neutral)
    except Exception:
        # Scoring failed before any feature could be derived.
        return [0] * 21

    net_total = net_positive + net_negative + net_neutral
    q_total = Qpositive + Qnegative + Qneutral
    r_total = Rpositive + Rnegative + Rneutral
    s_total = SFLSpositive + SFLSnegative + SFLSneutral
    ns_total = NSFLSpositive + NSFLSnegative + NSFLSneutral
    n_total = NFLSpositive + NFLSnegative + NFLSneutral

    return [
        # 23-26: whole Q&A.  Feature 23 is guarded like every other mean, so
        # an empty sentiment list no longer aborts the remaining features.
        _mean_or_zero(net_sentiment_list),
        _share(net_positive, net_total),
        _share(net_negative, net_total),
        _mean_or_zero(net_text_complex_list),
        # 27-30: questions
        _mean_or_zero(question_sentiment_list),
        _share(Qpositive, q_total),
        _share(Qnegative, q_total),
        _mean_or_zero(questions_complex_list),
        # 31-34: replies
        _mean_or_zero(reply_sentiment_list),
        _share(Rpositive, r_total),
        _share(Rnegative, r_total),
        _mean_or_zero(reply_complex_list),
        # 35-37: specific FLS replies
        _mean_or_zero(s_fls_sentiment_list),
        _share(SFLSpositive, s_total),
        _share(SFLSnegative, s_total),
        # 38-40: non-specific FLS replies
        _mean_or_zero(ns_fls_sentiment_list),
        _share(NSFLSpositive, ns_total),
        _share(NSFLSnegative, ns_total),
        # 41-43: not-FLS replies
        _mean_or_zero(n_fls_sentiment_list),
        _share(NFLSpositive, n_total),
        _share(NFLSnegative, n_total),
    ]

def get_SentimentLists_from_FLS(net_sentiment_list, THIS_flslist, THISfls_sentiment_list, reply_sentiment_list, Rpositive, Rnegative, Rneutral, FLSpositive, FLSnegative, FLSneutral, net_positive, net_negative, net_neutral):
    """Score every sentence of THIS_flslist and fold the results into the running tallies.

    Each sentence is passed through sentiment_nlp and map_sentiments. The
    resulting score is appended to THISfls_sentiment_list, reply_sentiment_list
    and net_sentiment_list (all three are modified in place), and the matching
    positive / negative / neutral counter of each of the three counter families
    (net_*, FLS*, R*) is increased by one. Any label other than "positive" or
    "negative" is counted as neutral.

    Returns a tuple of the two score lists followed by the nine updated
    counters. net_sentiment_list is updated in place but is not part of the
    returned tuple.
    """
    score_lists = (THISfls_sentiment_list, reply_sentiment_list, net_sentiment_list)

    for fls_sentence in THIS_flslist:
        score, label = map_sentiments(sentiment_nlp(fls_sentence))
        for scores in score_lists:
            scores.append(score)

        if label == "positive":
            net_positive, FLSpositive, Rpositive = net_positive + 1, FLSpositive + 1, Rpositive + 1
        elif label == "negative":
            net_negative, FLSnegative, Rnegative = net_negative + 1, FLSnegative + 1, Rnegative + 1
        else:
            net_neutral, FLSneutral, Rneutral = net_neutral + 1, FLSneutral + 1, Rneutral + 1

    return (
        THISfls_sentiment_list,
        reply_sentiment_list,
        Rpositive,
        Rnegative,
        Rneutral,
        FLSpositive,
        FLSnegative,
        FLSneutral,
        net_positive,
        net_negative,
        net_neutral,
    )


In [9]:
# With specific words analysis:
# Sentences that includes the word:
# all of these words can be plural (e.g. cost and costs)
# 44: "margin" - average sentiment
# 45: "margin" - pos/total sentiment
# 46: "margin" - neg/total sentiment

# 47: "cost" - average sentiment
# 48: "cost" - pos/total sentiment
# 49: "cost" - neg/total sentiment

# 50: "revenue" - average sentiment
# 51: "revenue" - pos/total sentiment
# 52: "revenue" - neg/total sentiment

# 53: "earnings or EBITDA" - average sentiment
# 54: "earnings or EBITDA" - pos/total sentiment
# 55: "earnings or EBITDA" - neg/total sentiment

# 56: "growth" - average sentiment
# 57: "growth" - pos/total sentiment
# 58: "growth" - neg/total sentiment

# 59: "leverage or debt" -  average sentiment
# 60: "leverage or debt" -  pos/total sentiment
# 61: "leverage or debt" -  neg/total sentiment

# 62: "industry or sector" – average sentiment
# 63: "industry or sector" – pos/total sentiment
# 64: "industry or sector" – neg/total sentiment

# 65: "operation" - average sentiment 
# 66: "operation" - pos/total sentiment
# 67: "operation" - neg/total sentiment

# 68: "cashflow" - average sentiment 
# 69: "cashflow" - pos/total sentiment
# 70: "cashflow" - neg/total sentiment

# 71: "dividend/share buyback" - average sentiment 
# 72: "dividend/share buyback" - pos/total sentiment
# 73: "dividend/share buyback" - neg/total sentiment


def deepCleanTranscript(mytranscript, speaker_names):
    """Join the transcript lines into a single string without "Name:" speaker labels.

    Args:
        mytranscript: iterable of transcript lines (strings); they are joined
            with single spaces. An empty list or empty string is accepted.
        speaker_names: speaker names; each one gets a ":" appended to form the
            label handed to AhoCorasick.

    Returns:
        The joined transcript after AhoCorasick.remove_words, with one leading
        space removed and every run of non-newline whitespace collapsed to a
        single space. Letter case is left untouched: getFeature44to73 searches
        the result for case-sensitive keywords such as " EBIT".
    """
    updatedTranscript = ' '.join(mytranscript)

    # presumably remove_words strips every occurrence of the given labels - confirm against AhoCorasick
    speaker_labels = [name + ':' for name in speaker_names]
    updatedTranscript = AhoCorasick(speaker_labels).remove_words(updatedTranscript)

    # startswith() is safe on an empty transcript, where indexing [0] would raise IndexError
    if updatedTranscript.startswith(" "):
        updatedTranscript = updatedTranscript[1:]

    # collapse runs of whitespace to one space without touching newlines
    updatedTranscript = re.sub(r'[^\S\n]+', ' ', updatedTranscript)

    return updatedTranscript

# Keyword groups behind features 44-73, in output order. Each group produces
# three consecutive features (mean sentiment, positive share, negative share).
# Matching is a case-sensitive substring test; the leading space anchors every
# keyword to the start of a word, so plurals such as "costs" still match.
_FEATURE_44_TO_73_KEYWORDS = (
    (" margin", " return"),  # 44-46
    (" cost",),  # 47-49
    (" revenue", " top line", " sales"),  # 50-52
    (" earning", " EBIT", " profit", " bottom line", " net income"),  # 53-55
    (" growth", " organic"),  # 56-58
    (" leverage", " debt"),  # 59-61
    (" industr", " economy", " economi", " sector"),  # 62-64 (" industr" also covers " industry")
    (" operation",),  # 65-67
    (" cashflow", " cash flow"),  # 68-70
    (" dividend", " buyback", " repurchase"),  # 71-73
)


def _sentiment_summary(scores, positive_count, negative_count):
    """Return [mean score, positive share, negative share]; all 0 when scores is empty."""
    if not scores:
        return [0, 0, 0]
    total = len(scores)
    return [statistics.mean(scores), positive_count / total, negative_count / total]


def getFeature44to73(mytranscript, speaker_names):
    """Return features 44-73: sentiment statistics for sentences mentioning key finance terms.

    The transcript is cleaned with deepCleanTranscript and split with
    split_paragraph_into_sentences. Every sentence containing a keyword of a
    group in _FEATURE_44_TO_73_KEYWORDS is scored with sentiment_nlp and
    map_sentiments; one sentence may count towards several groups.

    Args:
        mytranscript: transcript lines, passed to deepCleanTranscript.
        speaker_names: speaker names, passed to deepCleanTranscript.

    Returns:
        A list of 30 numbers: for each keyword group, in order, the mean
        sentiment score, the share of positive sentences and the share of
        negative sentences. All three are 0 for a group no sentence matched.
    """
    updatedTranscript = deepCleanTranscript(mytranscript, speaker_names)
    sentences = split_paragraph_into_sentences(updatedTranscript)

    tallies = [
        {"scores": [], "positive": 0, "negative": 0}
        for _ in _FEATURE_44_TO_73_KEYWORDS
    ]

    for sentence in sentences:
        # The sentiment of a sentence is computed at most once and reused for
        # every group it matches (assumes sentiment_nlp returns the same result
        # when given the same sentence again).
        sentiment = None
        for keywords, tally in zip(_FEATURE_44_TO_73_KEYWORDS, tallies):
            if not any(keyword in sentence for keyword in keywords):
                continue
            if sentiment is None:
                sentiment = map_sentiments(sentiment_nlp(sentence))
            sentiment_score, positivity_value = sentiment

            tally["scores"].append(sentiment_score)
            # any label other than positive/negative is neutral; neutral
            # sentences only contribute to the total (the length of "scores")
            if positivity_value == "positive":
                tally["positive"] += 1
            elif positivity_value == "negative":
                tally["negative"] += 1

    features = []
    for tally in tallies:
        features.extend(
            _sentiment_summary(tally["scores"], tally["positive"], tally["negative"])
        )
    return features

In [None]:
# Testing TF-IDF: build, by hand, the four document lists for one rolling window of four transcripts.
# The lists filled here are module-level, so the notebook cells that follow can read them.

sector = "tech"
stock = "AAPL"

# four transcript files of one stock; judging by the file names these are 2022 Q4 down to Q1
rolling_path_of_four_transcript = [
    'sectors/tech/AAPL/AAPL20224.csv', 
    'sectors/tech/AAPL/AAPL20223.csv',
    'sectors/tech/AAPL/AAPL20222.csv',
    'sectors/tech/AAPL/AAPL20221.csv'
    ]

# each list receives one cleaned string per transcript, in the order of the paths above
WHOLE_rolling_frame_of_four_transcripts = []
PRERELEASE_rolling_frame_of_four_transcripts = []
MANAGEMENT_SENTENCES_rolling_frame_of_four_transcripts = []
ANALYST_SENTENCES_rolling_frame_of_four_transcripts = []


for path in rolling_path_of_four_transcript:
    # load the transcript and split it at the start of the Q&A session
    wholeTranscript = get_transcript(path)
    transcript_safe_harbour, transcript_questions = split_transcript(wholeTranscript)
    speaker_names = get_file_speaker_names(sector, stock)
    analyst_names = find_analyst_names(speaker_names, transcript_questions)

    # separate what analysts said from what everyone else (except the operator) said
    analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)

    # turn each list of speech bubbles into one string without speaker labels
    wholeTranscript = deepCleanTranscript(wholeTranscript, speaker_names)
    transcript_safe_harbour = deepCleanTranscript(transcript_safe_harbour, speaker_names)
    management_sentences = deepCleanTranscript(management_sentences, speaker_names)
    analyst_sentences = deepCleanTranscript(analyst_sentences, speaker_names)

    WHOLE_rolling_frame_of_four_transcripts.append(wholeTranscript)
    PRERELEASE_rolling_frame_of_four_transcripts.append(transcript_safe_harbour)
    MANAGEMENT_SENTENCES_rolling_frame_of_four_transcripts.append(management_sentences)
    ANALYST_SENTENCES_rolling_frame_of_four_transcripts.append(analyst_sentences)

In [10]:
# Extra words to ignore, on top of ENGLISH_STOP_WORDS, when vectorising transcripts.
custom_stop_words = [
    'thanks', 'thank', 'really', 'said', 'say',
    'yes', 'no', 've', 'll', 'don',
]
all_stop_words = [*ENGLISH_STOP_WORDS, *custom_stop_words]

def tf_idf(transcript):
    """Fit TF-IDF on the given documents and print the non-zero terms of the first one.

    Args:
        transcript: iterable of documents (strings), one per transcript.

    Returns:
        The matrix returned by TfidfVectorizer.fit_transform for the
        documents, limited to 100 features built from 1- to 3-word n-grams
        with the words in all_stop_words removed.
    """
    vectoriser = TfidfVectorizer(
        lowercase=True,
        max_features=100,
        ngram_range=(1, 3),  # 1 to trigram as they are all common in finance (i.e. earnings per share, free cash flow etc.)
        stop_words=all_stop_words,  # removes stop words (i.e. irrelevant day to day words)
    )
    tfidf_vec = vectoriser.fit_transform(transcript)

    # get_feature_names() no longer exists in current scikit-learn releases;
    # fall back to it only where get_feature_names_out() is not available yet.
    try:
        feature_names = vectoriser.get_feature_names_out()
    except AttributeError:
        feature_names = vectoriser.get_feature_names()

    # show "term:weight" for every term that occurs in the first document
    first_document_weights = tfidf_vec[0].toarray()[0]
    for term, weight in zip(feature_names, first_document_weights):
        if weight > 0:
            print(f"{term}:{weight}")

    return tfidf_vec


In [None]:
# Fit TF-IDF on the management-sentence documents of the rolling window and show the resulting matrix.
value = tf_idf(MANAGEMENT_SENTENCES_rolling_frame_of_four_transcripts)
print(value)

In [None]:
# Pairwise dot products of the TF-IDF rows held in `value`; this is the cosine
# similarity as long as the rows are L2-normalised (presumably the case with
# TfidfVectorizer's default settings - confirm).
cos = linear_kernel(value) # finds the cosine similarity matrix
print(cos) # cosine similarity matrix

# row 0 is the first transcript of the window; columns 1-3 are the other three
print(cos[0,1]) #(compares the first transcript with the second transcript)
print(cos[0,2]) #(compares the first transcript with the third transcript)
print(cos[0,3]) #(compares the first transcript with the fourth transcript)

In [11]:
# 74: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 2nd transcript
# 75: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 3rd transcript
# 76: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 4th transcript
# 77: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 2nd transcript
# 78: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 3rd transcript
# 79: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 4th transcript
# 80: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 2nd transcript
# 81: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 3rd transcript
# 82: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 4th transcript
# 83: Rolling frame of Analyst Sentences (Their Questions) - Cosine of TFIDF between 1st and 2nd transcript
# 84: Rolling frame of Analyst Sentences (Their Questions) - Cosine of TFIDF between 1st and 3rd transcript
# 85: Rolling frame of Analyst Sentences (Their Questions) - Cosine of TFIDF between 1st and 4th transcript


def find_cosineSimilarity(thistranscript):
    """Compare the first document of a four-document window with the other three.

    The documents are vectorised with tf_idf and the pairwise similarity matrix
    is computed with linear_kernel (a dot product, which equals the cosine
    similarity when the TF-IDF rows are L2-normalised).

    Returns the tuple (first vs second, first vs third, first vs fourth); at
    least four documents are therefore required.
    """
    similarity = linear_kernel(tf_idf(thistranscript))
    # row 0 is the first transcript; columns 1..3 are the transcripts it is compared with
    return tuple(similarity[0, other] for other in (1, 2, 3))

def tf_idf(transcript):
    """Vectorise the given documents with TF-IDF and return the fitted matrix.

    Stop words (ENGLISH_STOP_WORDS plus a few conversational fillers) are
    removed, text is lower-cased, n-grams of one to three words are used and
    only 100 features are kept.
    """
    filler_words = ['thanks', 'thank', 'really', 'said', 'say', 'yes', 'no', 've', 'll', 'don']
    vectoriser_options = {
        "lowercase": True,
        "max_features": 100,
        # unigrams to trigrams, since finance terms are often multi-word (earnings per share, free cash flow, ...)
        "ngram_range": (1, 3),
        # everyday words that carry no information about the call
        "stop_words": [*ENGLISH_STOP_WORDS, *filler_words],
    }
    return TfidfVectorizer(**vectoriser_options).fit_transform(transcript)
    
def getFeature74to85(sector, stock, rolling_path_of_four_transcript):
    """Return features 74-85: TF-IDF similarity of the first transcript to the next three.

    For every path in rolling_path_of_four_transcript the transcript is loaded,
    split at the start of the Q&A, and reduced to four cleaned strings: the
    whole transcript, the part before the Q&A, the management sentences and
    the analyst sentences. find_cosineSimilarity is then applied to each of the
    four document collections, in that order.

    Returns a list of 12 values: three similarities (1st vs 2nd, 3rd, 4th
    transcript) for each of the four collections.
    """
    collection_order = ("whole", "prerelease", "management", "analyst")
    documents = {kind: [] for kind in collection_order}

    for transcript_path in rolling_path_of_four_transcript:
        whole = get_transcript(transcript_path)
        prerelease, questions = split_transcript(whole)
        names = get_file_speaker_names(sector, stock)
        analysts = find_analyst_names(names, questions)

        analyst_part, management_part = get_analyst_management_sentences(analysts, questions)

        # strip speaker labels and flatten each part to a single string
        cleaned_whole = deepCleanTranscript(whole, names)
        cleaned_prerelease = deepCleanTranscript(prerelease, names)
        cleaned_management = deepCleanTranscript(management_part, names)
        cleaned_analyst = deepCleanTranscript(analyst_part, names)

        documents["whole"].append(cleaned_whole)
        documents["prerelease"].append(cleaned_prerelease)
        documents["management"].append(cleaned_management)
        documents["analyst"].append(cleaned_analyst)

    features = []
    for kind in collection_order:
        first_vs_second, first_vs_third, first_vs_fourth = find_cosineSimilarity(documents[kind])
        features += [first_vs_second, first_vs_third, first_vs_fourth]
    return features



In [None]:
# Compute features 74-85 for the current sector / stock / rolling window and display them.
fea_ext_list74to85 = getFeature74to85(sector, stock, rolling_path_of_four_transcript)
fea_ext_list74to85

In [None]:
# List this stock's transcript files in descending name order (presumably newest
# first, given the year/quarter file names) and slide a window of four
# consecutive files over them.
sector_files = glob.glob('sectors/'+str(sector)+'/'+str(stock)+'/'+str(stock)+'20*[0-9]**[0-9]*[1-4].*')
sector_files.sort(reverse=True)

for i in range(0, len(sector_files)):
    if i < len(sector_files)-3:
        rolling_path_of_four_transcript = sector_files[i:i+4]
        print(rolling_path_of_four_transcript)
        # The rolling-window extractor is getFeature74to85; the variable keeps
        # its older "52to63" name so that nothing reading it is affected.
        fea_ext_list52to63 = getFeature74to85(sector, stock, rolling_path_of_four_transcript)
        print(fea_ext_list52to63)
        print("________________________________")
    else:
        # fewer than four files are left from here on, so no full window starts at this file
        print(sector_files[i])

In [None]:
# Display the analyst names found by the most recent run.
analyst_names

In [None]:
def find_analyst_names(speaker_names, transcript_questions):
    """Return the speaker names that the operator introduces as questioners.

    Only speech bubbles containing "operator:" are examined (the last two
    bubbles are never examined). A speaker is taken to be an analyst when one
    part of the name, longer than two characters, appears as a whole word in
    the operator's bubble and also appears in the bubble that follows, unless
    the operator's bubble looks like a wrap-up or hand-over (it contains "end",
    "closing", or both "turn" and "over"). Names containing the word
    "unidentified" are always included once an operator bubble is seen.

    Every match is printed together with the operator bubble. The result has
    no duplicates and no guaranteed order.
    """
    found = []
    for position in range(len(transcript_questions) - 2):
        # lower-case and replace all punctuation except ":" with a space
        bubble = re.sub(r'[^\w\s:]', ' ', transcript_questions[position].lower())
        if "operator:" not in bubble:
            continue

        is_wrap_up = (
            "end" in bubble
            or "closing" in bubble
            or ("turn" in bubble and "over" in bubble)
        )

        for speaker in speaker_names:
            if speaker.lower() == "operator":
                continue

            for part in (piece.lower() for piece in speaker.split()):
                # parts of one or two characters (e.g. the "A" in "A Gayn Erickson") would match almost anywhere
                if (
                    len(part) > 2
                    and " " + part + " " in bubble
                    and not is_wrap_up
                    and part in transcript_questions[position + 1].lower()
                ):
                    print(speaker)
                    print(bubble)
                    found.append(speaker)

            # catches names such as "Unidentified Analyst"
            if "unidentified" in speaker.lower().split():
                found.append(speaker)

    return list(set(found))

In [None]:
def split_transcript(mytranscript):
    """Split the transcript lines where the question-and-answer session starts.

    Lines are scanned in order (the first line can never be the split point).
    The split happens at the first line that, after lower-casing and replacing
    punctuation other than ":" with spaces, satisfies one of these rules:

    * contains "operator:" plus "question", "go ahead" or
      "operator instructions": the line opens the second part;
    * contains "operator" and "question": the line closes the first part;
    * contains "first" plus "operator:" or "question": the line opens the
      second part.

    Returns (lines before the Q&A, lines of the Q&A), or ("", "") when no line
    qualifies.
    """
    for position, raw_line in enumerate(mytranscript):
        line = re.sub(r'[^\w\s:]', ' ', raw_line.lower())
        if position == 0:
            continue

        operator_label = "operator:" in line
        mentions_question = "question" in line
        mentions_first = "first" in line

        if operator_label and (mentions_question or "go ahead" in line or "operator instructions" in line):
            cut = position
        elif "operator" in line and mentions_question:
            cut = position + 1
        elif mentions_first and (operator_label or mentions_question):
            cut = position
        else:
            continue

        return mytranscript[:cut], mytranscript[cut:]

    return "", ""


In [None]:
# Display the analyst names found by the most recent run.
analyst_names

In [None]:
def split_by_colon(string):
    """Rewrite every line that contains a colon as "Name: text".

    For each line, the name is the text between the last "." or "?" that
    precedes the first colon (or the start of the line) and that colon; the
    text is everything after the first colon. Both are stripped of
    surrounding whitespace. Lines without a colon are dropped, as is anything
    on a line that precedes the name.

    Args:
        string: transcript text with one speech bubble per line.

    Returns:
        The rewritten lines joined with "\\n" ("" when no line has a colon).
    """
    result = []
    for line in string.split("\n"):
        colon_pos = line.find(":")
        if colon_pos == -1:
            continue
        # the name starts right after the previous sentence end; rfind gives -1
        # when there is none, which makes the name start at index 0
        sentence_end = max(line.rfind(".", 0, colon_pos), line.rfind("?", 0, colon_pos))
        # slice the line itself: positions taken from the whole input are
        # shifted by every preceding line
        name = line[sentence_end + 1:colon_pos].strip()
        result.append(name + ": " + line[colon_pos + 1:].strip())
    return "\n".join(result)

# Manual check of split_by_colon on a line that contains two "Name:" labels.
input_string = """
Operator: Good day, ladies and gentlemen. My name is Sedaris, a Lynn? Lynn Antipas Tyson: Thank you, Sedaris.
"""
output_string = split_by_colon(input_string)
print(output_string)

In [None]:
# TESTING DATA FOR FORD: the entire transcript is one string rather than being separated by \n

def split_by_colon(string):
    """Return one "Name: rest of transcript" entry per colon found in the string.

    The string is consumed colon by colon. For each colon, the name is the
    text between the last "." or "?" before it (within what is left of the
    string) and the colon; the entry pairs that name with everything after the
    colon, so each entry still contains all later speakers' text.
    """
    result = []
    remaining = string
    while ":" in remaining:
        before, _, after = remaining.partition(":")
        # rfind yields -1 when no sentence end precedes the colon, so "+ 1" gives index 0
        label_start = max(before.rfind("."), before.rfind("?")) + 1
        speaker = before[label_start:].strip()
        result.append(speaker + ": " + after.strip())
        remaining = after
    return result

sector = "automobiles"
stock = "F"

path = 'sectors/'+str(sector)+'/'+str(stock)+'/'+str(stock)+'20193.csv'
# the transcript text: first column of the third data row as read by pandas
input_string = pd.read_csv(path).iloc[[2]].values[0][0]

output_list = split_by_colon(input_string)

# Every entry of output_list runs from one speaker label to the end of the
# transcript, so cut each entry at the point where the next entry begins.
result = []
for currentstring, nextstring in zip(output_list, output_list[1:]):
    index = currentstring.find(nextstring)
    if index != -1:
        currentstring = currentstring[:index]
    result.append(currentstring)

# Only rewrite the file when at least one speaker label was found; otherwise
# the transcript row would be replaced by an empty string.
if output_list:
    result.append(output_list[-1])  # the last speaker's entry has nothing to cut
    mystr = "\n".join(result)

    # newline='' lets the csv module handle line breaks inside quoted fields itself
    with open(path, "r", newline='') as f:
        data = list(csv.reader(f))

    # row index 3 of the raw file; presumably the same row pandas exposes as
    # data row 2 above, since pandas treats the first line as the header
    data[3] = [mystr]

    with open(path, "w", newline='') as f:
        csv.writer(f).writerows(data)

In [None]:
def get_analyst_management_sentences(analyst_names, transcript_questions):
    """Split the Q&A speech bubbles into analyst bubbles and management bubbles.

    The speaker of a bubble is the text before its first colon. A bubble is an
    analyst bubble when its speaker is in analyst_names, or when the speaker
    contains the first or last word of a known analyst name and the preceding
    bubble is the operator's (e.g. "JeffreySinderen" following the operator's
    introduction of "Jeffrey Van Sinderen"). In the latter case the speaker is
    also appended to analyst_names, so the caller's list is modified in place.
    Operator bubbles belong to neither group; every remaining bubble is a
    management bubble. Nothing is classified as analyst speech when
    analyst_names is empty.

    Returns (analyst_sentences, management_sentences), both in transcript order.
    """
    operator_labels = ("operator", "operator ")
    analyst_sentences = []

    for position, bubble in enumerate(transcript_questions):
        speaker = bubble[:bubble.find(":")]

        for known_name in analyst_names:
            if bubble in analyst_sentences:
                break

            if speaker in analyst_names:
                analyst_sentences.append(bubble)
                break

            if speaker.lower() in operator_labels:
                break

            name_parts = known_name.split()
            previous_bubble = transcript_questions[position - 1].lower()
            follows_operator = "operator:" in previous_bubble or "operator :" in previous_bubble
            if (name_parts[0] in speaker or name_parts[-1] in speaker) and follows_operator:
                # the transcript spells this analyst's name differently; remember the variant
                analyst_names.append(speaker)
                analyst_sentences.append(bubble)
                break

    # everything that is neither the operator's nor an analyst's is management speech
    management_sentences = [
        bubble
        for bubble in transcript_questions
        if bubble[:bubble.find(":")].lower() not in operator_labels
        and bubble not in analyst_sentences
    ]
    return analyst_sentences, management_sentences

In [None]:
# Display the Q&A part of the most recently split transcript.
transcript_questions

In [None]:
# imported here so the cell runs on its own; sys is not among the notebook's visible imports
import sys

sector = "automobiles"
stock = "NIU"
# raise the csv field size limit so that very long transcript fields can be read
csv.field_size_limit(sys.maxsize)


path = 'sectors/'+str(sector)+'/'+str(stock)+'/'+str(stock)+'20222.csv'
# number of rows currently stored in the transcript file
with open(path, 'r') as file:
    reader = csv.reader(file)
    lengthofList = len(list(reader))
wholeTranscript = get_transcript(path)
transcript_safe_harbour, transcript_questions = split_transcript(wholeTranscript)
speaker_names = get_file_speaker_names(sector, stock)
analyst_names = find_analyst_names(speaker_names, transcript_questions)

analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)
# display the analyst names, including any spelling variants added by get_analyst_management_sentences
analyst_names
# wholeTranscript = deepCleanTranscript(wholeTranscript, speaker_names)
# transcript_safe_harbour = deepCleanTranscript(transcript_safe_harbour, speaker_names)
# management_sentences = deepCleanTranscript(management_sentences, speaker_names)
# analyst_sentences = deepCleanTranscript(analyst_sentences, speaker_names)


In [None]:
# def find_analyst_names(speaker_names, transcript_questions):
#     i = 0
#     analyst_names = []
#     analyst_names.append("A Martin Viecha")
#     analyst_names.append("Martin Viecha")
#     # the programme recognises the question is being asked by an analyst when the following conditions are met:
#     for index in range(0, len(transcript_questions)-2):
#         speech_bubble = transcript_questions[index].lower()
#         speech_bubble = re.sub(r'[^\w\s:]', ' ', speech_bubble) # regex: replaces all punctuations (except for ":") with 1 open space 
#         if ("martin viecha:" in speech_bubble) or ("martin viecha :" in speech_bubble) or ("operator:" in speech_bubble) or ("operator :" in speech_bubble):
#             i+=1
#             if i > 3:
#                 for name in speaker_names:
#                     namelist = name.split()
#                     if (name.lower() != "operator") and ("elon" not in name.lower()) and ("representative" not in name.lower()) and ("corporate" not in name.lower()) and ("company" not in name.lower()):
#                         for name_2 in namelist: # cycle through each name in the name_list
#                             name_2 = name_2.lower()
#                             # checks if the speaker name happens to be in the speech_bubble, if it is, then the person speaking is an analyst
#                             # also len(name) > 2 is used to avoid the problem with single letters being registered as in the speech_bubble 
#                             # (e.g. the letter "A" in the name "A Gayn Erickson" will be in the speech_bubble, but Gayn Erickson is not an analyst, so "A" is not counted)
#                             if (((" "+name_2+" " in speech_bubble) and len(name_2) > 2) and (("end" not in speech_bubble) and ("closing" not in speech_bubble) and (("turn" not in speech_bubble) or (("over" not in speech_bubble))))) and (name_2 in transcript_questions[index+1].lower()):
#                                 analyst_names.append(name)
#                         if "unidentified" in name.lower().split(): # finds name such as "Unidentified Analyst"
#                             analyst_names.append(name) 
                        
#     analyst_names = list(set(analyst_names)) # replaces duplicates        
#     return analyst_names

In [None]:
# Main extraction run for one stock: compute every feature list for each
# transcript file and append the values to that file, one value per row.
sector_files = glob.glob('sectors/'+str(sector)+'/'+str(stock)+'/'+str(stock)+'20*[0-9]**[0-9]*[1-4].*')
sector_files.sort(reverse=True)

# NOTE(review): the range stops four files before the end of the sorted list,
# so the last four files are never processed and the `else` branch below
# (i >= len(sector_files)-3) cannot be reached - confirm this is intended.
for i in range (0, len(sector_files)-4): # all of this stock's files except the last four in sorted order
    path = sector_files[i]
    print(path)
    # count the rows already stored in the file
    with open(path, 'r') as file:
        reader = csv.reader(file)
        lengthofList = len(list(reader))
    # only files that hold exactly five rows are processed; files with any other row count are skipped
    if lengthofList == 5:
        mytranscript = get_transcript(path)
        transcript_safe_harbour, transcript_questions = split_transcript(mytranscript)
        speaker_names = get_file_speaker_names(sector, stock)
        analyst_names = find_analyst_names(speaker_names, transcript_questions)
        analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)
        fea_ext_list5to22 = getFeature5to22(transcript_safe_harbour, speaker_names)

        fea_ext_list23to43 = getFeature23to43(analyst_sentences, management_sentences, speaker_names)
        fea_ext_list44to73 = getFeature44to73(mytranscript, speaker_names)

        # rolling-window features need this file plus the three that follow it in sorted order
        if i < len(sector_files)-3:
            rolling_path_of_four_transcript = sector_files[i:i+4]
            fea_ext_list74to85 = getFeature74to85(sector, stock, rolling_path_of_four_transcript)
        else:
            fea_ext_list74to85 = [None]*12

        stock_return_list = get_stock_returns(path, stock)

        list_add_to_csv = fea_ext_list5to22 + fea_ext_list23to43 + fea_ext_list44to73+ fea_ext_list74to85 + stock_return_list

        # append every feature value to the transcript file as its own single-cell row
        with open(path, 'a', newline='') as file:
            writer = csv.writer(file)
            for item in list_add_to_csv:
                writer.writerow([item])

In [12]:
# 86: Stock price difference between Day 0 (Day earnings call is released) and Day 10
# 87: Stock price difference between Day 0 (Day earnings call is released) and Day 30
# 88: Stock price difference between Day 0 (Day earnings call is released) and Day 50
# 89: Stock price difference between Day 0 (Day earnings call is released) and Day 70
# 90: Stock price difference between Day 0 (Day earnings call is released) and Day 90
#
# 91: Stock price difference between Day 1 (the day after the earnings call is released) and Day 10
# 92: Stock price difference between Day 1 (the day after the earnings call is released) and Day 30
# 93: Stock price difference between Day 1 (the day after the earnings call is released) and Day 50
# 94: Stock price difference between Day 1 (the day after the earnings call is released) and Day 70
# 95: Stock price difference between Day 1 (the day after the earnings call is released) and Day 90

from datetime import timedelta

def find_percentage_change(hist):
    """Return the relative change in opening price across a price history.

    Args:
        hist: Price table with an ``'Open'`` column, such as the DataFrame
            returned by ``Ticker.history`` in ``get_stock_returns``.

    Returns:
        ``last_open / first_open - 1`` (e.g. ``0.2`` for a 20 % rise), or
        ``None`` when it cannot be computed: ``hist`` is ``None``, has no
        ``'Open'`` column, or has no rows.
    """
    try:
        opens = hist['Open']
        # Explicit positional access: plain ``opens[-1]`` on a date-indexed
        # Series relies on pandas' deprecated integer-as-position fallback.
        return opens.iloc[-1] / opens.iloc[0] - 1
    except (KeyError, IndexError, TypeError, AttributeError, ZeroDivisionError):
        # Missing column, empty history, or an unusable ``hist`` object.
        return None

def get_stock_returns(path, stock):
    """Compute ten opening-price returns around a transcript's release date.

    The release date is read from the transcript CSV at ``path`` (data row 1
    as parsed by ``pd.read_csv``, first column) in ``'%Y-%m-%d %H:%M:%S'``
    format. Price history for ``stock`` is then requested from yfinance for
    ten date windows:

    * windows 1-5 start on the release date (Day 0);
    * windows 6-10 start one calendar day later (Day 1);
    * within each group the ``end`` argument is the release date plus
      10, 30, 50, 70 and 90 calendar days.

    Args:
        path: Path to the transcript CSV file.
        stock: Ticker symbol passed to ``yf.Ticker``.

    Returns:
        A list of ten values, one per window in the order above, each the
        result of ``find_percentage_change`` (a relative change, or ``None``).
    """
    raw_date = pd.read_csv(path).iloc[[1]].values[0][0]
    release_day = datetime.strptime(raw_date, '%Y-%m-%d %H:%M:%S').date()

    fetch_history = yf.Ticker(stock).history

    horizons = (10, 30, 50, 70, 90)
    # (start, end) pairs: all Day-0 windows first, then all Day-1 windows.
    windows = [
        (release_day + timedelta(days=offset), release_day + timedelta(days=horizon))
        for offset in (0, 1)
        for horizon in horizons
    ]

    # Download every window before computing any return.
    histories = [fetch_history(start=begin, end=finish) for begin, finish in windows]

    return [find_percentage_change(frame) for frame in histories]



In [14]:
# Sector folder names; each is looked up as a sub-directory of "sectors/".
sectorlist = [
    "banks",
    "consumer-retailing",
    "tech",
    "capital-goods",
    "commercial-services",
    "consumer-durables",
    "consumer-services",
    "diversified-financials",
    "energy",
    "food-beverage-tobacco",
    "healthcare",
    "household",
    "insurance",
    "materials",
    "media",
    "pharmaceuticals-biotech",
    "real-estate",
    "retail",
    "semiconductors",
    "software",
    "telecom",
    "transportation",
    "utilities",
]

In [24]:
# Driver: for every stock folder of a sector, extract the transcript features,
# stock returns and market cap, and append them (one value per row) to each
# transcript CSV that has not been processed yet.

import sys  # needed for sys.maxsize; not among the imports at the top of the file

# Raise the csv module's per-field size limit
# (presumably because a whole transcript is stored in a single field).
csv.field_size_limit(sys.maxsize)

for sector in sectorlist:
    print(sector, "sector start ................................................................")

    # Every entry under sectors/<sector>/ is treated as a stock ticker,
    # except the '.DS_Store' file.
    filelist = [entry for entry in os.listdir("sectors/"+sector) if entry != '.DS_Store']

    for stock in filelist:
        print(stock)
        sector_files = glob.glob('sectors/'+str(sector)+'/'+str(stock)+'/'+str(stock)+'20*[0-9]**[0-9]*[1-4].*')
        # Reverse lexicographic order; presumably newest transcript first,
        # given the year/quarter in the file name — TODO confirm.
        sector_files.sort(reverse=True)

        # NOTE(review): the reason these tickers are excluded is not visible here.
        if stock in ["UWMC", "FBC", "TBK", "BKU"]:
            continue

        ticker = yf.Ticker(stock)
        market_cap = ticker.fast_info['market_cap']

        for i, path in enumerate(sector_files):  # every .csv path of that stock
            with open(path, 'r') as file:
                row_count = sum(1 for _ in csv.reader(file))

            # Only files holding exactly 5 rows are processed. Appending
            # below pushes the count past 5, so a re-run skips them.
            if row_count != 5:
                continue

            mytranscript = get_transcript(path)
            transcript_safe_harbour, transcript_questions = split_transcript(mytranscript)
            speaker_names = get_file_speaker_names(sector, stock)
            analyst_names = find_analyst_names(speaker_names, transcript_questions)
            analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)

            fea_ext_list5to22 = getFeature5to22(transcript_safe_harbour, speaker_names)
            fea_ext_list23to43 = getFeature23to43(analyst_sentences, management_sentences, speaker_names)
            fea_ext_list44to73 = getFeature44to73(mytranscript, speaker_names)

            # Rolling features need this transcript plus the next three in
            # the sorted list; otherwise fill the 12 slots with None.
            if i < len(sector_files)-3:
                rolling_path_of_four_transcript = sector_files[i:i+4]
                fea_ext_list74to85 = getFeature74to85(sector, stock, rolling_path_of_four_transcript)
            else:
                fea_ext_list74to85 = [None]*12

            stock_return_list = get_stock_returns(path, stock)

            list_add_to_csv = fea_ext_list5to22 + fea_ext_list23to43 + fea_ext_list44to73+ fea_ext_list74to85 + stock_return_list + [market_cap]

            # Append each value as its own single-column row.
            with open(path, 'a', newline='') as file:
                csv.writer(file).writerows([item] for item in list_add_to_csv)

    # Stops after the first sector in sectorlist.
    # NOTE(review): looks like a debugging leftover — remove to process every sector.
    break




banks sector start ................................................................
PEBO
TFC
UMPQ
CBU
EWBC
NBHC
HOMB
FITB
EBC
HTLF
PB
PFC
SSB


In [None]:
# List of .csv file info for each stock:
# ------------------------------------------------------------------------------------------------
# META DATA
# ------------------------------------------------------------------------------------------------
#
# 0. Year of transcript release
# 1. Quarter of transcript release
# 2. Date of transcript release
# 3. Earnings Transcript contents
#
# ------------------------------------------------------------------------------------------------
# Feature Extractions:
# ------------------------------------------------------------------------------------------------
#
# 4. EPS surprise value
#
# ------------------------------------------------------------------------------------------------
# Transcript Features:
# ------------------------------------------------------------------------------------------------
#
# Pre release:
# 5. Whole pre-release - net sentiment
# 6. Whole pre-release - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 7. Whole pre-release - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 8. Whole pre-release - net word complexity
#
#
# 9. Specific forward-looking statement - sentiment
# 10. Specific forward-looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 11. Specific forward-looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 12. Specific forward-looking statement - word complexity
#
# 13. Non Specific Forward looking statement - sentiment 
# 14. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 15. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 16. Non Specific Forward looking statement - word complexity
#
# 17. Not forward-looking statement - sentiment
# 18. Not forward-looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 19. Not forward-looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 20. Not forward-looking statement - word complexity
#
# 21: #of_specific/#of_non_specific+#of_not_fls+#of_specific
# 22: #of_non_specific/#of_non_specific+#of_not_fls+#of_specific
#
# ------------------------------------------------------------------------------------------------
#
# Questions & Answers:
# 23. Whole Q&A - net sentiment
# 24. Whole Q&A – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 25. Whole Q&A – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 26. Whole Q&A - net word complexity
# 
# 27. all question (aggregate) - sentiment
# 28. all question (aggregate) – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 29. all question (aggregate) – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 30. all question (aggregate) - word complex
#
# 31. all reply (aggregate) - sentiment
# 32. all reply (aggregate) – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 33. all reply (aggregate) – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 34. all reply (aggregate) - word complex
#
# For all replies (aggregate):
# 35. Specific forward-looking statement - sentiment
# 36. Specific forward-looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 37. Specific forward-looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 38. Specific forward-looking statement - word complexity
#
# 39. Non Specific Forward looking statement - sentiment 
# 40. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 41. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 42. Non Specific Forward looking statement - word complexity
#
# 43. Not forward-looking statement - sentiment
# 44. Not forward-looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 45. Not forward-looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 46. Not forward-looking statement - word complexity
# NOTE(review): numbers 44-46 are reused by the "margin" features listed below — confirm the intended numbering.
#
# ------------------------------------------------------------------------------------------------
#
# With specific words analysis:
# Sentences that includes the word:
# all of these words can be plural (e.g. cost and costs)
# 44: "margin" - average sentiment
# 45: "margin" - pos/total sentiment
# 46: "margin" - neg/total sentiment

# 47: "cost" - average sentiment
# 48: "cost" - pos/total sentiment
# 49: "cost" - neg/total sentiment

# 50: "revenue" - average sentiment
# 51: "revenue" - pos/total sentiment
# 52: "revenue" - neg/total sentiment

# 53: "earnings or EBIDTA" - average sentiment
# 54: "earnings or EBIDTA" - pos/total sentiment
# 55: "earnings or EBIDTA" - neg/total sentiment

# 56: "growth" - average sentiment
# 57: "growth" - pos/total sentiment
# 58: "growth" - neg/total sentiment

# 59: "leverage or debt" -  average sentiment
# 60: "leverage or debt" -  pos/total sentiment
# 61: "leverage or debt" -  neg/total sentiment

# 62: "industry or sector" – average sentiment
# 63: "industry or sector" – pos/total sentiment
# 64: "industry or sector" – neg/total sentiment

# 65: "operation" - average sentiment 
# 66: "operation" - pos/total sentiment
# 67: "operation" - neg/total sentiment

# 68: "cashflow" - average sentiment 
# 69: "cashflow" - pos/total sentiment
# 70: "cashflow" - neg/total sentiment

# 71: "dividend/share buyback" - average sentiment 
# 72: "dividend/share buyback" - pos/total sentiment
# 73: "dividend/share buyback" - neg/total sentiment

# ------------------------------------------------------------------------------------------------
#
# 74: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 2nd transcript
# 75: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 3rd transcript
# 76: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 4th transcript
# 77: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 2nd transcript
# 78: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 3rd transcript
# 79: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 4th transcript
# 80: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 2nd transcript
# 81: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 3rd transcript
# 82: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 4th transcript
# 83: Rolling frame of Analyst Sentences (Their Questions) - Cosine of TFIDF between 1st and 2nd transcript
# 84: Rolling frame of Analyst Sentences (Their Questions) - Cosine of TFIDF between 1st and 3rd transcript
# 85: Rolling frame of Analyst Sentences (Their Questions) - Cosine of TFIDF between 1st and 4th transcript
#
# ------------------------------------------------------------------------------------------------
#
# 86: Stock price difference between Day 0 (Day earnings call is released) and Day 10
# 87: Stock price difference between Day 0 (Day earnings call is released) and Day 30
# 88: Stock price difference between Day 0 (Day earnings call is released) and Day 50
# 89: Stock price difference between Day 0 (Day earnings call is released) and Day 70
# 90: Stock price difference between Day 0 (Day earnings call is released) and Day 90
#
# 91: Stock price difference between Day 1 (day after the earnings call is released) and Day 10
# 92: Stock price difference between Day 1 (day after the earnings call is released) and Day 30
# 93: Stock price difference between Day 1 (day after the earnings call is released) and Day 50
# 94: Stock price difference between Day 1 (day after the earnings call is released) and Day 70
# 95: Stock price difference between Day 1 (day after the earnings call is released) and Day 90

# ------------------------------------------------------------------------------------------------
# 96: Market Cap

In [None]:
# TODO: delete operator name

In [None]:
# https://www.spglobal.com/marketintelligence/en/news-insights/blog/analyzing-sentiment-in-quarterly-earnings-calls-q2-2022


# https://www.amenityanalytics.com/case-studies/earnings-call-transcript-analysis




#TF-IDF ----> from management sentences (Replies + pre-release)