In [23]:
import csv
import glob
import json
import os
import time
from collections import deque
from datetime import datetime

import numpy as np
import pandas as pd
import regex as re
import requests
import yfinance as yf
from bs4 import BeautifulSoup


In [24]:
def get_transcript(path):
    """Read one transcript from a CSV file and return it as a list of cleaned lines.

    The transcript text is taken from the first column of the data row at
    positional index 2 of the CSV at ``path``.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        list[str]: The cleaned transcript split on line breaks.
    """
    raw_text = pd.read_csv(path).iloc[2, 0]
    # Keep only letters, digits, ". , : ! '", newlines and spaces.
    cleaned = re.sub(r'[^A-Za-z0-9.,:!\'\n ]', '', raw_text)
    # Collapse runs of non-newline whitespace into one space. The pattern is a
    # raw string so that "\S" is not treated as an (invalid) string escape.
    cleaned = re.sub(r'[^\S\n]+', ' ', cleaned)
    return cleaned.splitlines()

def split_transcript(mytranscript):
    """Cut a transcript at the operator line that opens the question section.

    The cut point is the first line after index 2 that contains "operator:"
    together with "question", "go ahead" or "operator instructions"
    (case-insensitive, with punctuation other than ":" replaced by spaces
    before matching).

    Args:
        mytranscript: Transcript lines, e.g. as returned by ``get_transcript``.

    Returns:
        tuple: ``(lines before the cut, lines from the cut onwards)``. When no
        line qualifies, both elements are the empty string ``""``.
    """
    question_cues = ("question", "go ahead", "operator instructions")
    for position, line in enumerate(mytranscript):
        # Replace every punctuation mark except ":" with a space so the
        # substring checks below are not broken by brackets, commas, etc.
        normalised = re.sub(r'[^\w\s:]', ' ', line.lower())
        if position <= 2:
            continue
        if "operator:" not in normalised:
            continue
        if any(cue in normalised for cue in question_cues):
            return mytranscript[:position], mytranscript[position:]
    return "", ""

def get_file_speaker_names(sector, stock):
    """Load the speaker names stored for one stock, one name per line.

    Reads ``sectors/<sector>/<stock>/speaker names.csv``. The file is read
    line by line instead of through ``np.loadtxt(delimiter='\\n')``: newer
    NumPy releases reject a newline delimiter, and ``loadtxt`` returns a 0-d
    array (not iterable) when the file has a single line.

    Args:
        sector: Sector folder name.
        stock: Stock folder name.

    Returns:
        numpy.ndarray: 1-D array of ``str`` with surrounding whitespace
        removed and blank lines skipped (empty if the file has no names).
    """
    names_path = os.path.join("sectors", sector, stock, "speaker names.csv")
    with open(names_path, encoding="utf-8") as names_file:
        stripped_lines = [line.strip() for line in names_file]
    return np.array([name for name in stripped_lines if name], dtype=str)
    
# finds a list of analyst names for a single .csv file
def find_analyst_names(speaker_names, transcript_questions):
    """Pick out the speakers that the operator introduces in the Q&A lines.

    Every line except the last two is scanned. For each line containing
    "operator:", a speaker counts as an analyst when any part of their name
    longer than two characters occurs in that line, or when their name
    contains the word "unidentified" (e.g. "Unidentified Analyst"). The
    speaker named "operator" is never returned.

    Matching is a case-insensitive substring test, so short name parts can
    match inside longer words.

    Args:
        speaker_names: Iterable of speaker name strings.
        transcript_questions: Lines of the Q&A section.

    Returns:
        list: Unique analyst names in the order they were first detected
        (deterministic, unlike ``list(set(...))``).
    """
    analyst_names = []
    for raw_line in transcript_questions[:-2]:
        # Replace all punctuation except ":" with a space before matching.
        line = re.sub(r'[^\w\s:]', ' ', raw_line.lower())
        if "operator:" not in line:
            continue
        for name in speaker_names:
            lowered_name = name.lower()
            if lowered_name == "operator":
                continue
            name_parts = lowered_name.split()
            # Parts of one or two characters are ignored: an initial such as
            # the "A" in "A Gayn Erickson" would match almost any line.
            mentioned = any(len(part) > 2 and part in line for part in name_parts)
            if mentioned or "unidentified" in name_parts:
                analyst_names.append(name)

    # dict.fromkeys removes duplicates while keeping first-seen order.
    return list(dict.fromkeys(analyst_names))

def get_analyst_management_sentences(analyst_names, transcript_questions):
    """Sort the Q&A lines into analyst lines and management lines.

    Every line except the last two is examined. The speaker is the text
    before the first ":". Lines spoken by a name in ``analyst_names`` go to
    the analyst list, lines spoken by "operator" (any casing) are dropped,
    and everything else goes to the management list.

    A line without a ":" has no speaker and is always treated as a
    management line. (Slicing with the -1 that ``str.find`` returns would
    instead cut off the last character and could mistake the line for an
    analyst or operator line.)

    Args:
        analyst_names: Collection of analyst name strings.
        transcript_questions: Lines of the Q&A section.

    Returns:
        tuple[list, list]: ``(analyst_sentences, management_sentences)``,
        each holding the full original lines.
    """
    analyst_lookup = set(analyst_names)
    analyst_sentences = []
    management_sentences = []

    for speech_bubble in transcript_questions[:-2]:
        speaker_name, colon, _ = speech_bubble.partition(":")
        has_speaker = bool(colon)

        if has_speaker and speaker_name in analyst_lookup:
            analyst_sentences.append(speech_bubble)
        elif has_speaker and speaker_name.lower() == "operator":
            continue
        else:
            management_sentences.append(speech_bubble)

    return analyst_sentences, management_sentences


In [25]:
class TrieNode:
    def __init__(self):
        self.children = {}  # create an empty dictionary to store the children of the node
        self.is_word = False  # flag to indicate if the node represents the end of a word
        self.fail = None  # pointer to the fail node
        self.word = None  # word stored in the node

class AhoCorasick:
    def __init__(self, words):
        self.root = TrieNode()  # initialize the root of the Trie
        self.build_trie(words)  # build the Trie from the list of words
        self.build_ac_automata()  # build the AC automata from the Trie

    def build_trie(self, words):
        for word in words:
            node = self.root  # start at the root of the Trie
            for char in word:
                if char not in node.children:  # if the character is not in the children dictionary, create a new TrieNode
                    node.children[char] = TrieNode()
                node = node.children[char]  # move to the child node
            node.is_word = True  # set the is_word flag to indicate the end of a word
            node.word = word  # store the word in the node

    def build_ac_automata(self):
        queue = []  # initialize a queue to keep track of the nodes to visit

        for node in self.root.children.values():  # start with the children of the root node
            queue.append(node)  # add the child node to the queue
            node.fail = self.root  # set the fail pointer to the root

        while len(queue) > 0:
            node = queue.pop(0)  # get the next node to visit from the queue
            for char, child in node.children.items():  # iterate through the children of the current node
                queue.append(child)  # add the child to the queue
                fail_node = node.fail  # start at the fail node of the current node
                while fail_node is not None and char not in fail_node.children:  # follow the fail pointers until a match is found or the root is reached
                    fail_node = fail_node.fail
                if fail_node is None:
                    child.fail = self.root
                else:
                    child.fail = fail_node.children[char]
                child.is_word |= child.fail.is_word

    def search(self, text):
        node = self.root
        new_text = text
        for i, char in enumerate(text):
            while node is not None and char not in node.children:
                node = node.fail
            if node is None:
                node = self.root
                continue
            node = node.children[char]
            if node.is_word:
                new_text = new_text.replace(node.word, '')
        return new_text

# Usage: remove every keyword in list1 from the sentence, then tidy the spacing.
list1 = ['string1', 'string2', 'string3']
sentence = "string1 is string2 is string is string3 is string"
ac = AhoCorasick(list1)
new_sentence = ac.search(sentence) # deletes a particular string from new_sentence if that string is present in list1
# Raw string: "\S" is a regex class, not a valid Python string escape.
new_sentence = re.sub(r'[^\S\n]+', ' ', new_sentence)
new_sentence

' is is string is is string'

In [6]:
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
import numpy as np
import textstat


In [7]:
# Load the 'yiyanghkust/finbert-tone' checkpoint as a 3-label sequence classifier, plus its tokenizer.
sentiment_finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
sentiment_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

In [8]:
# Wrap the tone model and tokenizer in a text-classification pipeline; used below as sentiment_nlp(text).
sentiment_nlp = pipeline("text-classification", model=sentiment_finbert, tokenizer=sentiment_tokenizer)

In [9]:
# Smoke test: classify one sample paragraph with the sentiment pipeline and display the raw result.
mystr = "I mean, I can comment a little bit about it. I mean, the corridor that we did very well in with Cuba and there is a I don't know how else to explain it, but there's a black market currency and a regular currency. And people are basically choosing to do business in cash in Cuba because they can buy way more on the black market versus paying for things here, where we have to obviously not do that and that's really the situation. And it's and again, it's not just for us, it's for all of our competitors as well. They are all seeing the same deterioration."
result = sentiment_nlp(mystr)
result

[{'label': 'Neutral', 'score': 0.9691150188446045}]

In [10]:
def map_sentiments(sentiment_result):
    """Turn one pipeline prediction into a signed score and a lower-case label.

    Args:
        sentiment_result: Sequence whose first element is a dict with the
            keys ``'label'`` and ``'score'``; only that first element is used.

    Returns:
        tuple: ``(-score, "negative")``, ``(0, "neutral")`` or
        ``(score, "positive")``.

    Raises:
        ValueError: If the label is not Negative, Neutral or Positive
            (compared case-insensitively). Previously such labels fell
            through and the function silently returned ``None``.
    """
    prediction = sentiment_result[0]
    label = str(prediction['label']).lower()

    if label == 'negative':
        return -1 * prediction['score'], "negative"
    if label == 'neutral':
        return 0, "neutral"
    if label == 'positive':
        return prediction['score'], "positive"

    raise ValueError(f"Unexpected sentiment label: {prediction['label']!r}")

In [11]:
import nltk
nltk.download('punkt')

def split_paragraph_into_sentences(temp):
    """Split the text ``temp`` into sentences with ``nltk.sent_tokenize`` and return them."""
    return nltk.sent_tokenize(temp)

[nltk_data] Downloading package punkt to /Users/victor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
import statistics


In [13]:
# FLS classification
# Load the 'yiyanghkust/finbert-fls' checkpoint as a 3-label classifier with its tokenizer and
# wrap both in a text-classification pipeline. The code below compares its labels against
# 'Specific FLS', 'Non-specific FLS' and 'Not FLS'.
fls_finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)
fls_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
fls_nlp = pipeline("text-classification", model=fls_finbert, tokenizer=fls_tokenizer)

In [14]:
def get_NLP_values(liststr):
    """Compute sentiment and two readability scores for a piece of text.

    Args:
        liststr: The text to analyse; it is passed unchanged to the sentiment
            pipeline and to both textstat functions.

    Returns:
        tuple: ``(sentiment, flesch_reading_ease, gunning_fog)``. For empty
        input the result is ``(0, 0, 0)``.
    """
    # Nothing to analyse: report zeros for all three values.
    if len(liststr) == 0:
        return 0, 0, 0

    # NOTE(review): map_sentiments returns a (score, label) pair for known labels,
    # so the first element here is a tuple while the empty case above returns a
    # plain 0 - confirm which shape callers expect.
    sentiment_score = map_sentiments(sentiment_nlp(liststr))

    # Word complexity.
    readability = textstat.flesch_reading_ease(liststr)
    fog_index = textstat.gunning_fog(liststr)

    return sentiment_score, readability, fog_index

In [40]:
# Pre release:
# 5. Whole pre-release - net sentiment
# 6. Whole pre-release - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 7. Whole pre-release - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 8. Whole pre-release - net word complexity
#
# 9. Specific forward looking statement - sentiment
# 10. Specific forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 11. Specific forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 12. Specific forward looking statement - word complexity
#
# 13. Non Specific Forward looking statement - sentiment 
# 14. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 15. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 16. Non Specific Forward looking statement - word complexity
#
# 17. Not Forward looking statement - sentiment
# 18. Not Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 19. Not Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 20. Not Forward looking statement - word complexity
#
# 21: #of_specific/#of_non_specific+#of_not_fls+#of_specific
# 22: #of_non_specific/#of_non_specific+#of_not_fls+#of_specific

def _fls_features_or_default(flslist):
    """Return ``get_fls_features(flslist)``, or neutral defaults if that call raises."""
    try:
        return get_fls_features(flslist)
    except Exception:
        # Best effort: one failing group must not invalidate the other features.
        return 0, 0, 0, 0, [], 0, 0, 0


def getFeature5to22(pre_release, speaker_names):
    """Compute features 5-22 for the prepared-remarks part of a transcript.

    Each speech bubble has its "<speaker name>:" prefixes removed, is scored
    for readability, split into sentences and classified by the FLS pipeline.
    Sentiment features are then computed per FLS class and over all classes.

    Args:
        pre_release: Iterable of speech-bubble strings.
        speaker_names: Iterable of speaker names; each gets ":" appended and
            is removed from the bubbles.

    Returns:
        list: 18 values in this order -
        [5] mean sentiment, [6] positive share, [7] negative share over all
        classified sentences, [8] mean Flesch reading ease per bubble;
        [9-12], [13-16], [17-20] mean sentiment, positive share, negative
        share and Flesch reading ease for Specific FLS, Non-specific FLS and
        Not FLS sentences respectively;
        [21] share of Specific FLS sentences, [22] share of Non-specific FLS
        sentences. Any value that cannot be computed is 0.
    """
    ac = AhoCorasick([name + ':' for name in speaker_names])

    flesch_list = []
    s_flslist, ns_flslist, n_flslist = [], [], []
    bucket_for_label = {
        'Specific FLS': s_flslist,
        'Non-specific FLS': ns_flslist,
        'Not FLS': n_flslist,
    }

    for speech_bubble in pre_release:
        try:
            text = re.sub(r'[^\S\n]+', ' ', ac.search(speech_bubble))

            # NOTE(review): as in the original code, a bubble is analysed only
            # when it starts with a space after the speaker prefix is removed
            # (i.e. typically when a known "Name:" prefix was stripped). Other
            # bubbles, including empty ones, are skipped - confirm this gating
            # is intended.
            if not text.startswith(" "):
                continue
            text = text[1:]  # drop the single leading space left by the prefix

            flesch_list.append(textstat.flesch_reading_ease(text))

            sentences = split_paragraph_into_sentences(text)
            for sentence, fls_result in zip(sentences, fls_nlp(sentences)):
                bucket = bucket_for_label.get(fls_result['label'])
                if bucket is not None:
                    bucket.append(sentence)
        except Exception:
            # Best effort: a bubble that cannot be processed is skipped.
            continue

    # Per-class features, in the order Specific, Non-specific, Not FLS.
    groups = [_fls_features_or_default(bucket) for bucket in (s_flslist, ns_flslist, n_flslist)]

    total_sentences = len(s_flslist) + len(ns_flslist) + len(n_flslist)
    feature_extract_21 = len(s_flslist) / total_sentences if total_sentences else 0
    feature_extract_22 = len(ns_flslist) / total_sentences if total_sentences else 0

    # Whole pre-release figures are the per-class results pooled together.
    net_sentiment_list = [score for group in groups for score in group[4]]
    net_positive = sum(group[5] for group in groups)
    net_negative = sum(group[6] for group in groups)
    net_neutral = sum(group[7] for group in groups)
    scored_sentences = net_positive + net_negative + net_neutral

    feature_extract_5 = statistics.mean(net_sentiment_list) if net_sentiment_list else 0
    feature_extract_6 = net_positive / scored_sentences if scored_sentences else 0
    feature_extract_7 = net_negative / scored_sentences if scored_sentences else 0
    feature_extract_8 = statistics.mean(flesch_list) if flesch_list else 0

    fea_ext_list5to22 = [feature_extract_5, feature_extract_6, feature_extract_7, feature_extract_8]
    for group in groups:
        fea_ext_list5to22.extend(group[:4])
    fea_ext_list5to22.extend([feature_extract_21, feature_extract_22])

    return fea_ext_list5to22


def get_fls_features(flslist):
    """Compute sentiment and readability features for a list of sentences.

    Each sentence is scored separately with the sentiment pipeline.

    Args:
        flslist: List of sentence strings.

    Returns:
        tuple: ``(mean sentiment score, positive share, negative share,
        Flesch reading ease of all sentences joined by spaces, list of
        per-sentence scores, positive count, negative count, neutral
        count)``. The first three values are 0 when ``flslist`` is empty.
    """
    fls_sentiment_list = []
    net_positive = 0
    net_negative = 0
    net_neutral = 0

    for each_fls in flslist:
        sentiment_score, positivity_value = map_sentiments(sentiment_nlp(each_fls))
        fls_sentiment_list.append(sentiment_score)

        if positivity_value == "positive":
            net_positive += 1
        elif positivity_value == "negative":
            net_negative += 1
        else:
            net_neutral += 1

    # Explicit empty-input guard instead of swallowing the errors that a mean
    # of nothing or a division by zero would raise.
    scored_sentences = net_positive + net_negative + net_neutral
    if scored_sentences:
        feature_extract_1 = statistics.mean(fls_sentiment_list)
        feature_extract_2 = net_positive / scored_sentences
        feature_extract_3 = net_negative / scored_sentences
    else:
        feature_extract_1 = 0
        feature_extract_2 = 0
        feature_extract_3 = 0

    try:
        feature_extract_4 = textstat.flesch_reading_ease(' '.join(flslist))
    except Exception:
        # Best effort: keep the 0 default if the readability score cannot be computed.
        feature_extract_4 = 0

    return feature_extract_1, feature_extract_2, feature_extract_3, feature_extract_4, fls_sentiment_list, net_positive, net_negative, net_neutral


In [60]:
# Questions & Answers:
# 23. Whole Q&A - net sentiment
# 24. Whole Q&A – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 25. Whole Q&A – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 26. Whole Q&A - net word complexity
# 
# 27. all question (aggregate) - sentiment
# 28. all question (aggregate) – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 29. all question (aggregate) – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 30. all question (aggregate) - word complex
#
# 31. all reply (aggregate) - sentiment
# 32. all reply (aggregate) – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 33. all reply (aggregate) – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 34. all reply (aggregate) - word complex
#
# For all replies (aggregate):
# 35. Specific forward looking statement - sentiment
# 36. Specific forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 37. Specific forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 38. Specific forward looking statement - word complexity
#
# 39. Non Specific Forward looking statement - sentiment 
# 40. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 41. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 42. Non Specific Forward looking statement - word complexity
#
# 43. Not Forward looking statement - sentiment
# 44. Not Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 45. Not Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 46. Not Forward looking statement - word complexity


# 1. get rid of the speaker names from each "speech bubble" (i.e. analyst_speech, management_speech)

# 2. FOR loop - goes through each speech bubble, and parse them to sentences
#   2a. in the For loop - classify the sentences (S-FLS, NS-FLS, N-FLS)
#   2b. Creates x3 2D FLS lists where each 1D value of the list is ["the sentence", "is this sentence a question or a reply?"] (titled: s_FLS_2D, ns_FLS_2D, n_FLS_2D)

# 3. Goes through each FLS_2D list and find sentiment of each with the following conditions:
# 3a. if sentence is s_FLS
    # 3ai. if sentence is a "question":
        #- Sentiment value is appended to "question_sentiment_list"
        #- Word Complexity is appended to "question_complex_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #
        #- Sentiment value is appended to "s_fls_sentiment_list"
        #- Word Complexity is appended to "s_fls_complex_list"

    # 3ai. if sentence is a "reply":
        #- sentiment value is appended to "reply_sentiment_list"
        #- Word Complexity is appended to "reply_complex_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #
        #- Sentiment value is appended to "s_fls_sentiment_list"
        #- Word Complexity is appended to "s_fls_complex_list"


# 3b. if sentence is ns_FLS
    # 3bi. if sentence is a "question":
        #- Sentiment value is appended to "question_sentiment_list"
        #- Word Complexity is appended to "question_complex_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #
        #- Sentiment value is appended to "ns_fls_sentiment_list"
        #- Word Complexity is appended to "ns_fls_complex_list"

    # 3bi. if sentence is a "reply":
        #- sentiment value is appended to "reply_sentiment_list"
        #- Word Complexity is appended to "reply_complex_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #
        #- Sentiment value is appended to "ns_fls_sentiment_list"
        #- Word Complexity is appended to "ns_fls_complex_list"

# 3c. if sentence is n_FLS
    # 3ci. if sentence is a "question":
        #- Sentiment value is appended to "question_sentiment_list"
        #- Word Complexity is appended to "question_complex_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #
        #- Sentiment value is appended to "n_fls_sentiment_list"
        #- Word Complexity is appended to "n_fls_complex_list"

    # 3ci. if sentence is a "reply":
        #- sentiment value is appended to "reply_sentiment_list"
        #- Word Complexity is appended to "reply_complex_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #
        #- Sentiment value is appended to "n_fls_sentiment_list"
        #- Word Complexity is appended to "n_fls_complex_list"


def getFeature23to46(analyst_speech, management_speech, speaker_names):
    """Placeholder for the Q&A features 23-46 (not implemented yet).

    The previous body looped over names that were never defined inside the
    function (``new_speech_bubble_list``, ``fls_results``, ``n_flslist``,
    ``s_flslist``, ``ns_flslist``), so it raised ``NameError`` unless those
    names happened to exist as leftover kernel globals. That loop and the
    unreachable commented-out code after ``return`` have been removed; the
    returned placeholder is unchanged.

    TODO - planned steps:
      1. Remove the speaker names from every speech bubble in
         ``analyst_speech`` and ``management_speech``.
      2. Split each bubble into sentences, classify every sentence as
         Specific FLS / Non-specific FLS / Not FLS, and record for each
         sentence whether it is a question or a reply.
      3. For every sentence, add its sentiment and word complexity to the
         question or reply aggregates and to the aggregates of its FLS class,
         and count it as positive, negative or neutral.

    Args:
        analyst_speech: Analyst speech bubbles (currently unused).
        management_speech: Management speech bubbles (currently unused).
        speaker_names: Speaker names (currently unused).

    Returns:
        list: 24 values, all 0 except the first, which holds the sentinel
        99999999 to mark the features as not yet computed.
    """
    fea_ext_list23to46 = [0] * 24

    # Sentinel marking that features 23-46 have not been computed.
    fea_ext_list23to46[0] = 99999999

    return fea_ext_list23to46


In [59]:
# NOTE(review): analyst_sentences, management_sentences and speaker_names are only assigned in the
# AAPL example cell further down this notebook, so on a fresh top-to-bottom run this cell raises
# NameError. Run it after that cell, or move it below.
fea_ext_list23to46 = getFeature23to46(analyst_sentences, management_sentences, speaker_names)
print(fea_ext_list23to46)

[99999999, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [41]:
# End-to-end example for a single transcript file.
sector = "tech"
stock = "AAPL"

path = "sectors/tech/AAPL/AAPL20224.csv"

# Parse the transcript and split it into prepared remarks and Q&A.
mytranscript = get_transcript(path)
transcript_safe_harbour, transcript_questions = split_transcript(mytranscript)

# Work out who the analysts are among the known speakers.
speaker_names = get_file_speaker_names(sector, stock)
analyst_names = find_analyst_names(speaker_names, transcript_questions)

# Every known speaker who was not detected as an analyst in this transcript.
not_current_analyst_names = [names for names in speaker_names if names not in analyst_names]

analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)

# Features 5-22 come from the prepared remarks.
fea_ext_list5to22 = getFeature5to22(transcript_safe_harbour, speaker_names)
print(fea_ext_list5to22)

# Features 23-46 come from the Q&A section.
fea_ext_list23to46 = getFeature23to46(analyst_sentences, management_sentences, speaker_names)
print(fea_ext_list23to46)


[0.46377516070077585, 0.5581395348837209, 0.08139534883720931, 61.514, -0.22222865952385795, 0.1111111111111111, 0.3333333333333333, 47.89, 0, 0.0, 0.0, 56.76, 0.5078843824611687, 0.5900621118012422, 0.06832298136645963, 60.35, 0.05232558139534884, 0.011627906976744186]


In [61]:
# Display the feature list 5-22 held in fea_ext_list5to22, one value per line.
fea_ext_list5to22

[0.46377516070077585,
 0.5581395348837209,
 0.08139534883720931,
 61.514,
 -0.22222865952385795,
 0.1111111111111111,
 0.3333333333333333,
 47.89,
 0,
 0.0,
 0.0,
 56.76,
 0.5078843824611687,
 0.5900621118012422,
 0.06832298136645963,
 60.35,
 0.05232558139534884,
 0.011627906976744186]

In [None]:
new_speaker_names = [word + ':' for word in speaker_names]
# NOTE(review): this trims mytranscript in place, so every re-run of the cell drops two more
# lines; re-create mytranscript with get_transcript before re-running.
mytranscript = mytranscript[0:-2]

ac = AhoCorasick(new_speaker_names)

# Preview the speaker-name removal on the first transcript line only.
for sentence in mytranscript[:1]:
    new_sentence = ac.search(sentence)
    # Raw string: "\S" is a regex class, not a valid Python string escape.
    new_sentence = re.sub(r'[^\S\n]+', ' ', new_sentence)
    # startswith is safe on an empty line, where indexing [0] would raise IndexError.
    if new_sentence.startswith(" "):
        new_sentence = new_sentence[1:]  # ' is is string is is string' to 'is is string is is string'
    print(new_sentence)


In [None]:
# Example: run the transcript-parsing helpers for one stock (AEHR in the semiconductors sector).
sector = "semiconductors"
stock = "AEHR"

# Collect the transcript files stored under sectors/<sector>/<stock>/.
# NOTE(review): the wildcard pattern presumably targets names like "<stock>20<yy><quarter>.csv" -
# confirm that it matches exactly the intended files.
sector_files = glob.glob('sectors/'+str(sector)+'/'+str(stock)+'/'+str(stock)+'20*[1-9]**[1-9]*[1-4].*')
sector_files.sort(reverse=True)  # descending lexicographic order of the paths
for path in sector_files: # for every .csv path of that stock
    mytranscript = get_transcript(path)
    transcript_safe_harbour, transcript_questions = split_transcript(mytranscript)
    speaker_names = get_file_speaker_names(sector, stock)
    analyst_names = find_analyst_names(speaker_names, transcript_questions)
    break  # stops after the first path, so only one transcript is processed


In [None]:
# List of .csv file info for each stock:
# ------------------------------------------------------------------------------------------------
# META DATA
# ------------------------------------------------------------------------------------------------
#
# 0. Year of transcript release
# 1. Quarter of transcript release
# 2. Date of transcript release
# 3. Earnings Transcript contents
#
# ------------------------------------------------------------------------------------------------
# Feature Extractions:
# ------------------------------------------------------------------------------------------------
#
# 4. EPS surprise value
#
# ------------------------------------------------------------------------------------------------
# Transcript Features:
# ------------------------------------------------------------------------------------------------
#
# Pre release:
# 5. Whole pre-release - net sentiment
# 6. Whole pre-release - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 7. Whole pre-release - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 8. Whole pre-release - net word complexity
#
#
# 9. Specific forward looking statement - sentiment
# 10. Specific forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 11. Specific forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 12. Specific forward looking statement - word complexity
#
# 13. Non Specific Forward looking statement - sentiment 
# 14. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 15. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 16. Non Specific Forward looking statement - word complexity
#
# 17. Not Forward looking statement - sentiment
# 18. Not Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 19. Not Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 20. Not Forward looking statement - word complexity
#
# 21: #of_specific/#of_non_specific+#of_not_fls+#of_specific
# 22: #of_non_specific/#of_non_specific+#of_not_fls+#of_specific
#
# ------------------------------------------------------------------------------------------------
#
# Questions & Answers:
# 23. Whole Q&A - net sentiment
# 24. Whole Q&A – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 25. Whole Q&A – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 26. Whole Q&A - net word complexity
# 
# 27. all question (aggregate) - sentiment
# 28. all question (aggregate) – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 29. all question (aggregate) – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 30. all question (aggregate) - word complex
#
# 31. all reply (aggregate) - sentiment
# 32. all reply (aggregate) – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 33. all reply (aggregate) – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 34. all reply (aggregate) - word complex
#
# For all replies (aggregate):
# 35. Specific forward looking statement - sentiment
# 36. Specific forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 37. Specific forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 38. Specific forward looking statement - word complexity
#
# 39. Non Specific Forward looking statement - sentiment 
# 40. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 41. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 42. Non Specific Forward looking statement - word complexity
#
# 43. Not Forward looking statement - sentiment
# 44. Not Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 45. Not Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 46. Not Forward looking statement - word complexity
#
# ------------------------------------------------------------------------------------------------
#
# With specific words analysis:
# Sentences that includes the word:
# all of these words can be plural (e.g. cost and costs)
# 47: "margin" - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 48: "cost" - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 49: "revenue" - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 50: "earnings" - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 51: "growth" - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 52: "EBITDA" -  #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 53: "leverage" -  #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 54: "debt" -  #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 55: "price" – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# ------------------------------------------------------------------------------------------------
#
# 56: Rolling frame of Whole Transcripts - TF-IDF value
# 57: Rolling frame of Pre-releases - TF-IDF value
# 58: Rolling frame of Management Sentences (Their Replies) - TF-IDF
#
# ------------------------------------------------------------------------------------------------
#
# 59: Stock price difference between Day 0 (Day earnings call is released) and Day 10
# 60: Stock price difference between Day 0 (Day earnings call is released) and Day 20
# 61: Stock price difference between Day 0 (Day earnings call is released) and Day 30
# 62: Stock price difference between Day 0 (Day earnings call is released) and Day 40
# 63: Stock price difference between Day 0 (Day earnings call is released) and Day 50
# 64: Stock price difference between Day 0 (Day earnings call is released) and Day 60
# 65: Stock price difference between Day 0 (Day earnings call is released) and Day 70
# 66: Stock price difference between Day 0 (Day earnings call is released) and Day 80
# 67: Stock price difference between Day 0 (Day earnings call is released) and Day 90
#
# 68: Stock price difference between Day 1 (Day earnings call is released) and Day 10
# 69: Stock price difference between Day 1 (Day earnings call is released) and Day 20
# 70: Stock price difference between Day 1 (Day earnings call is released) and Day 30
# 71: Stock price difference between Day 1 (Day earnings call is released) and Day 40
# 72: Stock price difference between Day 1 (Day earnings call is released) and Day 50
# 73: Stock price difference between Day 1 (Day earnings call is released) and Day 60
# 74: Stock price difference between Day 1 (Day earnings call is released) and Day 70
# 75: Stock price difference between Day 1 (Day earnings call is released) and Day 80
# 76: Stock price difference between Day 1 (Day earnings call is released) and Day 90


In [None]:
# TODO: delete operator name

In [None]:
# https://www.spglobal.com/marketintelligence/en/news-insights/blog/analyzing-sentiment-in-quarterly-earnings-calls-q2-2022


# https://www.amenityanalytics.com/case-studies/earnings-call-transcript-analysis




#TF-IDF ----> from management sentences (Replies + pre-release)