In [347]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import numpy as np
from datetime import datetime
import yfinance as yf
import os
import glob
import regex as re
import csv


In [405]:
def get_transcript(path):
    """Load one transcript from a CSV file and return it as a list of lines.

    The text is taken from the third data row, first column, of the CSV at
    ``path``. Every character other than ASCII letters, digits, ``. , : ! '``,
    spaces and newlines is removed, then each run of non-newline whitespace is
    collapsed to a single space before the text is split on line breaks.

    Args:
        path: Location of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        list[str]: The cleaned transcript, one element per line.
    """
    raw_text = pd.read_csv(path).iloc[2, 0]
    cleaned = re.sub(r'[^A-Za-z0-9.,:!\'\n ]', '', raw_text)
    # Raw string: '\S' inside a normal string literal is an invalid escape sequence.
    # Collapses repeated spaces/tabs without touching the newlines used for splitting.
    cleaned = re.sub(r'[^\S\n]+', ' ', cleaned)
    return cleaned.splitlines()

def split_transcript(mytranscript):
    """Split a transcript into the part before and the part from the Q&A hand-over.

    Lines are scanned in order, ignoring the first three. A line is lower-cased
    and all punctuation except ":" is replaced by a space before it is tested:

    * it contains "operator:" together with any of "question", "go ahead",
      "operator instructions" or "first": the Q&A part starts AT this line;
    * otherwise it contains both "operator" and "question": the Q&A part
      starts on the line AFTER it.

    Returns:
        tuple: ``(transcript_safe_harbour, transcript_questions)`` as slices of
        ``mytranscript``, or ``("", "")`` when no line qualifies.
    """
    operator_cues = ("question", "go ahead", "operator instructions", "first")

    for position, raw_line in enumerate(mytranscript):
        # punctuation (except ":") becomes a space so the keyword tests see clean words
        line = re.sub(r'[^\w\s:]', ' ', raw_line.lower())
        if position <= 2:
            continue

        if "operator:" in line:
            if any(cue in line for cue in operator_cues):
                return mytranscript[:position], mytranscript[position:]
        elif "operator" in line and "question" in line:
            cut = position + 1
            return mytranscript[:cut], mytranscript[cut:]

    return "", ""

def get_file_speaker_names(sector, stock):
    """Load the speaker names stored for one stock.

    Reads ``sectors/<sector>/<stock>/speaker names.csv`` (relative to the
    current working directory) as tab-delimited text.

    Args:
        sector: Name of the sector sub-directory.
        stock: Name of the stock sub-directory.

    Returns:
        numpy.ndarray: 1-D array of strings, one per line of the file.
    """
    names_path = f"sectors/{sector}/{stock}/speaker names.csv"
    # ndmin=1: without it a file holding a single name yields a 0-d array,
    # which callers cannot iterate over.
    return np.loadtxt(names_path, delimiter='\t', dtype=str, ndmin=1)
    
# finds a list of analyst names for a single .csv file
def find_analyst_names(speaker_names, transcript_questions):
    """Return the speakers that the operator introduces in the Q&A section.

    Every line except the last two is lower-cased and stripped of punctuation
    (":" is kept). On each line containing "operator:", a speaker counts as an
    analyst when any part of their name longer than two characters occurs in
    the line (plain substring test), or when one of the name parts is
    "unidentified" (e.g. "Unidentified Analyst"). The speaker "operator" is
    never returned.

    Args:
        speaker_names: Iterable of speaker name strings.
        transcript_questions: Sequence of Q&A transcript lines.

    Returns:
        list: Matching names without duplicates, in order of first match.
    """
    # Tokenise every candidate once instead of once per operator line.
    candidates = []
    for name in speaker_names:
        lowered = name.lower()
        if lowered == "operator":
            continue
        parts = lowered.split()
        # Parts of one or two characters are ignored: an initial such as the "A" in
        # "A Gayn Erickson" would otherwise match almost any line.
        long_parts = [part for part in parts if len(part) > 2]
        candidates.append((name, long_parts, "unidentified" in parts))

    # A dict keeps first-seen order, so the result is the same on every run
    # (list(set(...)) is not, because string hashing is randomised per process).
    found = {}
    for raw_bubble in transcript_questions[:-2]:
        speech_bubble = re.sub(r'[^\w\s:]', ' ', raw_bubble.lower())
        if "operator:" not in speech_bubble:
            continue
        for name, long_parts, is_unidentified in candidates:
            if is_unidentified or any(part in speech_bubble for part in long_parts):
                found.setdefault(name, None)

    return list(found)

def get_analyst_management_sentences(analyst_names, transcript_questions):
    """Separate Q&A lines into analyst lines and management lines.

    The speaker of a line is the text before its first ":". Lines spoken by a
    name in ``analyst_names`` go to the analyst list, lines spoken by
    "operator" (any casing) are dropped, and everything else goes to the
    management list. The last two lines of ``transcript_questions`` are not
    examined.

    Args:
        analyst_names: Collection of analyst name strings (exact match).
        transcript_questions: Sequence of Q&A transcript lines.

    Returns:
        tuple[list, list]: ``(analyst_sentences, management_sentences)``; the
        lines are returned unchanged, including their speaker prefix.
    """
    analysts = set(analyst_names)
    analyst_sentences = []
    management_sentences = []

    for speech_bubble in transcript_questions[:-2]:
        speaker_name, colon, _ = speech_bubble.partition(":")
        if not colon:
            # No speaker prefix. str.find(":") would return -1 here and slicing with
            # it silently drops the last character, so such a line could be mistaken
            # for an analyst or the operator. It is kept as a management line.
            management_sentences.append(speech_bubble)
        elif speaker_name in analysts:
            analyst_sentences.append(speech_bubble)
        elif speaker_name.lower() != "operator":
            management_sentences.append(speech_bubble)

    return analyst_sentences, management_sentences


In [406]:
class TrieNode:
    """One state of the Aho-Corasick automaton."""

    def __init__(self):
        self.children = {}    # next character -> child TrieNode
        self.is_word = False  # True when at least one pattern ends at this state
        self.fail = None      # failure link; stays None only for the root
        # Longest pattern ending at this state: the state's own pattern, or the one
        # inherited through the failure link when only a suffix is a pattern.
        self.word = None


class AhoCorasick:
    """Multi-pattern matcher used to delete a fixed set of strings from a text.

    Args:
        words: Iterable of pattern strings.
    """

    def __init__(self, words):
        self.root = TrieNode()
        self.build_trie(words)
        self.build_ac_automata()

    def build_trie(self, words):
        """Insert every pattern into the trie, marking the state where it ends."""
        for word in words:
            node = self.root
            for char in word:
                if char not in node.children:
                    node.children[char] = TrieNode()
                node = node.children[char]
            node.is_word = True
            node.word = word

    def build_ac_automata(self):
        """Compute the failure links breadth-first and propagate match information."""
        from collections import deque  # O(1) pops from the left, unlike list.pop(0)

        queue = deque()
        for node in self.root.children.values():
            node.fail = self.root
            queue.append(node)

        while queue:
            node = queue.popleft()
            for char, child in node.children.items():
                queue.append(child)
                fail_node = node.fail
                while fail_node is not None and char not in fail_node.children:
                    fail_node = fail_node.fail
                child.fail = self.root if fail_node is None else fail_node.children[char]
                if child.fail.is_word:
                    child.is_word = True
                    # Inherit the matched pattern as well as the flag. Without this a
                    # state such as "abc" (prefix of "abcd", ending in the pattern "bc")
                    # has is_word=True but word=None, and remove_words() then calls
                    # str.replace(None, '') and raises TypeError. The failure target is
                    # shallower and was already processed, so its `word` is final.
                    if child.word is None:
                        child.word = child.fail.word

    def remove_words(self, text):
        """Return ``text`` with the patterns found in it deleted.

        ``text`` is scanned once; whenever a pattern ends at the current
        position, every occurrence of the longest such pattern is removed from
        the result with ``str.replace``.
        """
        node = self.root
        new_text = text
        for char in text:
            while node is not None and char not in node.children:
                node = node.fail
            if node is None:
                # `char` cannot start or extend any pattern: restart from the root.
                node = self.root
                continue
            node = node.children[char]
            if node.is_word:
                new_text = new_text.replace(node.word, '')
        return new_text


# Usage
list1 = ['string1', 'string2', 'string3']
sentence = "string1 is string2 is string is string3 is string"
ac = AhoCorasick(list1)
new_sentence = ac.remove_words(sentence)  # deletes every string of list1 that is present in the sentence
# Raw string: '\S' inside a normal string literal is an invalid escape sequence.
new_sentence = re.sub(r'[^\S\n]+', ' ', new_sentence)  # collapse the doubled spaces left behind by the removals
new_sentence

' is is string is is string'

In [3]:
!pip install --upgrade "protobuf<=3.20.1"

Collecting protobuf<=3.20.1
  Downloading protobuf-3.20.1-py2.py3-none-any.whl (162 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.1/162.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: protobuf
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-macos 2.9.2 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.1 which is incompatible.
tensorboard 2.9.1 requires protobuf<3.20,>=3.9.2, but you have protobuf 3.20.1 which is incompatible.
grpcio-status 1.51.1 requires protobuf>=4.21.6, but you have protobuf 3.20.1 which is incompatible.
google-cloud-firestore 2.9.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.20.1 which is incompatible.
google-api-core 2.11.0 requires protobuf!=3.20.0

In [351]:
import transformers
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch
import numpy as np
import textstat

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS


In [352]:
# Sentiment model (3 output labels) and the tokenizer published with it.
sentiment_finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)
sentiment_tokenizer = BertTokenizer.from_pretrained(
    'yiyanghkust/finbert-tone',
)

In [353]:
# Text-classification pipeline wrapping the sentiment model and tokenizer loaded above.
sentiment_nlp = pipeline(
    "text-classification",
    tokenizer=sentiment_tokenizer,
    model=sentiment_finbert,
)

In [354]:
# Smoke test: classify one sample paragraph and display the raw pipeline output
# (a list holding one {'label', 'score'} dict, as shown in the cell output).
mystr = "I mean, I can comment a little bit about it. I mean, the corridor that we did very well in with Cuba and there is a I don't know how else to explain it, but there's a black market currency and a regular currency. And people are basically choosing to do business in cash in Cuba because they can buy way more on the black market versus paying for things here, where we have to obviously not do that and that's really the situation. And it's and again, it's not just for us, it's for all of our competitors as well. They are all seeing the same deterioration."
result = sentiment_nlp(mystr)
result

[{'label': 'Neutral', 'score': 0.9691150784492493}]

In [355]:
def map_sentiments(sentiment_result):
    """Convert the first pipeline prediction into a signed score and a polarity tag.

    Args:
        sentiment_result: Sequence whose first item is a dict with a 'label'
            key and, for non-neutral labels, a 'score' key.

    Returns:
        ``(-score, "negative")`` for 'Negative', ``(0, "neutral")`` for
        'Neutral', ``(score, "positive")`` for 'Positive', and ``None`` for
        any other label.
    """
    prediction = sentiment_result[0]
    label = prediction['label']

    if label == 'Neutral':
        return 0, "neutral"
    if label == 'Positive':
        return prediction['score'], "positive"
    if label == 'Negative':
        return -1 * prediction['score'], "negative"
    return None

In [356]:
import nltk
nltk.download('punkt')

def split_paragraph_into_sentences(temp):
    """Return the sentences of ``temp`` as produced by ``nltk.sent_tokenize``."""
    return nltk.sent_tokenize(temp)

[nltk_data] Downloading package punkt to /Users/victor/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [357]:
import statistics


In [358]:
# FLS classification: model (3 output labels), its tokenizer, and the pipeline built from both
fls_finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-fls',
    num_labels=3,
)
fls_tokenizer = BertTokenizer.from_pretrained(
    'yiyanghkust/finbert-fls',
)
fls_nlp = pipeline(
    "text-classification",
    tokenizer=fls_tokenizer,
    model=fls_finbert,
)

In [359]:
def get_NLP_values(liststr):
    """Return sentiment and readability values for one piece of text.

    Args:
        liststr: The text to analyse; it is passed unchanged to the sentiment
            pipeline and to textstat.

    Returns:
        tuple: ``(sentiment_score, flesch_score, gunning_fog_score)``. All
        three are ``0`` for empty input. ``sentiment_score`` is the signed
        score from ``map_sentiments`` (``0`` when the label is not recognised).
    """
    if len(liststr) == 0:
        return 0, 0, 0

    # map_sentiments returns a (score, polarity) pair, or None for an unknown label.
    # Only the numeric score is returned here so that the result has the same shape
    # as the empty-input branch above (previously the whole pair was returned).
    mapped = map_sentiments(sentiment_nlp(liststr))
    sentiment_score = mapped[0] if mapped is not None else 0

    # word complexity
    flesch_score = textstat.flesch_reading_ease(liststr)
    gunning_fog_score = textstat.gunning_fog(liststr)

    return sentiment_score, flesch_score, gunning_fog_score

In [407]:
# Pre release:
# 5. Whole pre-release - net sentiment
# 6. Whole pre-release - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 7. Whole pre-release - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 8. Whole pre-release - net word complexity
#
# 9. Specific forward looking statement - sentiment
# 10. Specific forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 11. Specific forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 12. Specific forward looking statement - word complexity
#
# 13. Non Specific Forward looking statement - sentiment 
# 14. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 15. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 16. Non Specific Forward looking statement - word complexity
#
# 17. Not Forward looking statement - sentiment
# 18. Not Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 19. Not Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 20. Not Forward looking statement - word complexity
#
# 21: #of_specific/#of_non_specific+#of_not_fls+#of_specific
# 22: #of_non_specific/#of_non_specific+#of_not_fls+#of_specific

def getFeature5to22(pre_release, speaker_names):
    """Compute features 5-22 for the pre-release part of a transcript.

    Each speech bubble has the ``"<speaker name>:"`` labels removed and its
    whitespace collapsed, is scored with ``textstat.flesch_reading_ease`` and
    is split into sentences; every sentence is classified by ``fls_nlp`` as
    'Specific FLS', 'Non-specific FLS' or 'Not FLS'. Each class is then
    summarised by ``get_fls_features``.

    The work is best effort: a bubble, or a whole FLS class, that raises
    during processing is skipped and contributes nothing, and every feature
    that cannot be computed stays ``0``.

    Args:
        pre_release: Iterable of speech-bubble strings.
        speaker_names: Iterable of speaker names (without the trailing ":").

    Returns:
        list: 18 values in this order:
        5 mean sentiment of all sentences, 6 share of positive sentences,
        7 share of negative sentences, 8 mean Flesch score of the bubbles,
        9-12 / 13-16 / 17-20 the four ``get_fls_features`` values for
        'Specific FLS' / 'Non-specific FLS' / 'Not FLS',
        21 share of specific-FLS sentences, 22 share of non-specific-FLS
        sentences.
    """
    fls_labels = ('Specific FLS', 'Non-specific FLS', 'Not FLS')
    ac = AhoCorasick([name + ':' for name in speaker_names])

    flesch_list = []
    sentences_by_label = {label: [] for label in fls_labels}

    for speech_bubble in pre_release:
        try:
            cleaned = re.sub(r'[^\S\n]+', ' ', ac.remove_words(speech_bubble))
            if not cleaned:
                continue  # nothing left once the speaker label is removed
            if cleaned[0] == " ":
                # drop the single space that followed the removed "<name>:" label
                cleaned = cleaned[1:]

            # text complexity of the whole bubble
            flesch_list.append(textstat.flesch_reading_ease(cleaned))

            sentences = split_paragraph_into_sentences(cleaned)
            for sentence, fls_result in zip(sentences, fls_nlp(sentences)):
                bucket = sentences_by_label.get(fls_result['label'])
                if bucket is not None:
                    bucket.append(sentence)
        except Exception:
            # Best effort: a bubble the libraries cannot process is skipped.
            continue

    class_features = []  # features 9-20, four per FLS class
    net_sentiment_list = []
    net_positive = net_negative = net_neutral = 0
    for label in fls_labels:
        try:
            (mean_sentiment, positive_share, negative_share, complexity,
             scores, positives, negatives, neutrals) = get_fls_features(sentences_by_label[label])
        except Exception:
            # A failing class contributes zeros. Previously this left the per-class
            # totals undefined and the function died with UnboundLocalError.
            mean_sentiment = positive_share = negative_share = complexity = 0
            scores, positives, negatives, neutrals = [], 0, 0, 0
        class_features += [mean_sentiment, positive_share, negative_share, complexity]
        net_sentiment_list += scores
        net_positive += positives
        net_negative += negatives
        net_neutral += neutrals

    sentiment_total = net_positive + net_negative + net_neutral
    feature_extract_5 = statistics.mean(net_sentiment_list) if net_sentiment_list else 0
    feature_extract_6 = net_positive / sentiment_total if sentiment_total else 0
    feature_extract_7 = net_negative / sentiment_total if sentiment_total else 0
    feature_extract_8 = statistics.mean(flesch_list) if flesch_list else 0

    sentence_total = sum(len(sentences_by_label[label]) for label in fls_labels)
    feature_extract_21 = len(sentences_by_label['Specific FLS']) / sentence_total if sentence_total else 0
    feature_extract_22 = len(sentences_by_label['Non-specific FLS']) / sentence_total if sentence_total else 0

    return (
        [feature_extract_5, feature_extract_6, feature_extract_7, feature_extract_8]
        + class_features
        + [feature_extract_21, feature_extract_22]
    )


def get_fls_features(flslist):
    """Summarise the sentiment and readability of one list of sentences.

    Every sentence is scored with ``sentiment_nlp`` and ``map_sentiments``.

    Args:
        flslist: List of sentence strings.

    Returns:
        tuple: ``(mean_sentiment, positive_share, negative_share, flesch,
        sentiment_scores, n_positive, n_negative, n_neutral)``.
        ``mean_sentiment`` and the two shares are ``0`` for an empty list.
        ``flesch`` is ``textstat.flesch_reading_ease`` of the sentences joined
        by spaces, or ``0`` if that call raises.
    """
    fls_sentiment_list = []
    net_positive = net_negative = net_neutral = 0

    for each_fls in flslist:
        sentiment_score, positivity_value = map_sentiments(sentiment_nlp(each_fls))
        fls_sentiment_list.append(sentiment_score)

        if positivity_value == "positive":
            net_positive += 1
        elif positivity_value == "negative":
            net_negative += 1
        else:
            net_neutral += 1

    # Explicit emptiness checks replace the bare `except:` blocks that used to hide
    # StatisticsError / ZeroDivisionError (and would also have swallowed Ctrl-C).
    total = net_positive + net_negative + net_neutral
    feature_extract_1 = statistics.mean(fls_sentiment_list) if fls_sentiment_list else 0
    feature_extract_2 = net_positive / total if total else 0
    feature_extract_3 = net_negative / total if total else 0

    try:
        feature_extract_4 = textstat.flesch_reading_ease(' '.join(flslist))
    except Exception:
        # best effort: keep the default when the text cannot be scored
        feature_extract_4 = 0

    return (feature_extract_1, feature_extract_2, feature_extract_3, feature_extract_4,
            fls_sentiment_list, net_positive, net_negative, net_neutral)


In [408]:
# Questions & Answers:
# 23. Whole Q&A - net sentiment "net_sentiment_list"
# 24. Whole Q&A – #of_NETpositive(sentiment)/#of_NETnegative+NETpositive+NETneutral(sentiment)
# 25. Whole Q&A – #of_NETnegative(sentiment)/#of_NETnegative+NETpositive+NETneutral(sentiment)
# 26. Whole Q&A - net word complexity "net_text_complex_list"
# 
# 27. all question (aggregate) - sentiment "question_sentiment_list"
# 28. all question (aggregate) – #of_Qpositive(sentiment)/#of_Qnegative+Qpositive+Qneutral(sentiment)
# 29. all question (aggregate) – #of_Qnegative(sentiment)/#of_Qnegative+Qpositive+Qneutral(sentiment)
# 30. all question (aggregate) - net word complexity "questions_complex_list"
#
# 31. all reply (aggregate) - sentiment "reply_sentiment_list"
# 32. all reply (aggregate) – #of_Rpositive(sentiment)/#of_Rnegative+Rpositive+Rneutral(sentiment)
# 33. all reply (aggregate) – #of_Rnegative(sentiment)/#of_Rnegative+Rpositive+Rneutral(sentiment)
# 34. all reply (aggregate) - net word complexity "reply_complex_list"


# For all replies (aggregate):
# 35. Specific forward looking statement - sentiment
# 36. Specific forward looking statement - #of_SFLSpositive(sentiment)/#of_SFLSnegative+SFLSpositive+SFLSneutral(sentiment)
# 37. Specific forward looking statement - #of_SFLSnegative(sentiment)/#of_SFLSnegative+SFLSpositive+SFLSneutral(sentiment)
#
# 38. Non Specific Forward looking statement - sentiment 
# 39. Non Specific Forward looking statement - #of_NSFLSpositive(sentiment)/#of_NSFLSnegative+NSFLSpositive+NSFLSneutral(sentiment)
# 40. Non Specific Forward looking statement - #of_NSFLSnegative(sentiment)/#of_NSFLSnegative+NSFLSpositive+NSFLSneutral(sentiment)
#
# 41. Not Forward looking statement - sentiment
# 42. Not Forward looking statement - #of_NFLSpositive(sentiment)/#of_NFLSnegative+NFLSpositive+NFLSneutral(sentiment)
# 43. Not Forward looking statement - #of_NFLSnegative(sentiment)/#of_NFLSnegative+NFLSpositive+NFLSneutral(sentiment)


# 1. get rid of the speaker names from each "speech bubble" (i.e. analyst_speech, management_speech)

# 2. FOR loop of each speech bubble
    # 2a. Parse them to sentences:
    # 2b. For each in sentences:
        #- classify the sentences (S-FLS, NS-FLS, N-FLS)
        #- Creates x3 FLS lists 
    # 2c. finds text complexity of each speech bubble
    # 2d. adds text complexity value of each speech buble to net_text_complex_list 

# 3. Goes through each FLS list and find sentiment of each with the following conditions:
    # 3a. if sentence is s_FLS
        #- sentiment value is appended to "reply_sentiment_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #- SFLSnet_positive, SFLSnet_negative, or SFLSnet_neutral += 1 (total number of sentences with positive sentiments)
        #- Sentiment value is appended to "s_fls_sentiment_list"

    # 3b. if sentence is ns_FLS
        #- sentiment value is appended to "reply_sentiment_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #- NSFLSnet_positive, NSFLSnet_negative, or NSFLSnet_neutral += 1 (total number of sentences with positive sentiments)
        #- Sentiment value is appended to "ns_fls_sentiment_list"

    # 3c. if sentence is n_FLS
        #- sentiment value is appended to "reply_sentiment_list"
        #- net_positive, net_negative, or net_neutral += 1 (total number of sentences with positive sentiments)
        #- NFLSnet_positive, NFLSnet_negative, or NFLSnet_neutral += 1 (total number of sentences with positive sentiments)
        #- Sentiment value is appended to "n_fls_sentiment_list"

# 4. finds text complexity of s_FLS, ns_FLS, n_FLS

# 5. get features based on the mean of relevant lists

def getFeature23to43(analyst_speech, management_speech, speaker_names):
    """Compute features 23-43 for the Q&A part of a transcript.

    Analyst bubbles (questions) are cleaned, scored with
    ``textstat.flesch_reading_ease`` and split into sentences, each of which
    is scored with ``sentiment_nlp`` / ``map_sentiments``. Management bubbles
    (replies) are cleaned and scored the same way, but their sentences are
    first classified by ``fls_nlp`` ('Not FLS', 'Specific FLS',
    'Non-specific FLS') and then scored per class through
    ``get_SentimentLists_from_FLS``.

    The work is best effort: a bubble or FLS class that raises is skipped, and
    every feature is computed independently, so a group with no data yields
    ``0`` for its own features without affecting the others.

    Args:
        analyst_speech: Iterable of analyst speech-bubble strings.
        management_speech: Iterable of management speech-bubble strings.
        speaker_names: Iterable of speaker names (without the trailing ":").

    Returns:
        list: 21 values in this order:
        23-26 whole Q&A (mean sentiment, positive share, negative share, mean
        Flesch score), 27-30 the same four for questions, 31-34 the same four
        for replies, then mean sentiment / positive share / negative share for
        specific FLS (35-37), non-specific FLS (38-40) and not-FLS (41-43)
        reply sentences.
    """
    ac = AhoCorasick([name + ':' for name in speaker_names])

    def _clean(speech_bubble):
        """Remove speaker labels and extra whitespace; None when nothing is left."""
        cleaned = re.sub(r'[^\S\n]+', ' ', ac.remove_words(speech_bubble))
        if not cleaned:
            return None
        # drop the single space that followed the removed "<name>:" label
        return cleaned[1:] if cleaned[0] == " " else cleaned

    def _mean(values):
        """Mean of ``values``, or 0 when there are none."""
        return statistics.mean(values) if values else 0

    def _share(count, total):
        """``count / total``, or 0 when ``total`` is 0."""
        return count / total if total else 0

    # ---- questions -------------------------------------------------------
    question_scores = []
    question_complexity = []
    q_positive = q_negative = q_neutral = 0
    for speech_bubble in analyst_speech:
        try:
            cleaned = _clean(speech_bubble)
            if cleaned is None:
                continue
            question_complexity.append(textstat.flesch_reading_ease(cleaned))
            for sentence in split_paragraph_into_sentences(cleaned):
                score, polarity = map_sentiments(sentiment_nlp(sentence))
                question_scores.append(score)
                if polarity == "positive":
                    q_positive += 1
                elif polarity == "negative":
                    q_negative += 1
                else:
                    q_neutral += 1
        except Exception:
            # Best effort: the rest of a bubble that cannot be processed is skipped.
            continue

    # ---- replies: bucket sentences by FLS class --------------------------
    fls_labels = ('Not FLS', 'Specific FLS', 'Non-specific FLS')
    reply_complexity = []
    reply_sentences = {label: [] for label in fls_labels}
    for speech_bubble in management_speech:
        try:
            cleaned = _clean(speech_bubble)
            if cleaned is None:
                continue
            reply_complexity.append(textstat.flesch_reading_ease(cleaned))
            sentences = split_paragraph_into_sentences(cleaned)
            for sentence, fls_result in zip(sentences, fls_nlp(sentences)):
                bucket = reply_sentences.get(fls_result['label'])
                if bucket is not None:
                    bucket.append(sentence)
        except Exception:
            continue

    # ---- replies: sentiment per FLS class --------------------------------
    reply_scores = []
    r_positive = r_negative = r_neutral = 0
    class_scores = {}
    class_counts = {}
    for label in fls_labels:
        try:
            # Fresh accumulators are passed in so that a class failing half-way
            # cannot leave partially updated totals behind.
            outcome = get_SentimentLists_from_FLS(
                [], reply_sentences[label], [], [], 0, 0, 0, 0, 0, 0, 0, 0, 0)
            scores = outcome[0]
            positives, negatives, neutrals = outcome[5:8]
        except Exception:
            scores, positives, negatives, neutrals = [], 0, 0, 0
        class_scores[label] = scores
        class_counts[label] = (positives, negatives, neutrals)
        reply_scores += scores
        r_positive += positives
        r_negative += negatives
        r_neutral += neutrals

    # ---- features --------------------------------------------------------
    # Each value is guarded on its own. Previously one empty list raised in an
    # unguarded statistics.mean() and silently left every later feature at 0.
    net_positive = q_positive + r_positive
    net_negative = q_negative + r_negative
    net_total = net_positive + net_negative + q_neutral + r_neutral
    q_total = q_positive + q_negative + q_neutral
    r_total = r_positive + r_negative + r_neutral

    features = [
        _mean(question_scores + reply_scores),            # 23
        _share(net_positive, net_total),                  # 24
        _share(net_negative, net_total),                  # 25
        _mean(question_complexity + reply_complexity),    # 26
        _mean(question_scores),                           # 27
        _share(q_positive, q_total),                      # 28
        _share(q_negative, q_total),                      # 29
        _mean(question_complexity),                       # 30
        _mean(reply_scores),                              # 31
        _share(r_positive, r_total),                      # 32
        _share(r_negative, r_total),                      # 33
        _mean(reply_complexity),                          # 34
    ]
    for label in ('Specific FLS', 'Non-specific FLS', 'Not FLS'):  # 35-37, 38-40, 41-43
        positives, negatives, neutrals = class_counts[label]
        class_total = positives + negatives + neutrals
        features += [
            _mean(class_scores[label]),
            _share(positives, class_total),
            _share(negatives, class_total),
        ]
    return features

def get_SentimentLists_from_FLS(net_sentiment_list, THIS_flslist, THISfls_sentiment_list, reply_sentiment_list, Rpositive, Rnegative, Rneutral, FLSpositive, FLSnegative, FLSneutral, net_positive, net_negative, net_neutral):
    """Score every sentence of one FLS class and update the running totals.

    For each sentence in ``THIS_flslist`` the sentiment score is appended, in
    place, to ``THISfls_sentiment_list``, ``reply_sentiment_list`` and
    ``net_sentiment_list``; the matching reply-level, class-level and
    overall counter (positive, negative, otherwise neutral) is increased by 1.

    Returns:
        tuple: ``(THISfls_sentiment_list, reply_sentiment_list, Rpositive,
        Rnegative, Rneutral, FLSpositive, FLSnegative, FLSneutral,
        net_positive, net_negative, net_neutral)`` with the updated values.
    """
    score_sinks = (THISfls_sentiment_list, reply_sentiment_list, net_sentiment_list)

    for each_fls_sentence in THIS_flslist:
        sentiment_score, positivity_value = map_sentiments(sentiment_nlp(each_fls_sentence))
        for sink in score_sinks:
            sink.append(sentiment_score)

        if positivity_value == "positive":
            net_positive, FLSpositive, Rpositive = net_positive + 1, FLSpositive + 1, Rpositive + 1
        elif positivity_value == "negative":
            net_negative, FLSnegative, Rnegative = net_negative + 1, FLSnegative + 1, Rnegative + 1
        else:
            net_neutral, FLSneutral, Rneutral = net_neutral + 1, FLSneutral + 1, Rneutral + 1

    return (THISfls_sentiment_list, reply_sentiment_list,
            Rpositive, Rnegative, Rneutral,
            FLSpositive, FLSnegative, FLSneutral,
            net_positive, net_negative, net_neutral)


In [409]:
# Specific-word analysis:
# Average sentiment of the sentences that include each of the following words.
# All of these words can be plural (e.g. cost and costs).
# 44: "margin" - average sentiment
# 45: "cost" - average sentiment
# 46: "revenue" - average sentiment
# 47: "earnings or EBITDA" - average sentiment (note: the matcher below searches for the spelling " EBIDTA")
# 48: "growth" - average sentiment
# 49: "leverage or debt" - average sentiment
# 50: "price" - average sentiment
# 51: "operation" - average sentiment

def deepCleanTranscript(mytranscript, speaker_names=None):
    """Join transcript lines into one string with the speaker labels removed.

    The lines are joined with spaces, every ``"<speaker name>:"`` label is
    deleted, runs of non-newline whitespace are collapsed to one space and a
    single leading space is dropped.

    Args:
        mytranscript: Iterable of transcript line strings.
        speaker_names: Iterable of speaker names (without the trailing ":").
            When omitted, the module-level ``speaker_names`` variable is used,
            which is what this function always read before the parameter
            existed.

    Returns:
        str: The cleaned transcript; ``""`` when there is no text.

    Raises:
        NameError: If ``speaker_names`` is omitted and no module-level
            ``speaker_names`` exists.
    """
    if speaker_names is None:
        try:
            speaker_names = globals()['speaker_names']
        except KeyError:
            raise NameError("name 'speaker_names' is not defined") from None

    ac = AhoCorasick([name + ':' for name in speaker_names])
    updatedTranscript = ac.remove_words(' '.join(mytranscript))

    # Collapse whitespace first, then drop the one leading space, the same order the
    # feature functions use. startswith() also copes with an empty transcript,
    # where indexing [0] raised IndexError.
    updatedTranscript = re.sub(r'[^\S\n]+', ' ', updatedTranscript)
    if updatedTranscript.startswith(" "):
        updatedTranscript = updatedTranscript[1:]

    # NOTE(review): the original called updatedTranscript.lower() and discarded the
    # result, so the text was never lower-cased. That no-op is removed and the casing
    # is left untouched; confirm whether lower-casing was actually intended.
    return updatedTranscript

def getFeature44to51(mytranscript, speaker_names):
    """Average sentiment of the sentences that mention each finance topic.

    Parameters
    ----------
    mytranscript : iterable of str
        Transcript pieces, handed to ``deepCleanTranscript``.
    speaker_names :
        Accepted for signature compatibility. It is not used here:
        ``deepCleanTranscript`` is called with the transcript only.

    Returns
    -------
    list
        Eight values, features 44..51 in order: margin/return, cost,
        revenue/top line/sales, earnings/EBITDA/profit/bottom line/net income,
        growth/organic, leverage/debt, price, operation. A topic that is never
        mentioned yields ``0``.
    """
    # One tuple of trigger substrings per feature, in output order. The leading
    # space keeps e.g. "cost" from matching inside another word. It also means
    # a keyword that is the very first word of a sentence is not matched.
    # Keywords are lowercase because matching is done on a lowercased copy.
    keyword_groups = [
        (" margin", " return"),                                   # 44
        (" cost",),                                               # 45
        (" revenue", " top line", " sales"),                      # 46
        # " ebidta" is the misspelling that used to be the only form matched;
        # it is kept next to the correct " ebitda" for backward compatibility.
        (" earning", " ebitda", " ebidta", " profit", " bottom line", " net income"),  # 47
        (" growth", " organic"),                                  # 48
        (" leverage", " debt"),                                   # 49
        (" price",),                                              # 50
        (" operation",),                                          # 51
    ]
    scores_per_group = [[] for _ in keyword_groups]

    updatedTranscript = deepCleanTranscript(mytranscript)
    updatedTranscriptList = split_paragraph_into_sentences(updatedTranscript)

    for mysentence in updatedTranscriptList:
        lowered = mysentence.lower()
        matched = [
            scores
            for keywords, scores in zip(keyword_groups, scores_per_group)
            if any(keyword in lowered for keyword in keywords)
        ]
        if not matched:
            continue

        # Score the sentence once and reuse the score for every topic it hits,
        # instead of re-running the model for each matching topic.
        sentiment_score, _ = map_sentiments(sentiment_nlp(mysentence))
        for scores in matched:
            scores.append(sentiment_score)

    # statistics.mean([]) raises StatisticsError, so fall back to 0 when a
    # topic never occurs (the recorded notebook outputs show that 0).
    return [statistics.mean(scores) if scores else 0 for scores in scores_per_group]


In [232]:
#testing TFIDF

# Manual TF-IDF experiment on four AAPL transcripts (file names run from
# 2022 Q4 down to 2022 Q1). The loop below mirrors the body of getFeature52to63.
# NOTE(review): `sector` and `stock` are not assigned in this cell; they must
# already exist in the kernel when it is run.
rolling_path_of_four_transcript = [
    'sectors/tech/AAPL/AAPL20224.csv', 
    'sectors/tech/AAPL/AAPL20223.csv',
    'sectors/tech/AAPL/AAPL20222.csv',
    'sectors/tech/AAPL/AAPL20221.csv'
    ]

# One list per transcript section; each ends up holding one cleaned string per path.
WHOLE_rolling_frame_of_four_transcripts = []
PRERELEASE_rolling_frame_of_four_transcripts = []
MANAGEMENT_SENTENCES_rolling_frame_of_four_transcripts = []
ANALYST_SENTENCES_rolling_frame_of_four_transcripts = []


for path in rolling_path_of_four_transcript:
    wholeTranscript = get_transcript(path)
    transcript_safe_harbour, transcript_questions = split_transcript(wholeTranscript)
    # Assigning the global speaker_names here matters: deepCleanTranscript reads it.
    speaker_names = get_file_speaker_names(sector, stock)
    analyst_names = find_analyst_names(speaker_names, transcript_questions)
    # Speakers who are not analysts in this transcript (not used further in this cell).
    not_current_analyst_names = []
    for names in speaker_names:
        if names not in analyst_names:
            not_current_analyst_names.append(names)
    analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)

    # Strip speaker labels and normalise whitespace in each section.
    wholeTranscript = deepCleanTranscript(wholeTranscript)
    transcript_safe_harbour = deepCleanTranscript(transcript_safe_harbour)
    management_sentences = deepCleanTranscript(management_sentences)
    analyst_sentences = deepCleanTranscript(analyst_sentences)

    WHOLE_rolling_frame_of_four_transcripts.append(wholeTranscript)
    PRERELEASE_rolling_frame_of_four_transcripts.append(transcript_safe_harbour)
    MANAGEMENT_SENTENCES_rolling_frame_of_four_transcripts.append(management_sentences)
    ANALYST_SENTENCES_rolling_frame_of_four_transcripts.append(analyst_sentences)

In [410]:
# Extra stop words added on top of ENGLISH_STOP_WORDS; the combined list is
# read as a global by the tf_idf function defined below in this cell.
custom_stop_words = ['thanks', 'thank', 'really', 'said', 'say', 'yes', 'no', 've', 'll', 'don']
all_stop_words = list(ENGLISH_STOP_WORDS) + custom_stop_words

def tf_idf(transcript):
    """Fit a TF-IDF model on a list of documents and print the first one's terms.

    Parameters
    ----------
    transcript : iterable of str
        One string per document (here: one cleaned transcript section each).

    Returns
    -------
    The TF-IDF matrix returned by ``TfidfVectorizer.fit_transform``, with one
    row per document and at most 100 columns (``max_features=100``).

    Notes
    -----
    Uses the module-level ``all_stop_words`` list. ``tf_idf`` is defined a
    second time further down the notebook without the printing; whichever
    cell was executed last is the definition in effect.
    """
    # 1. Removes stop words, 2. computes the tf-idf values used as weights.
    vectoriser = TfidfVectorizer(
        lowercase=True,
        max_features=100,
        ngram_range=(1, 3), # 1 to trigram as they are all common in finance (i.e. earnings per share, free cash flow etc.)
        stop_words=all_stop_words # removes stop words (i.e. irrelevant day to day words)
    )
    tfidf_vec = vectoriser.fit_transform(transcript)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
    # Prefer the replacement and fall back only on installs that predate it.
    if hasattr(vectoriser, "get_feature_names_out"):
        feature_names = vectoriser.get_feature_names_out()
    else:
        feature_names = vectoriser.get_feature_names()

    # Debug print: the non-zero weights of the first document only.
    first_document_weights = tfidf_vec[0].toarray()[0]
    for i, value in enumerate(first_document_weights):
        if value > 0:
            print(f"{feature_names[i]}:{value}")

    return tfidf_vec


In [234]:
# TF-IDF matrix for the management-sentence documents built in the test cell above.
# Note that `value` is also the loop variable name inside the printing tf_idf.
value = tf_idf(MANAGEMENT_SENTENCES_rolling_frame_of_four_transcripts)
print(value)

accounts:0.11057921061693071
affected:0.08846336849354458
ago:0.06634752637015844
apple:0.1988926509041894
base:0.018081150082199036
basis:0.12656805057539325
basis points:0.08846336849354458
believe:0.06634752637015844
better:0.018081150082199036
billion:0.05424345024659711
business:0.10848690049319422
china:0.03616230016439807
company:0.07232460032879615
continue:0.018081150082199036
course:0.10848690049319422
customers:0.10848690049319422
demand:0.10848690049319422
did:0.03616230016439807
different:0.07232460032879615
difficult:0.07232460032879615
environment:0.018081150082199036
exchange:0.15481089486370303
fact:0.05424345024659711
feel:0.05424345024659711
foreign:0.11057921061693071
foreign exchange:0.11057921061693071
fx:0.24585718366210368
going:0.09040575041099518
good:0.16273035073979134
great:0.07232460032879615
gross:0.05424345024659711
growth:0.05424345024659711
guidance:0.05424345024659711
impact:0.04423168424677229
important:0.022115842123386145
installed:0.01808115008219



In [238]:
# linear_kernel returns pairwise dot products between rows. For TF-IDF rows that
# are L2-normalised (TfidfVectorizer's default) these equal cosine similarity.
cos = linear_kernel(value) # finds the cosine similarity matrix
print(cos) # cosine similarity matrix

# Row 0 is the first transcript in the list; columns 1..3 are the other three.
print(cos[0,1]) #(compares the first transcript with the second transcript)
print(cos[0,2]) #(compares the first transcript with the third transcript)
print(cos[0,3]) #(compares the first transcript with the fourth transcript)

[[1.         0.71759754 0.67172592 0.5988458 ]
 [0.71759754 1.         0.71606826 0.59629496]
 [0.67172592 0.71606826 1.         0.67063243]
 [0.5988458  0.59629496 0.67063243 1.        ]]
0.7175975419659856
0.6717259248110721
0.5988458021196891


In [327]:
# 52: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 2nd transcript
# 53: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 3rd transcript
# 54: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 4th transcript
# 55: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 2nd transcript
# 56: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 3rd transcript
# 57: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 4th transcript
# 58: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 2nd transcript
# 59: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 3rd transcript
# 60: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 4th transcript
# 61: Rolling frame of Analyst Sentences (Their Replies) - Cosine of TFIDF between 1st and 2nd transcript
# 62: Rolling frame of Analyst Sentences (Their Replies) - Cosine of TFIDF between 1st and 3rd transcript
# 63: Rolling frame of Analyst Sentences (Their Replies) - Cosine of TFIDF between 1st and 4th transcript


def find_cosineSimilarity(thistranscript):
    """Similarity of the first document to the second, third and fourth.

    ``thistranscript`` is a list of at least four document strings. They are
    vectorised with ``tf_idf`` and compared with ``linear_kernel`` (pairwise
    dot products, which equal cosine similarity for L2-normalised TF-IDF rows).

    Returns a 3-tuple: (first vs second, first vs third, first vs fourth).
    """
    similarity_matrix = linear_kernel(tf_idf(thistranscript))

    # Row 0 holds every comparison that involves the first document.
    first_vs_all = similarity_matrix[0]
    return first_vs_all[1], first_vs_all[2], first_vs_all[3]

def tf_idf(transcript):
    """Return the TF-IDF matrix for a list of document strings.

    Stop words are scikit-learn's ``ENGLISH_STOP_WORDS`` plus a few
    conversational fillers. Terms are lowercased 1- to 3-grams, and only the
    100 highest-ranked features are kept (``max_features=100``).
    """
    filler_words = ['thanks', 'thank', 'really', 'said', 'say', 'yes', 'no', 've', 'll', 'don']
    stop_word_list = [*ENGLISH_STOP_WORDS, *filler_words]

    # 1- to 3-grams because finance phrases such as "earnings per share" or
    # "free cash flow" span several words.
    vectoriser_options = {
        "lowercase": True,
        "max_features": 100,
        "ngram_range": (1, 3),
        "stop_words": stop_word_list,
    }
    return TfidfVectorizer(**vectoriser_options).fit_transform(transcript)
    
def getFeature52to63(sector, stock, rolling_path_of_four_transcript):
    """TF-IDF similarity of the first transcript to the next three.

    Parameters
    ----------
    sector, stock : str
        Used to locate the stock's speaker-names file.
    rolling_path_of_four_transcript : sequence of str
        Transcript CSV paths. The first path is the reference transcript; it
        is compared with the second, third and fourth.

    Returns
    -------
    list
        Twelve values, features 52..63: three similarities (1st vs 2nd, 3rd,
        4th) for each of whole transcript, pre-release section, management
        sentences and analyst sentences, in that order. If fewer than four
        paths are supplied the window is incomplete and ``[None] * 12`` is
        returned, matching the placeholder the driver loop uses.

    Notes
    -----
    ``deepCleanTranscript`` reads the notebook-level global ``speaker_names``,
    not the local loaded here, so that global must describe the same stock.
    """
    # An incomplete window cannot be compared; find_cosineSimilarity would
    # otherwise fail with an IndexError on the missing columns.
    if len(rolling_path_of_four_transcript) < 4:
        return [None] * 12

    # The speaker-names file depends only on (sector, stock), so load it once
    # rather than once per transcript.
    speaker_names = get_file_speaker_names(sector, stock)

    whole_documents = []
    prerelease_documents = []
    management_documents = []
    analyst_documents = []

    for path in rolling_path_of_four_transcript:
        wholeTranscript = get_transcript(path)
        transcript_safe_harbour, transcript_questions = split_transcript(wholeTranscript)
        analyst_names = find_analyst_names(speaker_names, transcript_questions)
        analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)

        # Strip speaker labels and normalise whitespace in each section.
        whole_documents.append(deepCleanTranscript(wholeTranscript))
        prerelease_documents.append(deepCleanTranscript(transcript_safe_harbour))
        management_documents.append(deepCleanTranscript(management_sentences))
        analyst_documents.append(deepCleanTranscript(analyst_sentences))

    # Section order fixes the feature order: 52-54, 55-57, 58-60, 61-63.
    features = []
    for documents in (whole_documents, prerelease_documents, management_documents, analyst_documents):
        features.extend(find_cosineSimilarity(documents))

    return features



In [326]:
# Features 52-63 for the four-transcript window defined in the TF-IDF test cell.
# Relies on `sector`, `stock` and `rolling_path_of_four_transcript` already
# existing in the kernel.
fea_ext_list52to63 = getFeature52to63(sector, stock, rolling_path_of_four_transcript)
fea_ext_list52to63

[0.8676875542272092,
 0.7800564720521082,
 0.8547011379540762,
 0.8290223956651223,
 0.7629212459638359,
 0.8739211452099549,
 0.7175975419659856,
 0.6717259248110721,
 0.5988458021196891,
 0.708066648825725,
 0.6485686211309987,
 0.6129705069443805]

In [333]:
# Collect the stock's transcript files whose name ends in a quarter digit 1-4.
# Sorting in reverse lexical order puts the most recent year/quarter first
# (see the printed windows below).
sector_files = glob.glob('sectors/'+str(sector)+'/'+str(stock)+'/'+str(stock)+'20*[0-9]**[0-9]*[1-4].*')
sector_files.sort(reverse=True)

# Slide a window of four consecutive files over the list. The last three files
# have fewer than three older neighbours, so they are only printed.
for i in range (0, len(sector_files)):
    if i < len(sector_files)-3:
        rolling_path_of_four_transcript = sector_files[i:i+4]
        print(rolling_path_of_four_transcript)
        fea_ext_list52to63 = getFeature52to63(sector, stock, rolling_path_of_four_transcript)
        print(fea_ext_list52to63)
        print("________________________________")
    else:
        print(sector_files[i])

['sectors/tech/AAPL/AAPL20224.csv', 'sectors/tech/AAPL/AAPL20223.csv', 'sectors/tech/AAPL/AAPL20222.csv', 'sectors/tech/AAPL/AAPL20221.csv']
[0.8676875542272092, 0.7800564720521082, 0.8547011379540762, 0.8290223956651223, 0.7629212459638359, 0.8739211452099549, 0.7175975419659856, 0.6717259248110721, 0.5988458021196891, 0.708066648825725, 0.6485686211309987, 0.6129705069443805]
________________________________
['sectors/tech/AAPL/AAPL20223.csv', 'sectors/tech/AAPL/AAPL20222.csv', 'sectors/tech/AAPL/AAPL20221.csv', 'sectors/tech/AAPL/AAPL20214.csv']
[0.7987917172324863, 0.8077172161794282, 0.828151156869659, 0.7271608395589225, 0.7562547602919709, 0.7681629987266106, 0.6991804842084378, 0.5957268639524091, 0.5764435021726461, 0.7551726786331617, 0.7309707693365637, 0.7405165405523042]
________________________________
['sectors/tech/AAPL/AAPL20222.csv', 'sectors/tech/AAPL/AAPL20221.csv', 'sectors/tech/AAPL/AAPL20214.csv', 'sectors/tech/AAPL/AAPL20213.csv']
[0.8905895861256107, 0.82014492

In [411]:
# Single-transcript run: extracts features 5-22, 23-43 and 44-51 for one
# AAPL transcript and prints each list.
sector = "tech"
stock = "AAPL"

path = "sectors/tech/AAPL/AAPL20223.csv"

mytranscript = get_transcript(path)
transcript_safe_harbour, transcript_questions = split_transcript(mytranscript)
# Also sets the global speaker_names that deepCleanTranscript reads.
speaker_names = get_file_speaker_names(sector, stock)
analyst_names = find_analyst_names(speaker_names, transcript_questions)

# Speakers who are not analysts in this transcript (not used further in this cell).
not_current_analyst_names = []

for names in speaker_names:
    if names not in analyst_names:
        not_current_analyst_names.append(names)
        
analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)

# Pre-release section features.
fea_ext_list5to22 = getFeature5to22(transcript_safe_harbour, speaker_names)
print(fea_ext_list5to22)

# Q&A section features.
fea_ext_list23to43 = getFeature23to43(analyst_sentences, management_sentences, speaker_names)
print(fea_ext_list23to43)
# Keyword-sentence sentiment features, computed on the whole transcript.
fea_ext_list44to51 = getFeature44to51(mytranscript, speaker_names)
print(fea_ext_list44to51)




[0.41466305484163, 0.5106382978723404, 0.09219858156028368, 59.33, 0.12225925260119969, 0.4444444444444444, 0.3333333333333333, 51.89, 0.3333331346511841, 0.3333333333333333, 0.0, 59.94, 0.4369547136070192, 0.5193798449612403, 0.07751937984496124, 52.8, 0.06382978723404255, 0.02127659574468085]
[0.0009843106288833922, 0.23107569721115537, 0.23904382470119523, 66.62785714285714, -0.06452229051362901, 0.11904761904761904, 0.20238095238095238, 62.66190476190476, 0.033933738748470464, 0.2874251497005988, 0.25748502994011974, 70.59380952380953, -0.19529345035552978, 0.2, 0.4, -0.1378350095315413, 0.09090909090909091, 0.2727272727272727, 0.06257571218764946, 0.3082191780821918, 0.2465753424657534]
[-0.2020951271057129, 0.5, 0.47033958710156953, 0.9999999403953552, 0.5244149598810408, -0.7425544410943985, -0.9999817609786987, 0]


In [None]:
[0.4637751762950143, 0.5581395348837209, 0.08139534883720931, 61.514, -0.22222865952385795, 0.1111111111111111, 0.3333333333333333, 47.89, 0, 0.0, 0.0, 56.76, 0.507884399120852, 0.5900621118012422, 0.06832298136645963, 60.35, 0.05232558139534884, 0.011627906976744186]
[-0.032131856614416775, 0.07692307692307693, 0.10989010989010989, 83.25190476190475, -0.032131856614416775, 0.07692307692307693, 0.10989010989010989, 83.25190476190475, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0.16023576259613037, 0.0020640691121419272, 0.4050818212104566, 0.9999967217445374, 0.7698050829080435, 6.192922592163086e-05, 0.07372658069317158, 0]

[0.41466305484163, 0.5106382978723404, 0.09219858156028368, 59.33, 0.12225925260119969, 0.4444444444444444, 0.3333333333333333, 51.89, 0.3333331346511841, 0.3333333333333333, 0.0, 59.94, 0.4369547136070192, 0.5193798449612403, 0.07751937984496124, 52.8, 0.06382978723404255, 0.02127659574468085]
[-0.06452229051362901, 0.11904761904761904, 0.20238095238095238, 62.66190476190476, -0.06452229051362901, 0.11904761904761904, 0.20238095238095238, 62.66190476190476, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[-0.2020951271057129, 0.5, 0.47033958710156953, 0.9999999403953552, 0.5244149598810408, -0.7425544410943985, -0.9999817609786987, 0]

In [344]:
# Per-file driver: for every transcript of the stock, extract features 5-63.
sector = "tech"
stock = "AAPL"

# NOTE(review): the final character class here is [0-4], while the earlier
# rolling-window cell uses [1-4] — confirm which one is intended.
sector_files = glob.glob('sectors/'+str(sector)+'/'+str(stock)+'/'+str(stock)+'20*[0-9]**[0-9]*[0-4].*')
sector_files.sort(reverse=True)
for i in range (0, len(sector_files)): # for every .csv path of that stock
    path = sector_files[i]
    mytranscript = get_transcript(path)
    transcript_safe_harbour, transcript_questions = split_transcript(mytranscript)
    # Also sets the global speaker_names that deepCleanTranscript reads.
    speaker_names = get_file_speaker_names(sector, stock)
    analyst_names = find_analyst_names(speaker_names, transcript_questions)

    # Speakers who are not analysts in this transcript (not used further in this cell).
    not_current_analyst_names = []

    for names in speaker_names:
        if names not in analyst_names:
            not_current_analyst_names.append(names)
            
    analyst_sentences, management_sentences = get_analyst_management_sentences(analyst_names, transcript_questions)

    fea_ext_list5to22 = getFeature5to22(transcript_safe_harbour, speaker_names)
    print(fea_ext_list5to22)

    fea_ext_list23to43 = getFeature23to43(analyst_sentences, management_sentences, speaker_names)
    print(fea_ext_list23to43)

    fea_ext_list44to51 = getFeature44to51(mytranscript, speaker_names)
    print(fea_ext_list44to51)

    # The rolling features need this file plus the next three in the sorted
    # list; the last three files get a placeholder of twelve Nones instead.
    if i < len(sector_files)-3:
        rolling_path_of_four_transcript = sector_files[i:i+4]
        fea_ext_list52to63 = getFeature52to63(sector, stock, rolling_path_of_four_transcript)
    else:
        fea_ext_list52to63 = [None]*12
        
    print(fea_ext_list52to63)

    # NOTE(review): this break stops after the first file, so only one
    # transcript is processed — looks like a debugging leftover.
    break


[0.4637751762950143, 0.5581395348837209, 0.08139534883720931, 61.514, -0.22222865952385795, 0.1111111111111111, 0.3333333333333333, 47.89, 0, 0.0, 0.0, 56.76, 0.507884399120852, 0.5900621118012422, 0.06832298136645963, 60.35, 0.05232558139534884, 0.011627906976744186]
[-0.032131856614416775, 0.07692307692307693, 0.10989010989010989, 83.25190476190475, -0.032131856614416775, 0.07692307692307693, 0.10989010989010989, 83.25190476190475, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0.16023576259613037, 0.0020640691121419272, 0.4050818212104566, 0.9999967217445374, 0.7698050829080435, 6.192922592163086e-05, 0.07372658069317158, 0]
[0.8676875542272092, 0.7800564720521082, 0.8547011379540762, 0.8290223956651223, 0.7629212459638359, 0.8739211452099549, 0.7175975419659856, 0.6717259248110721, 0.5988458021196891, 0.708066648825725, 0.6485686211309987, 0.6129705069443805]


In [340]:
[None]*12

[None, None, None, None, None, None, None, None, None, None, None, None]

In [None]:
# List of .csv file info for each stock:
# ------------------------------------------------------------------------------------------------
# META DATA
# ------------------------------------------------------------------------------------------------
#
# 0. Year of transcript release
# 1. Quarter of transcript release
# 2. Date of transcript release
# 3. Earnings Transcript contents
#
# ------------------------------------------------------------------------------------------------
# Feature Extractions:
# ------------------------------------------------------------------------------------------------
#
# 4. EPS surprise value
#
# ------------------------------------------------------------------------------------------------
# Transcript Features:
# ------------------------------------------------------------------------------------------------
#
# Pre release:
# 5. Whole pre-release - net sentiment
# 6. Whole pre-release - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 7. Whole pre-release - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 8. Whole pre-release - net word complexity
#
#
# 9. Specific forward looking statement - sentiment
# 10. Specific forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 11. Specific forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 12. Specific forward looking statement - word complexity
#
# 13. Non Specific Forward looking statement - sentiment 
# 14. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 15. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 16. Non Specific Forward looking statement - word complexity
#
# 17. Not Forward looking statement - sentiment
# 18. Not Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 19. Not Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 20. Not Forward looking statement - word complexity
#
# 21: #of_specific/#of_non_specific+#of_not_fls+#of_specific
# 22: #of_non_specific/#of_non_specific+#of_not_fls+#of_specific
#
# ------------------------------------------------------------------------------------------------
#
# Questions & Answers:
# 23. Whole Q&A - net sentiment
# 24. Whole Q&A – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 25. Whole Q&A – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 26. Whole Q&A - net word complexity
# 
# 27. all question (aggregate) - sentiment
# 28. all question (aggregate) – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 29. all question (aggregate) – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 30. all question (aggregate) - word complex
#
# 31. all reply (aggregate) - sentiment
# 32. all reply (aggregate) – #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 33. all reply (aggregate) – #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
# 34. all reply (aggregate) - word complex
#
# For all replies (aggregate):
# 35. Specific forward looking statement - sentiment
# 36. Specific forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 37. Specific forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
#     Specific forward looking statement - word complexity (unnumbered: the printed 23-43 list holds 21 values, three per reply group)
#
# 38. Non Specific Forward looking statement - sentiment 
# 39. Non Specific Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 40. Non Specific Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
#     Non Specific Forward looking statement - word complexity (unnumbered, see above)
#
# 41. Not Forward looking statement - sentiment
# 42. Not Forward looking statement - #of_positive(sentiment)/#of_negative+positive+neutral(sentiment)
# 43. Not Forward looking statement - #of_negative(sentiment)/#of_negative+positive+neutral(sentiment)
#     Not Forward looking statement - word complexity (unnumbered, see above)
#
# ------------------------------------------------------------------------------------------------
#
# With specific words analysis:
# Sentences that includes the word:
# all of these words can be plural (e.g. cost and costs)
# 44: "margin" - average sentiment
# 45: "cost" - average sentiment
# 46: "revenue" - average sentiment
# 47: "earnings or EBITDA" - average sentiment
# 48: "growth" - average sentiment
# 49: "leverage or debt" -  average sentiment
# 50: "price" – average sentiment
# 51: "operation" - average sentiment 
# ------------------------------------------------------------------------------------------------
#
# 52: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 2nd transcript
# 53: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 3rd transcript
# 54: Rolling frame of Whole Transcripts - Cosine of TFIDF between 1st and 4th transcript
# 55: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 2nd transcript
# 56: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 3rd transcript
# 57: Rolling frame of Pre-releases - Cosine of TFIDF between 1st and 4th transcript
# 58: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 2nd transcript
# 59: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 3rd transcript
# 60: Rolling frame of Management Sentences (Their Replies) - Cosine of TFIDF between 1st and 4th transcript
# 61: Rolling frame of Analyst Sentences (Their Replies) - Cosine of TFIDF between 1st and 2nd transcript
# 62: Rolling frame of Analyst Sentences (Their Replies) - Cosine of TFIDF between 1st and 3rd transcript
# 63: Rolling frame of Analyst Sentences (Their Replies) - Cosine of TFIDF between 1st and 4th transcript
#
# ------------------------------------------------------------------------------------------------
#
# 64: Stock price difference between Day 0 (Day earnings call is released) and Day 10
# 65: Stock price difference between Day 0 (Day earnings call is released) and Day 20
# 66: Stock price difference between Day 0 (Day earnings call is released) and Day 30
# 67: Stock price difference between Day 0 (Day earnings call is released) and Day 40
# 68: Stock price difference between Day 0 (Day earnings call is released) and Day 50
# 69: Stock price difference between Day 0 (Day earnings call is released) and Day 60
# 70: Stock price difference between Day 0 (Day earnings call is released) and Day 70
# 71: Stock price difference between Day 0 (Day earnings call is released) and Day 80
# 72: Stock price difference between Day 0 (Day earnings call is released) and Day 90
#
# 73: Stock price difference between Day 1 (Day earnings call is released) and Day 10
# 74: Stock price difference between Day 1 (Day earnings call is released) and Day 20
# 75: Stock price difference between Day 1 (Day earnings call is released) and Day 30
# 76: Stock price difference between Day 1 (Day earnings call is released) and Day 40
# 77: Stock price difference between Day 1 (Day earnings call is released) and Day 50
# 78: Stock price difference between Day 1 (Day earnings call is released) and Day 60
# 79: Stock price difference between Day 1 (Day earnings call is released) and Day 70
# 80: Stock price difference between Day 1 (Day earnings call is released) and Day 80
# 81: Stock price difference between Day 1 (Day earnings call is released) and Day 90


In [None]:
# TODO: delete operator name

In [None]:
# https://www.spglobal.com/marketintelligence/en/news-insights/blog/analyzing-sentiment-in-quarterly-earnings-calls-q2-2022


# https://www.amenityanalytics.com/case-studies/earnings-call-transcript-analysis




#TF-IDF ----> from management sentences (Replies + pre-release)