In [16]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import time
import numpy as np
from datetime import datetime
import yfinance as yf
import os
import glob
import regex as re
import csv

In [None]:
# Dividing the transcript into:
# 1. Pre-release & safe harbour
# 2. Q&A
# ----------------------------------------------------------------
# Divide Q&A further into:
# 3. Analysts' Questions
# 4. Management Replies

In [58]:
def get_transcript(path):
    """Load a transcript from a CSV file and return it as a list of cleaned lines.

    The transcript text is taken from the third data row, first column of the
    CSV (the layout this notebook's files are read with).

    Cleaning steps:
      1. drop every character that is not an ASCII letter, digit, one of
         ``. , : ! '``, a newline or a space;
      2. collapse each run of non-newline whitespace into a single space, so
         line breaks survive;
      3. split the text on line breaks.

    Parameters
    ----------
    path : str or path-like
        Location of the transcript CSV file.

    Returns
    -------
    list of str
        One entry per line of the cleaned transcript.

    Raises
    ------
    IndexError
        If the CSV has fewer than three data rows or no columns.
    TypeError
        If the selected cell does not contain text (e.g. it is empty/NaN).
    """
    raw_text = pd.read_csv(path).iloc[2, 0]
    if not isinstance(raw_text, str):
        # re.sub would raise a far less helpful TypeError further down.
        raise TypeError(
            "expected transcript text in row 2, column 0 of {!r}, got {}".format(
                path, type(raw_text).__name__
            )
        )

    allowed_only = re.sub(r'[^A-Za-z0-9.,:!\'\n ]', '', raw_text)
    # Raw string: '\S' in a plain string literal is an invalid escape sequence.
    single_spaced = re.sub(r'[^\S\n]+', ' ', allowed_only)
    return single_spaced.splitlines()

In [56]:
def split_transcript(mytranscript):
    """Split a transcript into the prepared remarks and the Q&A section.

    The split happens at the first line, past the first three, in which the
    operator speaks ("operator:") and mentions "question", "go ahead" or
    "operator instructions". Matching is done on a lower-cased copy of the
    line whose punctuation (other than ":") has been replaced by spaces.

    Parameters
    ----------
    mytranscript : list of str
        Transcript lines, e.g. as returned by ``get_transcript``.

    Returns
    -------
    tuple
        ``(lines_before_split, lines_from_split_onwards)``. When no line
        qualifies, a pair of empty strings ``("", "")`` is returned instead.
    """
    handover_cues = ("question", "go ahead", "operator instructions")

    for position, raw_line in enumerate(mytranscript):
        # Punctuation becomes a space so that e.g. "[Operator Instructions]" still matches.
        normalised = re.sub(r'[^\w\s:]', ' ', raw_line.lower())

        # The first three lines can never be the split point.
        if position <= 2 or "operator:" not in normalised:
            continue

        if any(cue in normalised for cue in handover_cues):
            return mytranscript[:position], mytranscript[position:]

    return "", ""

In [111]:
def get_file_speaker_names(sector, stock):
    """Read the speaker names recorded for a stock.

    The names are read from ``sectors/<sector>/<stock>/speaker names.csv``
    (relative to the current working directory), where they are separated by
    a comma followed by a space.

    The file is parsed by hand rather than with ``np.loadtxt`` because
    NumPy >= 1.23 only accepts single-character delimiters, so the
    two-character ``', '`` separator is rejected there.

    Parameters
    ----------
    sector : str
        Sector directory name.
    stock : str
        Stock directory name.

    Returns
    -------
    numpy.ndarray
        1-D array of ``str``, always 1-D, even for a file holding a single
        name. Entries are returned exactly as written in the file; surrounding
        whitespace other than the separator itself is NOT trimmed.

    Raises
    ------
    OSError
        If the names file cannot be opened.
    """
    names_path = "sectors/" + sector + "/" + stock + "/" + "speaker names.csv"

    names = []
    # latin-1 maps every byte to a character, so decoding can never fail.
    # NOTE(review): switch to utf-8 if the names files are known to be UTF-8.
    with open(names_path, encoding="latin-1") as names_file:
        for line in names_file:
            line = line.strip("\r\n")
            if line.strip():  # ignore blank lines
                names.extend(line.split(", "))

    return np.array(names, dtype=str)


In [73]:
# PROBLEM: In the text below, the operator says "Michael Rollinswtih" (sic), but the speaker name is registered as "Mike Rollins".
# Operator: Okay, thank you sir. And that will come from Michael Rollinswtih with Citi. Please go ahead.
# Mike Rollins: Thanks for taking the questions. Two, if I could. First, as we think about your move to consumption-based pricing and we also think about some of the changes in the sizes of the buckets for data, can you give us a sense of what's happening as you increase the data buckets, you try to encourage customers to spend more on data? Are you getting more in data revenue relative to those that might have started at a higher plan and can use the same amount, but moved to a cheaper bucket? Then the second question is if you just had an update on the regulatory process with the DIRECTV deal. I think the FCC put out an update on their clock today, and I am just curious if you could talk about the implications of that and any update from your perspective. Thanks

# Solution: treat the speaker as mentioned when either the first or the last
# part of the registered name occurs anywhere in the operator's text.
mytext = "Michael Rollinswtih"
mystr = "Mike Rollins"
mylist = mystr.split()
if any(name_part in mytext for name_part in (mylist[0], mylist[-1])):
    print(mystr)

Mike Rollins


In [112]:
# finds a list of analyst names for a single .csv file
def find_analyst_names(speaker_names, transcript_questions):
    """Return the speakers that the operator introduces during the Q&A section.

    Only lines spoken by the operator (containing "operator:") are scanned,
    and the last two entries of ``transcript_questions`` are not scanned.
    Each scanned line is lower-cased and its punctuation (other than ":")
    replaced by spaces before matching.

    A speaker is reported as an analyst when either

    * any whitespace-separated part of the name that is longer than two
      characters occurs as a substring of an operator line (substring rather
      than whole-word matching, so a garbled mention such as "Rollinswtih"
      still matches "Rollins"; the length filter keeps initials such as the
      "A" in "A Gayn Erickson" from matching everything), or
    * the name contains the word "unidentified" (e.g. "Unidentified
      Analyst") and at least one operator line was scanned.

    The operator's own entry is never reported, regardless of letter case or
    surrounding whitespace.

    Parameters
    ----------
    speaker_names : iterable of str
        All known speaker names for the stock.
    transcript_questions : sequence of str
        Lines of the Q&A section (an empty string is accepted and yields an
        empty result).

    Returns
    -------
    list
        Matching entries of ``speaker_names``, without duplicates, in the
        order in which they were first matched (deterministic across runs).
    """
    operator_bubbles = []
    for raw_bubble in transcript_questions[:-2]:
        bubble = re.sub(r'[^\w\s:]', ' ', raw_bubble.lower())
        if "operator:" in bubble:
            operator_bubbles.append(bubble)

    if not operator_bubbles:
        return []

    # Tokenise every candidate once instead of once per operator line.
    candidates = []
    for name in speaker_names:
        # strip() so that a padded entry like "Operator " is excluded as well.
        if name.strip().lower() == "operator":
            continue
        words = name.lower().split()
        name_parts = [word for word in words if len(word) > 2]
        candidates.append((name, name_parts, "unidentified" in words))

    # A dict keeps insertion order, giving de-duplication with a stable order.
    analysts = {}
    for bubble in operator_bubbles:
        for name, name_parts, is_unidentified in candidates:
            if is_unidentified or any(part in bubble for part in name_parts):
                analysts.setdefault(name, None)

    return list(analysts)

In [45]:
# Inspect the Q&A half of the transcript (the second value returned by split_transcript).
# NOTE: `transcript_questions` is only assigned by cells further down, so one of them must have run in this kernel first.
transcript_questions

['Operator: Okay. Go ahead. And you can ask your question now.',
 'Jack Vander: Okay. Great. Jack Vander here speaking from analysts from Maxim Group. How are you doing today Appreciate the update. I will throw the question for Brett. Actually, a quick housekeeping question. Just a quick one. What do you guys expect to release the full financial statements in the 10Q',
 'Brett Moyer: Probably very soon within an hour, I guess. We might have to wait. We might be required to wait till close the market. Today regardless .',
 'Jack Vander: Yes. No worries there. Appreciate it there. Okay. Then Brent, maybe as for the revenue guide, it looks like we are now expecting sequential growth in the third quarter and then sequential growth in the fourth quarter. Just any additional color you can provide there on the topline and then maybe any additional thoughts an overall revenue potential for 2023',
 'Brett Moyer: Okay. So what we see and I think probably you have seen it reported out in other, s

In [126]:
# Classify the speakers of one transcript as analysts vs. everyone else.
sector = "semiconductors"
stock = "AEHR"
# File stem of the transcript under inspection. The path and both headers below
# are derived from it (and from sector/stock) so they cannot drift apart.
# NOTE(review): the trailing digits presumably encode year + quarter - confirm.
transcript_label = stock + "20231"
path = "sectors/" + sector + "/" + stock + "/" + transcript_label + ".csv"

mytranscript = get_transcript(path)
transcript_safe_harbour, transcript_questions = split_transcript(mytranscript)
speaker_names = get_file_speaker_names(sector, stock)
analyst_names = find_analyst_names(speaker_names, transcript_questions)
# The escape codes switch on italic + bold + underline for the header, then reset.
print("\033[3m\033[1m\033[4mAnalyst Speakers for " + transcript_label + ":\033[0m")
print(analyst_names)

# the rest of the names on the speaker_list are either management, or past/future analysts
# (analysts that are not present in the selected .csv file)
not_current_analyst_names = [name for name in speaker_names if name not in analyst_names]

print("\n")
print("\033[3m\033[1m\033[4mNot Current Analyst Speakers for " + transcript_label + ":\033[0m")
print(not_current_analyst_names)


[3m[1m[4mAnalyst Speakers for AEHR20231:[0m
['Matt Winthrop ', 'Bradford Ferguson', 'Christian Schwab ', 'Larry Chlebina ', 'Gregory Ratliff', 'Gregory Wilbur', 'Unidentified Analyst', 'Jeffrey Scott with Scott Asset Management', 'Matt Winthrop', 'Christian Schwab', 'Larry Chlebina', 'Matthew Winthrop']


[3m[1m[4mNot Current Analyst Speakers for AEHR20231:[0m
['Company Representatives', 'Nehal Chokshi', 'Jeffrey Scott', 'John Fichthorn', 'Ken Spink', 'Orin Hirschman', 'Operator', 'Marilynn Meek', 'Lasse Glassen', 'Kenneth Spink', 'Joe Calabrese', 'Mark Gomes', 'Charlie Doe', 'John Barton', 'Ben Rabizadeh', 'A Gayn Erickson', 'Kevin Dede', 'Dominik Schmidt', 'Jim Byers ', 'Gayn Erickson ', 'Gary Larson', 'Gary L. Larson', 'Tom Diffely', 'Geoffrey Scott', 'Frank Barresi', 'Rhea Posedel', 'Lasse Larson', 'Dylan Patel', 'John Nelson', 'Mike Dooling', 'Jon Gruber ', 'Rhea J. Posedel', 'Jon Gruber', 'Geoff Scott', 'Jeffery Scott', 'William Smart', 'Todd Kehrli', 'Scott Eckstein', 'T

['Matt Winthrop ',
 'Bradford Ferguson',
 'Christian Schwab ',
 'Larry Chlebina ',
 'Gregory Ratliff',
 'Gregory Wilbur',
 'Unidentified Analyst',
 'Jeffrey Scott with Scott Asset Management',
 'Matt Winthrop',
 'Christian Schwab',
 'Larry Chlebina',
 'Matthew Winthrop']

In [None]:
sector = "semiconductors"
stock = "AEHR"

# Gather this stock's transcript files and order them by path, descending.
sector_files = sorted(
    glob.glob('/'.join(['sectors', str(sector), str(stock), str(stock) + '20*[1-9]**[1-9]*[1-4].*'])),
    reverse=True,
)

for path in sector_files:  # for every .csv path of that stock
    mytranscript = get_transcript(path)
    transcript_safe_harbour, transcript_questions = split_transcript(mytranscript)
    speaker_names = get_file_speaker_names(sector, stock)
    analyst_names = find_analyst_names(speaker_names, transcript_questions)
    # NOTE(review): this stops after the first path, so only one file is ever
    # processed - confirm whether the break is a debugging leftover.
    break



In [9]:
# Debug scratch: applies the same per-line normalisation used inside find_analyst_names
# (lower-case, punctuation other than ":" replaced by spaces) to a single Q&A entry.
# NOTE: relies on `transcript_questions` and `index` already existing in the kernel;
# neither is defined in this cell.
speech_bubble = transcript_questions[index].lower()
speech_bubble = re.sub(r'[^\w\s:]', ' ', speech_bubble)
speech_bubble

NameError: name 'transcript_questions' is not defined

In [8]:
# Inspect the Q&A lines; a cell that assigns `transcript_questions` must have run in this kernel first.
transcript_questions

NameError: name 'transcript_questions' is not defined

In [None]:
# TODO: delete operator name