In [1]:
import re
import copy
import pickle
import requests
import pandas as pd
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
BASE_URL = 'https://www.fool.com'
EARNINGS_BASE_URL = 'https://www.fool.com/earnings-call-transcripts/?page='

In [3]:
# Return source code for every page in the range specified
def get_pages(pages):
    pbar = tqdm(total=pages, ncols=1000, bar_format='{l_bar}{bar}{n_fmt}/{total_fmt}')
    page_list = []
    for page_ in range(1, pages+1):
        pbar.update(1)
        pbar.set_description("Processing page %s" % page_)
        transcripts_response = requests.get(f'{EARNINGS_BASE_URL}{page_}')
        if transcripts_response.status_code==403:
            print(403)
            break
        transcripts_response_bs = BeautifulSoup(transcripts_response.content, 'html.parser')
        transcripts_ = transcripts_response_bs.find("div", {"class":"list-content", "id":f"page-{page_}"})
        page_list.append(transcripts_)
    return page_list

In [4]:
pages = get_pages(30)
print(f'{len(pages)} pages of transcripts')

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=30.0), HTML(value='')), layout=Layout(dis…

30 pages of transcripts


In [5]:
# Returnd ticker
def get_ticker_title(heading_title):
    matched_ticker = re.findall('\(([A-Z]*)\)', heading_title)
    if len(matched_ticker) != 1:
        return None
    return matched_ticker[0]

In [6]:
def get_company_name(heading_title, ticker_):
    matched_company = re.findall(f'(.+)\s\({ticker_}', heading_title)
    if matched_company:
        return matched_company[0]
    else:
        return None

In [7]:
def get_fiscal_period(heading_):
    fiscal_period = re.findall('Q\d\s20\d{2}', heading_)
    if fiscal_period:
        return fiscal_period[0]
    else:
        return None

In [8]:
def get_period_ending(heading_):
    try:
        period_ = heading_.article.find("div", {'class':'text'}).p.get_text()
        period_ending = re.findall('period ending (.*)?', period_)
    except:
        period_ending = None
    return period_ending[0]

In [9]:
def get_transcript_date(heading_):
    try:
        footer_ = heading_.article.find("div", {'class':'text'}).div.get_text()
        transcript_date = footer_[footer_.index("|")+2:]
    except:
        transcript_date = None
    return transcript_date

In [10]:
def get_transcript_list(pages_source):
    pbar_page = tqdm(total=len(pages_source), ncols=1000, bar_format='{l_bar}{bar}{n_fmt}/{total_fmt}')
    transcript_list = []
    for page_num, page_source in enumerate(pages_source, 1):
        pbar_page.update(1)
        pbar_page.set_description("Processing page %s" % page_num)
        for heading_num, heading_ in enumerate(page_source.find_all('a')):
            heading_title = heading_["title"]
            ticker_ = get_ticker_title(heading_title)
            company_name = get_company_name(heading_title, ticker_)
            fiscal_period = get_fiscal_period(heading_title)
            period_ending = get_period_ending(heading_)
            transcript_date_ = get_transcript_date(heading_)
            heading_info = [ticker_, company_name, fiscal_period, period_ending, transcript_date_, heading_["href"]]
            transcript_list.append(heading_info)
    return transcript_list

In [12]:
transcript_list = get_transcript_list(pages)
print(f'{len(transcript_list)} transcripts')
transcript_list[0]

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=30.0), HTML(value='')), layout=Layout(dis…

600 transcripts


['NG',
 'Novagold Resources Inc',
 'Q2 2020',
 'May 31, 2020.',
 'Jun 25, 2020',
 '/earnings/call-transcripts/2020/06/25/novagold-resources-inc-ng-q2-2020-earnings-call-tr.aspx']

In [13]:
def get_transcript_sources(list_):
    pbar = tqdm(total=len(list_), ncols=1000, bar_format='{l_bar}{bar}{n_fmt}/{total_fmt}')
    transcripts_ = []
    for ticker_, transcript_link in list_:
        pbar.update(1)
        pbar.set_description("Processing company %s" % ticker_)
        transcript_response = requests.get(f'{BASE_URL}{transcript_link}')
        if transcript_response.status_code==403:
            print("403")
            break
        transcript_response_bs = BeautifulSoup(transcript_response.content, 'html.parser')
        article_ = transcript_response_bs.find("span", {"class":"article-content"})
        transcripts_.append(article_)
    return transcripts_

In [15]:
transcripts = get_transcript_sources([[transcript_[0], transcript_[5]] for transcript_ in transcript_list])
print(f'{len(transcripts)} transcripts processed')

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=600.0), HTML(value='')), layout=Layout(di…

600 transcripts processed


In [16]:
def get_announcement_time(transcript_list, transcripts):
    upload_times = []
    for transcript_info, transcript in zip(transcript_list, transcripts):
        all_text = transcript.get_text()
        upload_date = transcript_info[4]
        try:
            upload_time = re.findall(f'{upload_date}' + ',\s?([0-9]{1,2}:[0-9]{2}\s[a|p]\.m\.)', all_text)[0]
        except:
            upload_time = None
        upload_times.append(upload_time)
    return upload_times

In [17]:
def get_corpus_text(transcripts_):
    corpuses = []
    for ticker_, transcript_ in transcripts_:
        first_speaker = transcript_.find(string=' -- ')
        if not first_speaker:
            print("No first speaker", ticker_)
            corpuses.append(None)
            continue
        all_para = first_speaker.find_parent().find_next_siblings('p')
        complete_transcript = ' '.join([para.get_text() for para in all_para])
        try:
            complete_transcript = complete_transcript[:complete_transcript.index(" Operator ")]
            corpuses.append(complete_transcript)
        except ValueError:
            try:
                complete_transcript = complete_transcript[:complete_transcript.lower().index("[operator instructions] ")]
                corpuses.append(complete_transcript)
            except ValueError:
                print(f"Check {ticker_}")
                corpuses.append(complete_transcript)
    return corpuses

In [18]:
announcement_times = get_announcement_time(transcript_list, transcripts)
announcement_times[0]

'11:00 a.m.'

In [28]:
def get_transcript(transcript_list, transcripts):
    pbar = tqdm(total=len(transcript_list), ncols=1000, bar_format='{l_bar}{bar}{n_fmt}/{total_fmt}')
    corpuses = []
    for transcript_info, transcript in zip(transcript_list, transcripts):
        pbar.update(1)
        pbar.set_description("Extracting transcript %s" % transcript_info[0])
        first_speaker = transcript.find(string=' -- ')
        if not first_speaker:
            corpuses.append(None)
            continue
        entire_corpus = first_speaker.find_parent().find_next_siblings()
        corpuses.append(entire_corpus)
    return corpuses

In [30]:
def get_corpus(corpuses):
    pbar = tqdm(total=len(corpuses), ncols=1000, bar_format='{l_bar}{bar}{n_fmt}/{total_fmt}')
    pre_qas = []
    for ind, corpus in enumerate(corpuses):
        pbar.update(1)
        pbar.set_description("Extracting prepared remarks #%s" % ind)
        if corpus:
            cleaned_corpus = " ".join([para.get_text() for para in corpus]).replace("\n", " ").replace("&", "and")
            try:
                cleaned_corpus_pre_qa = cleaned_corpus[:cleaned_corpus.index("Questions and Answers")].strip()
            except:
                cleaned_corpus_pre_qa = None
            pre_qas.append(cleaned_corpus_pre_qa)
        else:
            pre_qas.append(None)
    return pre_qas

In [32]:
def get_qa(corpuses):
    pbar = tqdm(total=len(corpuses), ncols=1000, bar_format='{l_bar}{bar}{n_fmt}/{total_fmt}')
    post_qas = []
    for ind, corpus in enumerate(corpuses):
        pbar.update(1)
        pbar.set_description("Extracting Q&A #%s" % ind)
        if corpus:
            qa_reg = re.compile(".*Questions (and|&) Answers.*")
            try:
                index_qa = [ind for ind, para in enumerate(corpus) if qa_reg.search(para.get_text())][0]
                corpus_post_qa = corpus[index_qa:]
                cleaned_corpus_post_qa = [para.get_text() for para in corpus_post_qa]
                cleaned_corpus_post_qa = remove_analyst_comments(cleaned_corpus_post_qa)
                cleaned_corpus_post_qa = remove_operator_comments(cleaned_corpus_post_qa)
                cleaned_corpus_post_qa = remove_misc(cleaned_corpus_post_qa)
                key_people, cleaned_corpus_post_qa = get_key_people(cleaned_corpus_post_qa)
            except:
                cleaned_corpus_post_qa = None
        else:
            cleaned_corpus_post_qa = None
        post_qas.append([key_people, cleaned_corpus_post_qa])
    return post_qas

In [33]:
def remove_analyst_comments(cleaned_corpus_post_qa):
    analyst_present = re.compile(".*--\sAnalyst")
    cleaned_corpus_check_analyst = [analyst_present.search(para) for para in cleaned_corpus_post_qa]
    tracker_ = []
    skip_ = False
    for ind_, analyst_check in enumerate(cleaned_corpus_check_analyst):
        if skip_:
            tracker_.append(True)
            skip_ = False
            continue
        else:
            if analyst_check:
                tracker_.append(True)
                skip_ = True
            else:
                tracker_.append(False)
    corpus_no_analyst = [line_ for check_, line_ in zip(tracker_, cleaned_corpus_post_qa) if not check_]
    return corpus_no_analyst

In [34]:
def remove_operator_comments(cleaned_corpus_post_qa):
    operator_present = re.compile("Operator")
    cleaned_corpus_check_operator = [operator_present.search(para) for para in cleaned_corpus_post_qa]
    tracker_ = []
    skip_ = False
    for ind_, operator_check in enumerate(cleaned_corpus_check_operator):
        if skip_:
            tracker_.append(True)
            skip_ = False
            continue
        else:
            if operator_check:
                tracker_.append(True)
                skip_ = True
            else:
                tracker_.append(False)
    corpus_no_operator = [line_ for check_, line_ in zip(tracker_, cleaned_corpus_post_qa) if not check_]
    return corpus_no_operator

In [35]:
def remove_misc(cleaned_corpus_post_qa):
    duration_compile = re.compile(".*[D|d]uration:.*minute.*")
    call_participants_compile = re.compile(".*[C|c]all participants.*")
    more_ticker_compile = re.compile("More\s?[A-Z]{1,4}\s?analysis")
    all_earnings_compile = re.compile(".*[A|a]ll earnings call transcripts?.*")
    new_lines_compile = re.compile("(\n)+")
    tracker_ = []
    for line_ in cleaned_corpus_post_qa:
        duration_check = duration_compile.search(line_)
        call_participants_check = call_participants_compile.search(line_)
        more_ticker_check = more_ticker_compile.search(line_)
        all_earnings_check = all_earnings_compile.search(line_)
        new_lines_check = new_lines_compile.search(line_)
        if line_=="":
            tracker_.append(True)
        elif line_.lower()=='questions and answers:' or line_.lower()=='questions & answers:':
            tracker_.append(True)
        elif duration_check:
            tracker_.append(True)
        elif call_participants_check:
            tracker_.append(True)
        elif more_ticker_check:
            tracker_.append(True)
        elif all_earnings_check:
            tracker_.append(True)
        elif new_lines_check:
            tracker_.append(True)
        else:
            tracker_.append(False)
    corpus_no_misc = [line_ for check_, line_ in zip(tracker_, cleaned_corpus_post_qa) if not check_]
    return corpus_no_misc

In [36]:
def get_key_people(cleaned_corpus_post_qa):
    person_title = re.compile("\s--\s[A-Z]")
    key_people = []
    tracker_ = []
    for line_ in cleaned_corpus_post_qa:
        person_title_check = person_title.search(line_)
        if person_title_check and len(line_)<100:
            key_people.append(line_)
            tracker_.append(True)
        else:
            tracker_.append(False)
    key_people = list(set(key_people))
    corpus_key_people = [line_ for check_, line_ in zip(tracker_, cleaned_corpus_post_qa) if not check_]
    return key_people, corpus_key_people

In [29]:
corpuses = get_transcript(transcript_list, transcripts)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=600.0), HTML(value='')), layout=Layout(di…

In [31]:
pre_qa = get_corpus(corpuses)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=600.0), HTML(value='')), layout=Layout(di…

In [37]:
people_post_qas = get_qa(corpuses)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=600.0), HTML(value='')), layout=Layout(di…

In [39]:
# Combining all the information
final_list = copy.deepcopy(transcript_list)
_ = [final_list[ind].append(post_key_people) for ind, (post_key_people, post_) in enumerate(people_post_qas)]
_ = [final_list[ind].append(pre_) for ind, pre_ in enumerate(pre_qa)]
_ = [final_list[ind].append(post_) for ind, (post_key_people, post_) in enumerate(people_post_qas)]

In [44]:
#final_df = pd.DataFrame(final_list, columns=['Ticker', 'Company', 'Fiscal_Date', 'Period_Ending', 'Transcript_Date', 'Transcript_Link',
#                                             'Key_People', 'Prepared_Remarks', 'QandA'])
display(final_df)
final_df.info()

Unnamed: 0,Ticker,Company,Fiscal_Date,Period_Ending,Transcript_Date,Transcript_Link,Key_People,Prepared_Remarks,QandA
0,NG,Novagold Resources Inc,Q2 2020,"May 31, 2020.","Jun 25, 2020",/earnings/call-transcripts/2020/06/25/novagold...,[Gregory A. Lang -- President & Chief Executiv...,"Thank you, Gillie, and good morning everyone. ...","[So, when you take into account proven and pro..."
1,SNX,SYNNEX Corporation,Q2 2020,"May 31, 2020.","Jun 25, 2020",/earnings/call-transcripts/2020/06/25/synnex-c...,"[Marshall Witt -- Chief Financial Officer, Den...","Thank you, Chantal, and good afternoon everyon...","[Hi Vince. Thanks for the question. Yeah, for ..."
2,PRGS,Progress Software,Q2 2020,"March 31, 2020.","Jun 25, 2020",/earnings/call-transcripts/2020/06/25/progress...,[Brian Flanagan -- Vice President of Investor ...,"Thank you, Nadia. Good afternoon, everyone, an...","[Thanks, Steve. I hope you're well, too., So S..."
3,NKE,Nike Inc,Q4 2020,"May 31, 2020.","Jun 25, 2020",/earnings/call-transcripts/2020/06/25/nike-inc...,"[Andy Muir -- Vice President, Investor Relatio...","Thank you, operator. Hello, everyone, and than...","[Sure, Alexandra. In a funny way, I would char..."
4,CAMP,CalAmp Corp,Q1 2021,"May 31, 2020.","Jun 25, 2020",/earnings/call-transcripts/2020/06/25/calamp-c...,[Kurt Binder -- Executive Vice President and C...,Good afternoon and welcome to CalAmp's fiscal ...,"[Thanks, Mike., Yeah, Mike. So, as you pointed..."
...,...,...,...,...,...,...,...,...,...
595,TRTX,"TPG RE Finance Trust, Inc.",Q1 2020,"March 31, 2020.","May 12, 2020",/earnings/call-transcripts/2020/05/12/tpg-re-f...,[Bob Foley -- Chief Financial and Risk Officer...,"Thank you, Deborah. Good morning, everyone. I ...","[19, the multifamily in Houston, if I have the..."
596,CELH,Celsius Holdings Inc.,Q1 2020,"March 31, 2020.","May 12, 2020",/earnings/call-transcripts/2020/05/12/celsius-...,[Bob Foley -- Chief Financial and Risk Officer...,,
597,AEE,Ameren Corp,Q1 2020,"March 31, 2020.","May 12, 2020",/earnings/call-transcripts/2020/05/12/ameren-c...,"[Warner L. Baxter -- Chairman, President and C...","Thank you, and good morning. On the call with ...","[Yeah. Thank you, Julien. Hope you're doing we..."
598,IR,Ingersoll-Rand PLC,Q1 2020,"March 31, 2020.","May 12, 2020",/earnings/call-transcripts/2020/05/12/ingersol...,[Emily Weaver -- Senior Vice President and Chi...,Thank you and welcome to the Ingersoll-Rand 20...,"[Good morning, Andy. Good, and you?, Yeah, And..."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Ticker            597 non-null    object
 1   Company           597 non-null    object
 2   Fiscal_Date       600 non-null    object
 3   Period_Ending     600 non-null    object
 4   Transcript_Date   600 non-null    object
 5   Transcript_Link   600 non-null    object
 6   Key_People        600 non-null    object
 7   Prepared_Remarks  583 non-null    object
 8   QandA             583 non-null    object
dtypes: object(9)
memory usage: 42.3+ KB
