In [1]:
from pymongo import MongoClient



def connect_database(db_name, port = 27017, timeout_ms = None):
    """Connect to a MongoDB server on localhost and return a database handle.

    Parameters
    ----------
    db_name : str
        Name of the database to select on the server.
    port : int, optional
        TCP port of the local MongoDB server (default 27017).
    timeout_ms : int or None, optional
        Server-selection timeout in milliseconds, forwarded to ``MongoClient``
        as ``serverSelectionTimeoutMS``. ``None`` (the default) leaves the
        driver's own default in place.

    Returns
    -------
    The ``client[db_name]`` database object when the server answers a ping,
    otherwise ``None``. Failures are reported with ``print`` rather than
    raised, so callers must check for ``None``.
    """

    # URI for database
    URI = f'mongodb://localhost:{port}'

    client = None

    try:

        # Create client (only forward the timeout when the caller set one)
        options = {} if timeout_ms is None else {'serverSelectionTimeoutMS': timeout_ms}
        client = MongoClient(URI, **options)

        # MongoClient() returns without contacting the server, so an
        # unreachable server would otherwise go unnoticed until the first
        # real operation. A cheap 'ping' forces the round trip here.
        client.admin.command('ping')

        # Connect to database
        database = client[db_name]
        print("Successfully connected to database")

    except Exception as e:

        database = None

        # Release the client's resources if it was created before the failure
        if client is not None:
            client.close()

        # Handle database connection error
        print(f"{type(e)}: Failed connecting to {URI}\nError message: {e}")

    return database

In [2]:
from datetime import datetime as dt



def get_datetime():
    """Return the current local date and time as one string.

    Returns
    -------
    str
        ``'MM/DD/YY_HH:MM'`` (two-digit month, day and year, 24-hour clock),
        built from ``dt.today()``.
    """
    # The date part is written out as '%m/%d/%y' instead of the '%D'
    # shorthand: '%D' comes from the platform C library and is not one of
    # Python's documented portable format codes.
    return dt.today().strftime('%m/%d/%y_%H:%M')

In [3]:
import os


# Root folder expected to hold one sub-folder of transcript files per sector
TRANSCRIPTS_DIR = 'transcripts'

# Get non-hidden directories: keep entries whose name has no '.', does not
# contain 'popular', and that really are directories (a stray extension-less
# file would otherwise make the os.listdir() call below fail).
sectors = [directory
           for directory in os.listdir(TRANSCRIPTS_DIR)
           if '.' not in directory
           and 'popular' not in directory
           and os.path.isdir(os.path.join(TRANSCRIPTS_DIR, directory))]


# Show the transcript file names found in each sector folder
for sector in sectors:

    sector_transcripts = os.listdir(f"{TRANSCRIPTS_DIR}/{sector}")
    print(f"{sector.upper()} SECTOR: \n{sector_transcripts}\n")
    

FINANCIAL SECTOR: 
['PWCDF_q3_2020.txt', 'AGGZF_q3_2020.txt', 'KBCSF_q3_2020.txt', 'HMCBF_q3_2020.txt', 'VEL_q3_2020.txt', 'ITCB_q3_2020.txt', 'BANX_q3_2020.txt', 'LSGOF_q2_2021.txt', 'PLMR_q3_2020.txt', 'CWYUF_q3_2020.txt', 'WPTIF_q3_2020.txt', 'CIB_q3_2020.txt', 'ABNRY_q3_2020.txt', 'LWSCF_q3_2020.txt', 'BAM_q3_2020.txt', 'MFC_q3_2020.txt', 'LMB_q3_2020.txt', 'BSRTF_q3_2020.txt', 'MMAC_q3_2020.txt', 'ZURVY_q3_2020.txt', 'GLAD_q4_2020.txt']

TECHNOLOGY SECTOR: 
['RMED_q4_2020.txt', 'OSS_q3_2020.txt', 'CSCO_q1_2021.txt', 'JAMF_q3_2020.txt', 'DLB_q4_2020.txt', 'DGII_q4_2020.txt', 'AMAT_q4_2020.txt', 'SWIR_q3_2020.txt', 'INTZ_q3_2020.txt', 'INPX_q3_2020.txt', 'LMPX_q3_2020.txt']

HEALTHCARE SECTOR: 
['NBY_q3_2020.txt', 'CTEK_q3_2020.txt', 'OPNT_q3_2020.txt', 'BIOL_q3_2020.txt', 'MOTS_q3_2020.txt', 'AWH_q3_2020.txt', 'AGRX_q3_2020.txt', 'RMED_q4_2020.txt', 'NAVB_q3_2020.txt', 'CAPR_q3_2020.txt', 'NARI_q3_2020.txt', 'INMD_q3_2020.txt', 'ONTX_q3_2020.txt', 'HSDT_q3_2020.txt', 'ETON_q3_2020.

In [4]:

def get_earnings_call_dict(verbose=False, sector_names=None, root="transcripts"):
    """Read every transcript file and group the results by sector.

    File names are expected to look like ``<NAME>_<QUARTER>.<ext>``
    (e.g. ``CSCO_q1_2021.txt``): the text before the first underscore becomes
    ``"name"`` and the rest of the stem becomes ``"quarter"``. A file without
    an underscore gets its whole stem as ``"name"`` and an empty ``"quarter"``.
    Hidden entries (names starting with ``'.'``) and sub-directories are
    skipped.

    Parameters
    ----------
    verbose : bool, optional
        When true, print a progress line per sector and per file.
    sector_names : iterable of str or None, optional
        Sub-folder names under ``root`` to read. ``None`` (the default) uses
        the notebook-level ``sectors`` list.
    root : str, optional
        Folder that contains the sector sub-folders (default ``"transcripts"``).

    Returns
    -------
    dict
        Maps each sector name to a list of
        ``{"name": ..., "quarter": ..., "transcript": ...}`` dicts, one per
        file read.
    """
    if sector_names is None:
        # Default to the sector list built earlier in the notebook
        sector_names = sectors

    transcripts_by_sector = {}

    for sector in sector_names:

        dirname = f"{root}/{sector}"
        earnings_calls = []

        if verbose:
            print(f"Writing data for {sector} sector:")

        for transcript in os.listdir(dirname):

            fname = f"{dirname}/{transcript}"

            # Hidden files (e.g. .DS_Store) and nested folders are not transcripts
            if transcript.startswith('.') or not os.path.isfile(fname):
                continue

            # Drop the extension, whatever its length, then split on the first
            # underscore; partition() never returns -1 the way find() does.
            stem, _ext = os.path.splitext(transcript)
            name, _, q = stem.partition('_')

            # NOTE(review): the file is decoded with the platform default
            # encoding, as before - confirm the transcripts' actual encoding.
            with open(fname, 'r') as f:
                text = f.read()

            earnings_calls.append({
                "name": name,
                "quarter": q,
                "transcript": text
            })

            if verbose:
                print(f"${name}, {q} written")

        transcripts_by_sector[sector] = earnings_calls

        if verbose:
            print(f"Finished {sector} data\n")

    print("Finished writing all transcripts")

    return transcripts_by_sector

In [5]:
# Display the sector names collected from the 'transcripts' folder
sectors

['financial', 'technology', 'healthcare']

In [6]:

def save_to_database(data, db, time):
    """Insert one document per sector into the collection named after it.

    Parameters
    ----------
    data : dict
        Maps a sector name to its list of earnings-call objects.
    db : database handle
        Object that yields a collection via ``db[sector]``; each collection
        must provide ``insert_one``.
    time : str
        Value stored under ``'time_added'`` in every inserted document.

    Raises
    ------
    TypeError
        If ``db`` is ``None``, i.e. no database handle was supplied.
    """
    # A None handle would otherwise surface as "'NoneType' object is not
    # subscriptable", or as a false success message when `data` is empty.
    # Compare with `is None`: database handles may not support truth testing.
    if db is None:
        raise TypeError(
            "db is None: a database handle is required "
            "(did the database connection fail?)"
        )

    for sector, earnings_call_objects in data.items():

        # One collection per sector, one document holding all of its calls
        sector_collection = db[sector]
        sector_collection.insert_one({
            'sector': sector,
            'earnings_calls': earnings_call_objects,
            'time_added': time
        })

    print("Successfully saved to database")

In [7]:
# Name of the MongoDB database that receives the transcripts
DATABASE_NAME = 'earnings_call_transcripts'


# Read the transcripts, open the database and timestamp this run
earnings_call_dict = get_earnings_call_dict(verbose=True)
earning_call_db = connect_database(DATABASE_NAME)
datetime = get_datetime()


# connect_database() signals failure by returning None instead of raising,
# so stop here with a clear message rather than failing inside the save step.
if earning_call_db is None:
    raise RuntimeError(
        f"Could not connect to database '{DATABASE_NAME}'; nothing was saved"
    )


save_to_database(
    data = earnings_call_dict,
    db = earning_call_db, 
    time = datetime
)

Writing data for financial sector:
$PWCDF, q3_2020 written
$AGGZF, q3_2020 written
$KBCSF, q3_2020 written
$HMCBF, q3_2020 written
$VEL, q3_2020 written
$ITCB, q3_2020 written
$BANX, q3_2020 written
$LSGOF, q2_2021 written
$PLMR, q3_2020 written
$CWYUF, q3_2020 written
$WPTIF, q3_2020 written
$CIB, q3_2020 written
$ABNRY, q3_2020 written
$LWSCF, q3_2020 written
$BAM, q3_2020 written
$MFC, q3_2020 written
$LMB, q3_2020 written
$BSRTF, q3_2020 written
$MMAC, q3_2020 written
$ZURVY, q3_2020 written
$GLAD, q4_2020 written
Finished financial data

Writing data for technology sector:
$RMED, q4_2020 written
$OSS, q3_2020 written
$CSCO, q1_2021 written
$JAMF, q3_2020 written
$DLB, q4_2020 written
$DGII, q4_2020 written
$AMAT, q4_2020 written
$SWIR, q3_2020 written
$INTZ, q3_2020 written
$INPX, q3_2020 written
$LMPX, q3_2020 written
Finished technology data

Writing data for healthcare sector:
$NBY, q3_2020 written
$CTEK, q3_2020 written
$OPNT, q3_2020 written
$BIOL, q3_2020 written
$MOTS, q3_2