In [1]:
import json
import os
import re
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup

In [7]:
# --- Configuration -----------------------------------------------------------
symbol = 'goog'  # ticker whose transcripts are collected
start_unix = 1415866606 # the unix format of start date time, i.e. Thu Nov 13 2014 08:16:46 GMT+0000
today_unix = int(datetime.now().timestamp())  # upper bound of the search window (now)
size = 20 # number of item returned each time

# The API key is a secret and must not live in the notebook: read it from the
# RAPIDAPI_KEY environment variable instead. Any key that was previously
# committed in this cell should be treated as leaked and rotated.
rapid_api_key = os.environ.get("RAPIDAPI_KEY", "")
if not rapid_api_key:
    print("RAPIDAPI_KEY is not set; requests to the Seeking Alpha API will be rejected.")

list_url = "https://seeking-alpha.p.rapidapi.com/transcripts/v2/list"
transcript_url = "https://seeking-alpha.p.rapidapi.com/transcripts/v2/get-details"

In [8]:
def request_transcript_list(symbol, size, end_unix=None, timeout=30):
    """Fetch one page of the transcript listing for ``symbol``.

    Args:
        symbol: Ticker sent as the ``id`` query parameter.
        size: Number of items requested for the page.
        end_unix: Unix timestamp sent as ``until``. When ``None`` the
            parameter is left out of the query string (requests drops
            ``None``-valued params).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook forever.

    Returns:
        The decoded JSON body of the response. Callers in this notebook read
        the list stored under its ``'data'`` key.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status (bad key,
            rate limit, ...), instead of letting an error payload fail later
            with an unrelated ``KeyError``.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # "number" is presumably the page number of the listing -- TODO confirm
    # against the API documentation.
    querystring = {"id": symbol, "until": end_unix, "size": size, "number": "1"}
    headers = {
        "x-rapidapi-key": rapid_api_key,
        "x-rapidapi-host": "seeking-alpha.p.rapidapi.com",
    }
    response = requests.get(list_url, headers=headers, params=querystring, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [9]:
def get_time_id_title(response, start_unix=start_unix, type='transcript', transcripts=None):
    """Collect id -> title for the matching items of one listing page.

    Args:
        response: Decoded listing response; ``response['data']`` is iterated,
            and each item is read for ``'id'``, ``'type'`` and
            ``'attributes'`` (``'publishOn'`` ISO timestamp and ``'title'``).
        start_unix: Items published before this unix time are ignored.
        type: Only items whose ``'type'`` equals this value are kept.
        transcripts: Mapping to add results to. A fresh dict is created when
            omitted, so results never leak between unrelated calls through a
            shared default object.

    Returns:
        A tuple ``(end_unix, transcripts)``. ``end_unix`` is the publish time
        of the last item on the page as a unix timestamp, or ``None`` when the
        page contains no items. ``transcripts`` is the (mutated) mapping.
    """
    if transcripts is None:
        transcripts = {}

    items = response['data']
    if not items:
        # Nothing on this page: there is no "last item" to take a time from.
        return None, transcripts

    for item in items:
        attributes = item['attributes']
        published_unix = int(datetime.fromisoformat(attributes['publishOn']).timestamp())
        if item['type'] == type and published_unix >= start_unix:
            transcripts[item['id']] = attributes['title']

    # Publish time of the page's final item; the caller uses it as the next
    # "until" bound.
    last_published = datetime.fromisoformat(items[-1]['attributes']['publishOn'])
    end_unix = int(last_published.timestamp())
    return end_unix, transcripts

In [10]:
def get_past_10_year_transcripts_id(symbol, size, today_unix, start_unix=start_unix):
    """Page backwards through the transcript listing and gather id -> title.

    Starting at ``today_unix``, a page is requested, its matching items are
    collected, and the publish time of the page's last item becomes the next
    ``until`` bound. Paging stops once that bound falls before ``start_unix``
    or the listing is exhausted.

    Args:
        symbol: Ticker to query.
        size: Page size requested from the API.
        today_unix: Unix timestamp used as the first ``until`` bound.
        start_unix: Oldest publish time (unix) to keep. It is forwarded to the
            per-page filter so the value passed here is the one actually used.

    Returns:
        Dict mapping transcript id to title.
    """
    end_unix = today_unix
    transcripts = {}
    while end_unix >= start_unix:
        response = request_transcript_list(symbol, size, end_unix=end_unix)
        page = response['data']
        if not page:
            # Empty page: nothing left to collect and no new bound to read.
            break

        previous_end_unix = end_unix
        end_unix, transcripts = get_time_id_title(
            response,
            start_unix=start_unix,
            type='transcript',
            transcripts=transcripts,
        )

        if len(page) < size: # all transcripts have been retrieved, i.e. the company went public after Nov 13 2014
            break
        if end_unix >= previous_end_unix:
            # The bound did not move back, so the next request would return
            # the same page again; stop instead of looping forever.
            break
    return transcripts

In [11]:
# Gather the id -> title mapping of every transcript published since `start_unix`.
transcripts = get_past_10_year_transcripts_id(
    symbol=symbol,
    size=size,
    today_unix=today_unix,
    start_unix=start_unix,
)

In [12]:
# Display the collected transcript id -> title mapping for inspection.
transcripts

{'4730692': 'Alphabet Inc. (GOOG) Q3 2024 Earnings Call Transcript',
 '4720309': 'Alphabet Inc. (GOOG) Goldman Sachs 2024 Communacopia and Technology Conference (Transcript)',
 '4705978': 'Alphabet Inc. (GOOG) Q2 2024 Earnings Call Transcript',
 '4686483': 'Alphabet Inc. (GOOG) Q1 2024 Earnings Call Transcript',
 '4666190': 'Alphabet Inc. (GOOG) Q4 2023 Earnings Call Transcript',
 '4656174': 'Alphabet Inc (GOOG) Presents at Scotiabank Inaugural Global Technology Conference (Transcript)',
 '4643107': 'Alphabet Inc. (GOOG) Q3 2023 Earnings Call Transcript',
 '4633812': 'Alphabet Inc. (GOOG) Presents at Goldman Sachs Communacopia & Technology Conference (Transcript)',
 '4619735': 'Alphabet Inc. (GOOG) Q2 2023 Earnings Call Transcript',
 '4605811': "Alphabet Inc. (GOOG) SVB MoffettNathanson's Inaugural Technology, Media, and Telecom Conference (Transcript)",
 '4596558': 'Alphabet Inc. (GOOG) Q1 2023 Earnings Call Transcript',
 '4585742': 'Alphabet Inc. (GOOG) Morgan Stanley Technology, Med

In [13]:
def screen(transcripts, symbol):
    """Filter an id -> title mapping down to quarterly earnings-call titles.

    A title is kept when, compared case-insensitively, it contains ``symbol``,
    contains the word ``transcript``, and mentions a fiscal quarter written as
    ``Q<1-4> 20<yy>`` (for example ``Q3 2024``).

    Args:
        transcripts: Dict mapping transcript id to title.
        symbol: Ticker that must appear in the title.

    Returns:
        A new dict with the entries that passed, in their original order.
    """
    quarter_re = re.compile(r"q[1-4] 20\d{2}")

    def is_earnings_call(title):
        lowered = title.lower()
        if symbol.lower() not in lowered:
            return False
        if 'transcript' not in lowered:
            return False
        return quarter_re.search(lowered) is not None

    return {key: title for key, title in transcripts.items() if is_earnings_call(title)}

In [14]:
# Filter with the configured ticker instead of repeating the literal, so
# changing `symbol` in the configuration cell updates this step as well.
earning_calls = screen(transcripts, symbol)
earning_calls

{'4730692': 'Alphabet Inc. (GOOG) Q3 2024 Earnings Call Transcript',
 '4705978': 'Alphabet Inc. (GOOG) Q2 2024 Earnings Call Transcript',
 '4686483': 'Alphabet Inc. (GOOG) Q1 2024 Earnings Call Transcript',
 '4666190': 'Alphabet Inc. (GOOG) Q4 2023 Earnings Call Transcript',
 '4643107': 'Alphabet Inc. (GOOG) Q3 2023 Earnings Call Transcript',
 '4619735': 'Alphabet Inc. (GOOG) Q2 2023 Earnings Call Transcript',
 '4596558': 'Alphabet Inc. (GOOG) Q1 2023 Earnings Call Transcript',
 '4574870': 'Alphabet Inc. (GOOG) Q4 2022 Earnings Call Transcript',
 '4549115': 'Alphabet Inc. (GOOG) Q3 2022 Earnings Call Transcript',
 '4526113': 'Alphabet Inc. (GOOG) CEO Sundar Pichai on Q2 2022 Results - Earnings Call Transcript',
 '4503851': "Alphabet Inc.'s (GOOG) CEO Sundar Pichai on Q1 2022 Results - Earnings Call Transcript",
 '4483370': "Alphabet Inc.'s (GOOG) CEO Sundar Pichai on Q4 2021 Results - Earnings Call Transcript",
 '4441896': "Alphabet Inc.'s (GOOG) CEO Sundar Pichai on Q2 2021 Results - 

In [15]:
# Ids of the earnings-call transcripts, in the order they were collected.
transcript_ids = list(earning_calls)

In [16]:
def request_transcript_details(id, timeout=30):
    """Fetch the details of a single transcript.

    Args:
        id: Transcript id sent as the ``id`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook forever.

    Returns:
        The decoded JSON body of the response. The caller in this notebook
        reads ``['data']['attributes']`` from it.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status, instead
            of letting an error payload fail later with an unrelated
            ``KeyError``.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    querystring = {"id": id}
    headers = {
        "x-rapidapi-key": rapid_api_key,
        "x-rapidapi-host": "seeking-alpha.p.rapidapi.com",
    }
    response = requests.get(transcript_url, headers=headers, params=querystring, timeout=timeout)
    response.raise_for_status()
    payload = response.json()
    # Pause after every successful request; presumably to stay under the
    # API's rate limit -- TODO confirm the actual quota.
    time.sleep(1)
    return payload

In [17]:
def clean_content(content):
    """Convert transcript HTML into plain text.

    Every ``<br>`` tag is turned into a newline, then the text of each ``<p>``
    element is extracted and the paragraphs are joined with a blank line
    between them. Text that is not inside a ``<p>`` element is not included.

    Args:
        content: HTML markup of the transcript.

    Returns:
        The paragraph texts joined by ``"\\n\\n"``.
    """
    document = BeautifulSoup(content, "lxml")
    # Swap line-break tags for real newlines before the text is extracted.
    for line_break in document.find_all("br"):
        line_break.replace_with("\n")
    paragraphs = [paragraph.get_text() for paragraph in document.find_all("p")]
    return "\n\n".join(paragraphs)

In [18]:
def get_transcript(ids, symbol):
    """Download, clean, and persist the transcripts identified by ``ids``.

    For each id the transcript details are requested, the HTML content is
    converted to plain text, and a record with the publish date
    (``YYYY-MM-DD``), title, and text is built. Each record is written as one
    JSON line to ``<symbol>_calls.jsonl`` (the file is overwritten) and also
    collected in memory.

    Args:
        ids: Iterable of transcript ids.
        symbol: Ticker used as the prefix of the output file name.

    Returns:
        List of the record dicts, in the order of ``ids``.
    """
    records = []
    out_path = symbol + '_calls.jsonl'
    with open(out_path, 'w') as out_file:
        for transcript_id in ids:
            attributes = request_transcript_details(transcript_id)['data']['attributes']
            body = clean_content(attributes['content'])
            headline = attributes['title']
            published = datetime.fromisoformat(attributes['publishOn'])
            record = {
                "time": published.strftime("%Y-%m-%d"),
                "title": headline,
                "content": body,
            }

            records.append(record)
            out_file.write(json.dumps(record) + "\n")
    return records

In [19]:
# Fetch every earnings-call transcript and write them to `<symbol>_calls.jsonl`.
dataset = get_transcript(ids=transcript_ids, symbol=symbol)

In [None]:
# Show a compact overview only: each record's "content" holds a full
# transcript, so echoing `dataset` itself would flood the cell output.
[(record["time"], record["title"]) for record in dataset]