In [None]:
%%capture
%pip install yfinance
%pip install pysentiment2
%pip install transformers
%pip install py-readability-metrics
%python -m nltk.downloader punkt

In [2]:
import yfinance as yf
import pysentiment2 as ps
from bs4 import BeautifulSoup
import requests
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from datetime import datetime as dt
import spacy
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler
import torch
from readability import Readability


In [3]:
# Scrape the S&P 500 constituent tickers from the first column of the Wikipedia table.
spy_list_requests = requests.get(
    'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
    timeout=30,  # never hang the notebook on a stalled connection
)
spy_list_requests.raise_for_status()  # fail loudly instead of parsing an error page
soup = BeautifulSoup(spy_list_requests.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise RuntimeError("Could not find the 'wikitable sortable' constituents table on the page")

# Skip the header row; the ticker symbol is the first <td> of every remaining row.
tickers = [row.find_all('td')[0].text for row in table.find_all('tr')[1:]]

# Cell text ends with a newline, which is removed here.
spy_tickers_list = [s.replace('\n', '') for s in tickers]

print(spy_tickers_list)

['MMM', 'AOS', 'ABT', 'ABBV', 'ABMD', 'ACN', 'ATVI', 'ADM', 'ADBE', 'ADP', 'AAP', 'AES', 'AFL', 'A', 'APD', 'AKAM', 'ALK', 'ALB', 'ARE', 'ALGN', 'ALLE', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN', 'AMCR', 'AMD', 'AEE', 'AAL', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'ABC', 'AME', 'AMGN', 'APH', 'ADI', 'ANSS', 'AON', 'APA', 'AAPL', 'AMAT', 'APTV', 'ANET', 'AJG', 'AIZ', 'T', 'ATO', 'ADSK', 'AZO', 'AVB', 'AVY', 'BKR', 'BALL', 'BAC', 'BBWI', 'BAX', 'BDX', 'WRB', 'BRK.B', 'BBY', 'BIO', 'TECH', 'BIIB', 'BLK', 'BK', 'BA', 'BKNG', 'BWA', 'BXP', 'BSX', 'BMY', 'AVGO', 'BR', 'BRO', 'BF.B', 'CHRW', 'CDNS', 'CZR', 'CPT', 'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CARR', 'CTLT', 'CAT', 'CBOE', 'CBRE', 'CDW', 'CE', 'CNC', 'CNP', 'CDAY', 'CF', 'CRL', 'SCHW', 'CHTR', 'CVX', 'CMG', 'CB', 'CHD', 'CI', 'CINF', 'CTAS', 'CSCO', 'C', 'CFG', 'CTXS', 'CLX', 'CME', 'CMS', 'KO', 'CTSH', 'CL', 'CMCSA', 'CMA', 'CAG', 'COP', 'ED', 'STZ', 'CEG', 'COO', 'CPRT', 'GLW', 'CTVA', 'COST', 'CTRA', 'CCI', 'CSX', 'CMI', 'CVS', 'D

In [4]:
def find_transcript(ticker, date_yr, date_qtr, i):
    """Fetch one earnings-call transcript and the price moves that followed it.

    Args:
        ticker: Ticker symbol used both in the roic.ai URL and for yfinance.
        date_yr: Year passed as the ``y`` query parameter.
        date_qtr: Quarter passed as the ``q`` query parameter.
        i: Index into the page's ``earningscalls`` list from which the call
            date ("day 0") is read.

    Returns:
        An 11-tuple ``(transcript, pc1, ..., pc10)``. ``pc1..pc5`` are the
        open-to-open changes for price windows starting on day 0 and ending
        at day 10/30/50/70/90; ``pc6..pc10`` use the same end dates but start
        on day 1. Every element is ``None`` if any step (download, parsing,
        date lookup, price history) fails.
    """
    try:
        url = 'https://roic.ai/transcripts/' + ticker + '?y=' + str(date_yr) + '&q=' + str(date_qtr)
        html_text = requests.get(url, timeout=30).text
        soup = BeautifulSoup(html_text, "html.parser")
        # The page embeds its data as JSON in the 16th <script> element
        # (index 15); this position is hard-coded and tied to the page layout.
        script = soup.find_all('script')[15].text.strip()
        page_props = json.loads(script)['props']['pageProps']
        transcript_data = page_props['transcriptdata']['content']  # transcript text
        # date1 = 'day 0', the date recorded for this earnings call
        date1 = datetime.strptime(page_props['data']['data']['earningscalls'][i]['date'], '%Y-%m-%d %H:%M:%S')

        ntr = yf.Ticker(ticker)
        percentage_changes = []
        # Same order as the returned tuple: first all windows starting on
        # day 0, then all windows starting on day 1. The end date is always
        # measured from day 0, so the day-1 windows are one day shorter.
        for start_offset in (0, 1):
            window_start = date1 + timedelta(days=start_offset)
            for horizon in (10, 30, 50, 70, 90):
                hist = ntr.history(start=window_start, end=date1 + timedelta(days=horizon))
                percentage_changes.append(find_percentage_change(hist))

        return (transcript_data, *percentage_changes)

    except Exception:
        # Best effort: any failure yields an all-None row that callers drop
        # later. KeyboardInterrupt/SystemExit are deliberately not swallowed.
        return (None,) * 11

def find_percentage_change(hist):
    """Return the fractional change between the first and last 'Open' value.

    Args:
        hist: Price history with an 'Open' column (as returned by
            ``yfinance.Ticker.history``).

    Returns:
        ``last_open / first_open - 1``, or ``None`` when the value cannot be
        computed (e.g. ``hist`` is empty or has no 'Open' column).
    """
    try:
        opens = hist['Open']
        # .iloc makes the lookup explicitly positional; plain [] with an
        # integer on a Series is label-based unless pandas falls back to the
        # deprecated positional behaviour.
        return opens.iloc[-1] / opens.iloc[0] - 1
    except Exception:
        return None

def ticker_transcript(ticker):
    """Build the per-quarter feature table for one ticker.

    Walks backwards from the first entry of the site's ``earningscalls`` list
    one quarter at a time, downloads every transcript via ``find_transcript``
    and appends, column-wise: the cosine-similarity value, the
    Loughran-McDonald scores from ``find_LM`` and the readability scores from
    ``get_arr_word_complexity``.

    Args:
        ticker: Ticker symbol as used by roic.ai.

    Returns:
        2D array with one row per quarter (first row removed), laid out as
        [ticker, yr, qtr, 10 x %change, tf-idf_cos_sim,
         LM{posA negA polA subA posB negB polB subB},
         Complexity{FK, GF, SMOG, Flesch - each for safe harbour and Q&A}].
        Values that could not be computed are ``None``.
    """
    url = 'https://roic.ai/transcripts/' + ticker

    html_text = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html_text, "html.parser")

    # Page data is embedded as JSON in the 16th <script> element (index 15).
    script = soup.find_all('script')[15].text.strip()
    data = json.loads(script)
    earnings_calls = data['props']['pageProps']['data']['data']['earningscalls']
    first_er_date = earnings_calls[0]
    last_er_date = earnings_calls[-1]

    first_er_date_yr = first_er_date['year']
    first_er_date_qtr = first_er_date['quarter']
    last_er_date_yr = last_er_date['year']

    yr_difference = first_er_date_yr - last_er_date_yr
    date_yr = first_er_date_yr
    date_qtr = first_er_date_qtr

    arr_ticker_infos = np.empty((0, 13), str)
    arr_full_transcript = np.array([], str)  # every transcript, all years and quarters
    q = -1  # running index into the earningscalls list
    for _year_step in range(0, yr_difference + 1):
        # The starting year only covers the quarters up to first_er_date_qtr;
        # every other year is walked for all four quarters.
        quarters_in_year = first_er_date_qtr if date_yr == first_er_date_yr else 4
        for _quarter_step in range(0, quarters_in_year):
            q += 1
            full_transcript, *percentage_changes = find_transcript(ticker, date_yr, date_qtr, q)
            arr_full_transcript = np.append(arr_full_transcript, full_transcript)
            arr_ticker_infos = np.append(
                arr_ticker_infos,
                np.array([[ticker, date_yr, date_qtr, *percentage_changes]]),
                axis=0,
            )
            date_qtr -= 1
            if date_qtr == 0:
                date_qtr = 4
        date_yr -= 1

    #1. gets tf idf, and cosine similarity
    arr_tf_idf = np.array([])
    flipped_full_transcript = arr_full_transcript[::-1]  # flip order

    for i in range(0, len(flipped_full_transcript)):
        cleaned_transcript_list = np.array([])
        try:
            # Window of up to four transcripts that precede position i in the
            # flipped array. NOTE(review): for i < 4 the negative slice start
            # usually yields an empty window, so tf_idf fails and None is
            # stored - confirm this is the intended warm-up behaviour.
            rolling_transcript = flipped_full_transcript[i - 4:i]
            for j in rolling_transcript:
                single_transcript = clean_text(j.split("\n"))

                cleaned_transcript_list = np.append(cleaned_transcript_list, np.array([single_transcript]))
            tfidf_vec = tf_idf(''.join(cleaned_transcript_list))
            arr_tf_idf = np.append(arr_tf_idf, np.array([tfidf_vec]))

        except Exception:
            # Missing transcript (None) or a window that cannot be vectorised.
            tfidf_vec = None
            arr_tf_idf = np.append(arr_tf_idf, np.array([tfidf_vec]))

    arr_tf_idf = arr_tf_idf[::-1]  # flip back to the row order of arr_ticker_infos

    arr_cosine_similarities = find_cosine_similarities(arr_tf_idf)
    arr_cosine_similarities = np.reshape(arr_cosine_similarities, (arr_cosine_similarities.shape[0], 1))
    arr_ticker_infos = np.concatenate((arr_ticker_infos, arr_cosine_similarities), axis=1)

    #2. gets Loughran and Mcdonalds Sentiment Score [positiveA, negativeA, polarityA, subjectivityA, positiveB, negativeB, polarityB, subjectivityB]
    arr_full_LM = find_LM(arr_full_transcript)  # returns a (#, 8) 2D array
    arr_ticker_infos = np.concatenate((arr_ticker_infos, arr_full_LM), axis=1)

    #3. gets word complexity information (including Flesch Kincaid, Flesch, Gunning Fog, and Smog index)
    arr_word_complexity = get_arr_word_complexity(arr_full_transcript)
    arr_ticker_infos = np.concatenate((arr_ticker_infos, arr_word_complexity), axis=1)

    # Include Finbert?

    # Deletes the most recent data (first index) since 90 days has not passed yet.
    arr_ticker_infos = arr_ticker_infos[1:, :]
    return arr_ticker_infos

    
def get_arr_word_complexity(arr_full_transcript):
    """Compute readability scores for every transcript.

    Each transcript is split with ``split_transcript`` and both parts are
    scored with Flesch-Kincaid, Gunning Fog, SMOG and Flesch.

    Args:
        arr_full_transcript: 1D numpy array of transcript strings (entries
            may be ``None`` for transcripts that could not be downloaded).

    Returns:
        Array of shape ``(len(arr_full_transcript), 8)`` with columns
        [FK safe harbour, FK Q&A, GF safe harbour, GF Q&A,
         SMOG safe harbour, SMOG Q&A, Flesch safe harbour, Flesch Q&A].
        A row is all ``None`` when the transcript could not be processed.
    """
    arr_word_complexity = np.array([])
    for transcript in arr_full_transcript:
        try:
            cleaned_safe_harbour, cleaned_questions = split_transcript(transcript)
            scores = [
                find_flesch_kincaid(cleaned_safe_harbour),
                find_flesch_kincaid(cleaned_questions),
                find_gunning_fog(cleaned_safe_harbour),
                find_gunning_fog(cleaned_questions),
                find_smog(cleaned_safe_harbour),
                find_smog(cleaned_questions),
                find_flesch(cleaned_safe_harbour),
                find_flesch(cleaned_questions),
            ]
        except Exception:
            # e.g. the transcript is None and cannot be split
            scores = [None] * 8
        arr_word_complexity = np.append(arr_word_complexity, np.array(scores))
    # one row per transcript, eight scores per row
    arr_word_complexity = arr_word_complexity.reshape(arr_full_transcript.shape[0], 8)
    return arr_word_complexity


def find_flesch(text):
    """Return the Flesch reading-ease score of ``text``.

    Returns ``None`` when the score cannot be computed (for instance when
    the readability library rejects the text).
    """
    try:
        return Readability(text).flesch().score
    except Exception:  # keep best-effort behaviour, but let Ctrl-C through
        return None

def find_smog(text):
    """Return the SMOG index of ``text``.

    Returns ``None`` when the score cannot be computed (for instance when
    the readability library rejects the text).
    """
    try:
        return Readability(text).smog().score
    except Exception:  # keep best-effort behaviour, but let Ctrl-C through
        return None

def find_gunning_fog(text):
    """Return the Gunning Fog index of ``text``.

    Returns ``None`` when the score cannot be computed (for instance when
    the readability library rejects the text).
    """
    try:
        return Readability(text).gunning_fog().score
    except Exception:  # keep best-effort behaviour, but let Ctrl-C through
        return None

def find_flesch_kincaid(text):
    """Return the Flesch-Kincaid grade level of ``text``.

    Returns ``None`` when the score cannot be computed (for instance when
    the readability library rejects the text).
    """
    try:
        return Readability(text).flesch_kincaid().score
    except Exception:  # keep best-effort behaviour, but let Ctrl-C through
        return None

def split_transcript(mystr):
    """Split a transcript into the part before the Q&A and the Q&A itself.

    The text is lower-cased and scanned line by line for a cue that the
    question session starts. The lines up to and including the cue form the
    first part; the lines after it, minus the final line, form the second.
    When several lines qualify, the last qualifying line wins. Both parts
    are passed through ``clean_text``; if no cue is found both are cleaned
    from an empty string.

    Returns:
        ``(cleaned_safe_harbour, cleaned_questions)`` as strings.
    """
    lines = mystr.lower().split("\n")
    opening_part, question_part = "", ""
    cue_seen = False
    for idx, line in enumerate(lines):
        mentions_first_question = "first question" in line
        # Operator explicitly announcing the first question (any position).
        operator_announcement = mentions_first_question and "operator" in line
        # Looser cue, honoured only until some cue has already matched.
        early_cue = idx > 1 and not cue_seen and (mentions_first_question or "go ahead" in line)
        # "[operator instructions]" marker together with the word "first".
        instructions_cue = idx > 2 and "first" in line and "operator instructions" in line
        if operator_announcement or early_cue or instructions_cue:
            cue_seen = True
            opening_part = lines[:idx + 1]
            question_part = lines[idx + 1:-1]

    return clean_text(opening_part), clean_text(question_part)

def find_LM(arr_full_transcript): #arr_full_transcript is 1D
    """Loughran-McDonald sentiment scores for every transcript.

    Each transcript is split with ``split_transcript`` and both parts are
    scored with ``find_LM_score``.

    Args:
        arr_full_transcript: 1D iterable of transcript strings (entries may
            be ``None``).

    Returns:
        2D array with eight columns per transcript: the scores of the first
        part followed by the scores of the Q&A part. A transcript that
        cannot be processed contributes eight ``None`` values.
    """
    arr_full_LM = np.array([])
    for transcript in arr_full_transcript:
        try:
            cleaned_safe_harbour, cleaned_questions = split_transcript(transcript)
            LM_sentiment_safe_harbour = np.array(find_LM_score(cleaned_safe_harbour))
            LM_sentiment_questions = np.array(find_LM_score(cleaned_questions))
        except Exception:
            # e.g. the transcript is None; keep the row so shapes stay aligned
            arr_full_LM = np.append(arr_full_LM, np.array([None] * 8))
        else:
            arr_full_LM = np.append(arr_full_LM, LM_sentiment_safe_harbour)
            arr_full_LM = np.append(arr_full_LM, LM_sentiment_questions)

    # eight values were appended per transcript -> one row each
    arr_full_LM = np.reshape(arr_full_LM, (arr_full_LM.shape[0] // 8, 8))
    return arr_full_LM

def find_LM_score(text): #text is a string
    """Score ``text`` with the pysentiment2 LM dictionary.

    Returns the values of the score dictionary as a flat list, in the order
    the library provides them.
    """
    analyser = ps.LM()
    sentiment = analyser.get_score(analyser.tokenize(text))  # dict of scores
    return list(sentiment.values())

def tf_idf(transcript):
    """Vectorise a transcript with TF-IDF.

    Stop words are stripped with ``remove_stop_words`` first; each line of
    the remaining text is treated as one document.

    Returns:
        The matrix produced by ``TfidfVectorizer.fit_transform``.
    """
    without_stop_words = remove_stop_words(transcript)
    documents = without_stop_words.splitlines()
    vectoriser = TfidfVectorizer(
        stop_words='english',
        # uni- to trigrams: multi-word terms are common in finance
        # (e.g. earnings per share, free cash flow)
        ngram_range=(1, 3),
        max_features=100,
        lowercase=True,
    )
    return vectoriser.fit_transform(documents)

def find_cosine_similarities(arr_tf_idf):
    """Compare every TF-IDF entry with the entry that follows it.

    Args:
        arr_tf_idf: 1D numpy array of TF-IDF matrices (entries may be
            ``None``).

    Returns:
        1D array holding, for each position ``i``, the result of
        ``linear_kernel(arr_tf_idf[i], arr_tf_idf[i + 1])``, or ``None``
        when that cannot be computed. The last position has no successor
        and therefore always yields ``None``.
    """
    arr_cosine_similarities = np.array([])
    for i in range(arr_tf_idf.shape[0]):
        try:
            cosine_similarities = linear_kernel(arr_tf_idf[i], arr_tf_idf[i + 1])
        except Exception:
            # missing vector, shape mismatch, or no successor for the last row
            cosine_similarities = None
        arr_cosine_similarities = np.append(arr_cosine_similarities, cosine_similarities)
    return arr_cosine_similarities
 
def clean_text(transcript): #transcript is in format ["a", "b", "c"]
    """Normalise transcript lines into a single lower-case string.

    Steps, in order:
      1. join the lines and lower-case them;
      2. insert a space after '.', '?' or '!' when a letter follows directly
         ('?' and '!' are written out as '.');
      3. replace q1..q4 with 'q' and years 2000-2029 with '2000';
      4. drop every line that contains 'operator:';
      5. collect speaker names - for each line, any of the first five words
         containing ':' (last character removed) plus the words before it -
         and remove them wherever they appear after whitespace and are
         followed by whitespace, '.', '?', ',' or an apostrophe;
      6. replace ':' with a space and collapse whitespace runs.

    Args:
        transcript: List of transcript lines (an empty string is accepted
            and yields an empty result).

    Returns:
        The cleaned text as one string.
    """
    text = '\n'.join(transcript).lower()

    # turns 'end sentence.start' to 'end sentence. start' with space in between
    for mark in (r'\.', r'\?', r'\!'):
        text = re.sub(mark + r'([a-zA-Z])', r'. \1', text)
    # replace q1,2,3,4 with q
    text = re.sub("q[1-4]", "q", text)
    # replace 20xx with 2000
    text = re.sub("20[0-2][0-9]", "2000", text)

    # Drop all operator lines. Filtering builds a new list, so no line is
    # skipped the way deleting from a list while indexing into it would.
    lines = [line for line in text.split('\n') if 'operator:' not in line]

    # Collect speaker names from the start of each line, keeping first-seen order.
    arr_speaker_name = []
    for line in lines:
        leading_words = line.split()[0:5]
        for position, word in enumerate(leading_words):
            if ':' not in word:
                continue
            # words before the colon-word, then the colon-word minus its last char
            for candidate in leading_words[:position] + [word[:-1]]:
                if candidate != '' and candidate not in arr_speaker_name:
                    arr_speaker_name.append(candidate)

    text = '\n'.join(lines).replace(':', ' ')

    for name in arr_speaker_name:
        text = re.sub(r'\s+', ' ', text)  # replace multiple space to single space
        # Escape the name so characters such as '[', '(' or '.' are matched
        # literally instead of being interpreted as regex syntax.
        escaped = re.escape(name)
        # The leading \s makes sure embedded words aren't deleted,
        # such as 'tim' in 'estimate'.
        for following in (r'\s', r'\.', r'\?', r'\,', r'\''):
            text = re.sub(r'\s' + escaped + following, ' ', text)

    return re.sub(r'\s+', ' ', text)  # replace multiple space to single space

def remove_stop_words(text):
    """Lemmatise ``text`` and drop stop words and punctuation.

    The spaCy pipeline 'en_core_web_sm' is loaded on the first call and then
    reused; loading it on every call is by far the most expensive part of
    this function. The words "operator", "analyst", "quarter" and "year"
    are added to the stop-word set.

    Returns:
        The remaining lemmas, each followed by a single space.
    """
    nlp = getattr(remove_stop_words, "_nlp", None)
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
        # add new stop words
        for extra in ("operator", "analyst", "quarter", "year"):
            nlp.Defaults.stop_words.add(extra)
        remove_stop_words._nlp = nlp  # cache the pipeline on the function object

    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_
        if lemma == "-PRON-":  # pronoun placeholder lemma is mapped to "it"
            lemma = "it"
        kept_lemmas.append(lemma)
    return "".join(lemma + " " for lemma in kept_lemmas)

In [None]:
# ticker = 'UAL'
# arr_output = ticker_transcript(ticker)

#BRK.B GOOGL? SCHW CEG

#AEP ADI

In [8]:
# Index of the first ticker in this batch; everything from here to the end of
# the list is processed. Change it to resume the scrape from another position.
BATCH_START = 412

print(len(spy_tickers_list))
batch = spy_tickers_list[BATCH_START:]
print(batch)

503
['STX', 'SEE', 'SRE', 'NOW', 'SHW', 'SBNY', 'SPG', 'SWKS', 'SJM', 'SNA', 'SEDG', 'SO', 'LUV', 'SWK', 'SBUX', 'STT', 'STE', 'SYK', 'SIVB', 'SYF', 'SNPS', 'SYY', 'TMUS', 'TROW', 'TTWO', 'TPR', 'TGT', 'TEL', 'TDY', 'TFX', 'TER', 'TSLA', 'TXN', 'TXT', 'TMO', 'TJX', 'TSCO', 'TT', 'TDG', 'TRV', 'TRMB', 'TFC', 'TWTR', 'TYL', 'TSN', 'USB', 'UDR', 'ULTA', 'UNP', 'UAL', 'UPS', 'URI', 'UNH', 'UHS', 'VLO', 'VTR', 'VRSN', 'VRSK', 'VZ', 'VRTX', 'VFC', 'VTRS', 'VICI', 'V', 'VNO', 'VMC', 'WAB', 'WBA', 'WMT', 'WBD', 'WM', 'WAT', 'WEC', 'WFC', 'WELL', 'WST', 'WDC', 'WRK', 'WY', 'WHR', 'WMB', 'WTW', 'GWW', 'WYNN', 'XEL', 'XYL', 'YUM', 'ZBRA', 'ZBH', 'ZION', 'ZTS']


In [9]:
# Scrape every ticker of the batch and append its complete rows to batch2.csv.
for ticker in batch:
    print(ticker)
    try:
        ticker_output = ticker_transcript(ticker)
    except Exception as exc:
        # One broken page (missing script tag, bad JSON, network error, ...)
        # must not abort the whole batch: report it and move on.
        print(f"skipped {ticker}: {exc!r}")
        continue
    df = pd.DataFrame(ticker_output)
    arr_na_cleaned = df.dropna().to_numpy()  # keep only rows without missing values
    # delete last row of each ticker where TFIDF = 1
    arr_na_cleaned = arr_na_cleaned[:-1, :]
    with open('batch2.csv', 'a+') as csvfile:
        np.savetxt(csvfile, arr_na_cleaned, delimiter=',', fmt='%s')

    print("end")

STX
end
SEE
end
SRE
end
NOW
end
SHW
end
SBNY
end
SPG
end
SWKS
end
SJM
end
SNA
end
SEDG
end
SO
end
LUV
end
SWK
end
SBUX
end
STT
end
STE
end
SYK
end
SIVB
end
SYF


In [None]:
# NOTE(review): arr_output is only assigned in the commented-out
# `arr_output = ticker_transcript(ticker)` cell above - run that first.
temp = arr_output
df = pd.DataFrame(temp)
arr_na_cleaned = df.dropna()
arr_na_cleaned = arr_na_cleaned.to_numpy()
# Column layout produced by ticker_transcript:
#   0-2   ticker, yr, qtr
#   3-12  ten forward %changes (day 0 -> 10/30/50/70/90, then day 1 -> 10/30/50/70/90)
#   13    tf-idf_cos_sim
#   14-21 LM{posA negA polA subA posB negB polB subB}
#   22-29 Complexity{FK, GF, SMOG, Flesch - each for safe harbour and Q&A}

# Features start at column 13. Starting at column 4 would feed the other nine
# forward returns into the model, i.e. leak the future into the features.
X_data = arr_na_cleaned[:, 13:].astype(float)
# Target: the day 0 -> day 10 change.
Y_data = arr_na_cleaned[:, 3].astype(float)

In [None]:
from numpy import loadtxt

# Read the saved batch; the context manager closes the file handle again.
with open('batch1.csv', 'rb') as file:
    data = np.loadtxt(file, delimiter=",", dtype='str')

print(data)

# NOTE(review): this rewrites batch2.csv from scratch, replacing any rows the
# scraping loop appended to the same file - confirm that is intended.
np.savetxt('batch2.csv', data, delimiter=',', fmt='%s')


                                     

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# X_data / Y_data come from the extraction cell above; the rows are read back
# as strings/objects, so cast to float.
X = np.asarray(X_data, dtype=float)
Y = np.asarray(Y_data, dtype=float)

# splits the dataset into 70% training and 30% test (seeded so re-runs match)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

# scales the X datasets within a range of 0 to 1; the scaler is fitted on the
# training rows only so no information from the test rows leaks into it
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least-squares fit on the training split; fit() returns the
# estimator itself, which is also shown as the cell output.
clf = LinearRegression().fit(X_train, Y_train)
clf

In [None]:
# Predict the target for the held-out test rows with the fitted linear model.
Y_pred = clf.predict(X_test)

In [None]:
# R^2 of the predictions against the held-out targets (shown as the cell output).
r2_score(Y_test, Y_pred)

In [None]:
# Displays the raw feature table. NOTE(review): arr_output is only assigned in
# the commented-out `arr_output = ticker_transcript(ticker)` cell, so this
# raises NameError on a fresh kernel unless that cell is run first.
arr_output