In [2]:
import numpy as np
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from tqdm import tqdm
import datetime

import logging
logging.getLogger('yfinance').setLevel(logging.CRITICAL)

import warnings
warnings.simplefilter(action="ignore", category=[SettingWithCopyWarning, DeprecationWarning])

import spacy

import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()

import gensim

import tomotopy as tp

import yfinance as yf

#######################
from utils.preprocesing_token import *
from utils.hdp_training import *
from utils.evaluation import *

# Step 1. Data Loading

In [None]:
# Company reference table holding ticker, gvkey and GICS classification.
comp_info = pd.read_csv('data/tick_gvkey_gics.csv')

# Columns kept from the reference table:
#   conm    - company name
#   gvkey   - unique identifier for the company (Gvkey from S&P Capital IQ Compustat)
#   tic     - ticker symbol
#   gsector - sector code
#   ggroup  - industry group code
#   gind    - industry code
# One row per gvkey (first occurrence wins), with every column cast to string.
compdesc_info = (
    comp_info.loc[:, ['conm', 'gvkey', 'tic', 'gsector', 'gind', 'ggroup']]
    .drop_duplicates(subset='gvkey')
    .astype(str)
)

# Left-pad gvkey with zeros so that every key is 6 characters wide.
compdesc_info['gvkey'] = compdesc_info['gvkey'].map(lambda gvkey: gvkey.zfill(6))

In [None]:
# read transcript data
pru_data = pd.read_parquet(
    'data/sp500_cc_transcripts_2014_2023.parquet', engine='pyarrow'
    )

# Keep the QnA section only: 'transcriptComponentTypeId' 3 and 4.
# Selecting the two ids explicitly (instead of excluding id 2) makes sure that
# no other component type present in the data leaks into the QnA text.
# NOTE(review): assumes the id column is numeric -- confirm against the parquet schema.
qna_transcript = pru_data[pru_data['transcriptComponentTypeId'].isin([3, 4])]
# Combine all the individual QnA components into one single QnA text for each
# company (gvkey) and each quarterly call (doc_date), joined with single spaces.
qna_transcript = (
    qna_transcript
    .groupby(['gvkey', 'doc_date'])['componentText']
    .apply(' '.join)
    .reset_index()
)

In [None]:
# Attach the company name, ticker and GICS codes to every transcript via gvkey.
# The left join keeps each transcript row even when compdesc_info has no match.
qna_transcript = pd.merge(qna_transcript, compdesc_info, how='left', on='gvkey')

# Step 2. Data Preprocessing

There are two possible ways of tokenizing the input data for this project:
1. Keeping tokens of every part of speech (PoS) except a designated exclusion list
    * e.g. ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM']
2. Using only the noun PoS

This notebook uses the second approach, but the first one is also implemented in the code.

In [None]:
## For the first attempt use the following code to tokenize the text

nlp = spacy.load("en_core_web_sm")
# set noun=True for the second method of tokenization (nouns only)
# set noun=False for the first method of tokenization
tokens = tokenize_text(qna_transcript, nlp, noun=True)
qna_transcript = add_tokenized_text(qna_transcript, tokens)

## After the first attempt, save the processed data for future use.
## The noun tokens are written to 'qna_noun_tokens_for_topic.pkl', which is the
## file the "load the saved data" cell reads back, next to the noun-token parquet.

with open('data/qna_noun_tokens_for_topic.pkl', 'wb') as f:
    pickle.dump(tokens, f)

qna_transcript.to_parquet('data/qna_transcript_noun_token.parquet', engine='pyarrow')

In [None]:
## Uncomment the following code to use the first method of tokenization
## (tokens produced with noun=False)

# # save tokens to pickle file
# with open('data/qna_tokens_for_topic.pkl', 'wb') as f:
#     pickle.dump(tokens, f)
# # load tokens from pickle file
# with open('data/qna_tokens_for_topic.pkl', 'rb') as f:
#     tokens = pickle.load(f)
# # save the dataframe to parquet file
# qna_transcript.to_parquet('data/qna_transcript_token.parquet', engine='pyarrow')
# # code to load pre-tokenized data
# qna_transcript = pd.read_parquet(
#     'data/qna_transcript_token.parquet', engine='pyarrow'
#     )

In [None]:
## For future attempts, simply load the saved data

# pre-tokenized noun data, read from the parquet file
qna_transcript = pd.read_parquet('data/qna_transcript_noun_token.parquet', engine='pyarrow')
# tokens, read from the pickle file
with open('data/qna_noun_tokens_for_topic.pkl', 'rb') as token_file:
    tokens = pickle.load(token_file)

In [None]:
# Distinct values of 'doc_quarter', in order of first appearance, as a plain list.
quarter_lst = qna_transcript.loc[:, 'doc_quarter'].unique().tolist()

# Step 3: Hierarchical Dirichlet Process (HDP) for Topic Modeling

In [None]:
## For the first attempt, use the following code to train the HDP models and allocate topics to the transcripts

# train the HDP models from the tokenized transcripts and the list of quarters
# (presumably one model per quarter, matching the per-quarter model files
# loaded further below -- TODO confirm in utils.hdp_training)
hdp_model_lst = train_hdp_model(qna_transcript, quarter_lst)
# allocate topics to the transcripts using the trained models
earnings_call_qt_list = get_earnings_call_w_topics(hdp_model_lst, qna_transcript)

In [None]:
## After the first attempt, uncomment the following code to save the trained models and allocated topics

# # save the list of models to file
# for i, item in enumerate(hdp_model_lst):
#     item.save(f'hdp_models/hdp_model_{quarter_lst[i]}.bin')
# # save earnings_call_qt_list to file
# with open ('data/earnings_call_qt_list.pkl', 'wb') as f:
#     pickle.dump(earnings_call_qt_list, f)

In [None]:
## For future attempts, simply load the saved models and allocated topics

# one saved HDP model per quarter, read back in quarter_lst order
hdp_model_lst = [
    tp.HDPModel.load(f'hdp_models/hdp_model_{quarter}.bin')
    for quarter in quarter_lst
]
# earnings_call_qt_list as pickled by an earlier run
with open('data/earnings_call_qt_list.pkl', 'rb') as f:
    earnings_call_qt_list = pickle.load(f)

# Step 4: Evaluation of the Results
## Step 4-1: Trending Words Change by Year

In [None]:
# Top 10 words of every topic, collected once per HDP model (same order as hdp_model_lst).
top_words = [get_hdp_topics(hdp, top_n=10) for hdp in hdp_model_lst]

In [None]:
# Flatten every model's topics into a single list of its top words:
# scores are dropped, and a word listed by several topics is kept once per topic.
total_top_wrds_by_qt = [
    [wrd for word_scr_lst in qt_topics.values() for wrd, _ in word_scr_lst]
    for qt_topics in top_words
]

In [None]:
# Vocabulary: every word that is a top word of at least one topic in at least
# one period. Sorted so that the row order of the tables (and of the CSV files
# written from them) is reproducible; iterating a set of strings can differ
# from one interpreter run to the next.
total_top_words = sorted(
    {
        words[0]
        for period_topics in top_words
        for value in period_topics.values()
        for words in value
    }
)
print(len(total_top_words))

# One column per quarter: how many times each vocabulary word occurs in that
# quarter's flattened top-word list (0 when it does not occur at all).
# value_counts tallies each list once instead of rescanning it for every word.
word_count_df = pd.DataFrame(
    {
        quarter_lst[i]: pd.Series(qt_words, dtype=object)
        .value_counts()
        .reindex(total_top_words, fill_value=0)
        for i, qt_words in enumerate(total_top_wrds_by_qt)
    },
    index=total_top_words,
)


In [None]:
# Keep only the words whose count is non-zero in more than 10 quarterly columns.
# NOTE(review): the threshold is a fixed 10, not half of the number of columns.
nonzero_quarters = word_count_df.ne(0).sum(axis=1)
trimmed_word_count_df = word_count_df.loc[nonzero_quarters > 10]
trimmed_word_count_df.to_csv('data/quarterly_trimmed_word_count.csv')

In [None]:
# Work on a copy so the quarterly table keeps its original column labels.
yrly_word_count_df = word_count_df.copy()
# Relabel each quarterly column with the yearly period it falls into ...
yrly_word_count_df.columns = yrly_word_count_df.columns.to_timestamp().to_period('Y')
# ... then add up the columns that share a year (grouping is done on the transposed frame).
yrly_word_count_df = yrly_word_count_df.T.groupby(level=0).sum().T

In [None]:
# Keep only the words whose count is non-zero in more than 3 yearly columns.
nonzero_years = yrly_word_count_df.ne(0).sum(axis=1)
trim_yrly_word_count_df = yrly_word_count_df.loc[nonzero_years > 3]
trim_yrly_word_count_df.to_csv('data/yearly_trimmed_word_count.csv')

## Step 4-2: Performance Evaluation by Grouping