In [100]:
import numpy as np
import pandas as pd

import nltk

from data_proc.proc_json import convert_time, load_individual_transcript
from data_proc import proc_json

from nltk.stem import WordNetLemmatizer

from sklearn.ensemble import RandomForestRegressor

from os import listdir

import glob

In [113]:
def get_company_name(file_name: str) -> str:
    company = file_name.replace(".json", "")
    company = company.replace("./test/", "")
    return company

In [114]:
company_lst = [get_company_name(f) for f in glob.glob("./test/*.json")]
print(company_lst)

['ABBV']


# Essential Datasets

In [73]:
COMPANY_PATH = "./hackathon_data/companies.csv"
JSON_PATH = "./hackathon_data/company_transcripts/"
LMD_PATH = "./sentiment_data/LoughranMcDonald_SentimentWordLists_2018.xlsx"


LMD_Dataset = pd.read_excel(
    LMD_PATH,
    sheet_name=[
        "Negative", "Positive",
        "Uncertainty", "Litigious",
        "StrongModal", "Constraining"],
    header=None
)

LMD_hash = dict()
for k, v in LMD_Dataset.items():
    LMD_hash[k] = list(map(lambda x: x[0], v.values))


TYPES = [
    "Negative", "Positive",
    "Uncertainty", "Litigious",
    "StrongModal", "Constraining"
]


# 26 words.
financial_dataset = pd.read_excel(
    # "/Users/tianyudu/Documents/TD-Rotman-FinHub-TDMDAL-Hackathon/sentiment_data/Finance_Dic.xlsx"
    "./sentiment_data/Finance_Dic.xlsx"
)

POS_LST = list(financial_dataset["positive"].values)
NEG_LST = list(financial_dataset["negative"].values)

features = ['d_Negative', 'd_Positive', 'd_Uncertainty',
       'd_Litigious', 'd_StrongModal', 'd_Constraining', 'd_Pos_26',
       'd_Neg_26', 'qa_Negative', 'qa_Positive', 'qa_Uncertainty',
       'qa_Litigious', 'qa_StrongModal', 'qa_Constraining', 'qa_Pos_26',
       'qa_Neg_26']

# Train Models

In [84]:
df_train = pd.read_csv("./sentiment_data/QA_LMD_data_all_returns.csv")
df_train.dropna(inplace=True)

In [85]:
X_train = df_train[features].values
y_train = df_train["nearest_day_return"].values.squeeze()

In [86]:
config = {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}
model = RandomForestRegressor(**config)

In [87]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=4,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=400, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

# Load Dataset

In [57]:
def qa_check(sentence: str) -> bool:
    sentence = sentence.lower()
    if len(sentence) > 100:
        return False
    if "questions" in sentence and "answers" in sentence:
        return True
    if "question" in sentence and "answer" in sentence:
        return True
    return False

In [58]:
ds_dis = dict()
ds_qa = dict()

places, length = list(), list()
failed = 0
for company in company_lst:
    all_transcript = proc_json.load_individual_transcript(
        company=company, path="./test/")
    for transcript_id in all_transcript["title"].keys():
        unique_id = company + "_" + transcript_id

        title = all_transcript["title"][transcript_id]
        date = all_transcript["date"][transcript_id]
        body = all_transcript["body"][transcript_id]
        # Total length of body.
        length.append(len(body))

        qa_begins = None
        for p, sentence in enumerate(body):
            if qa_check(sentence):
                qa_begins = p
                break
        if qa_begins is None:
            # If we cannot find such QA identifier.
            # Assume 0.3 Speech + 0.7 QA (aggregate prior).
            qa_begins = int(len(body) * 0.3)
            failed += 1
        places.append(qa_begins)
        # Split the dataset
        ds_dis.update({str(unique_id): " ".join(body[:qa_begins])})
        ds_qa.update({str(unique_id): " ".join(body[qa_begins:])})

# Parse Dataset

In [59]:
def get_score(
    body: str,
    prefix: str
) -> np.ndarray:
    """
    Compute sentiment for each body paragraph.
    Returns a dictionary of six values,
    counts the number of occurences of each type of words.
    """
    counts = dict((prefix + k, 0) for k in LMD_hash.keys())
    counts.update({
        prefix + "Pos_26": 0,
        prefix + "Neg_26": 0
    })

    # Tokenize.
    tokens = nltk.word_tokenize(body)
    lemmatizer = WordNetLemmatizer()
    for word_type in TYPES:
        for w in tokens:
            w = w.lower()
            c = lemmatizer.lemmatize(w)
            if c.upper() in LMD_hash[word_type]:
                counts[prefix + word_type] += 1
            if c.lower() in POS_LST:
                counts[prefix + "Pos_26"] += 1
            if c.lower() in NEG_LST:
                counts[prefix + "Neg_26"] += 1
    return counts

In [95]:
D_SPLITTED_BDOY = ds_dis
Q_SPLITTED_BDOY = ds_qa

print(f"Number of companies: {len(company_lst)}")
df_collection = {
    "ID": [],
    "Code": [],
    "Time": [],
    "d_Negative": [],
    "d_Positive": [],
    "d_Uncertainty": [],
    "d_Litigious": [],
    "d_StrongModal": [],
    "d_Constraining": [],
    "d_Pos_26": [],
    "d_Neg_26": [],
    "qa_Negative": [],
    "qa_Positive": [],
    "qa_Uncertainty": [],
    "qa_Litigious": [],
    "qa_StrongModal": [],
    "qa_Constraining": [],
    "qa_Pos_26": [],
    "qa_Neg_26": []
}

for num, company in enumerate(company_lst):
    print(f"Current Company: {company}")
    data = load_individual_transcript(company, path="./test/")
    trainscript_ids = list(data["title"].keys())
    for i in trainscript_ids:
        transcript_code = str(company) + "_" + str(i)

        t = data["date"][i]
        date = convert_time(t)

        # Compute scores for each part.
        discussion_part = D_SPLITTED_BDOY[transcript_code]
        qa_part = Q_SPLITTED_BDOY[transcript_code]

        discussion_counts = get_score(discussion_part, prefix="d_")
        qa_counts = get_score(qa_part, prefix="qa_")

        info = {
            "ID": i,
            "Code": transcript_code,
            "Time": date
        }

        info.update(discussion_counts)
        info.update(qa_counts)

        for k, v in info.items():
            df_collection[k].append(v)
df = pd.DataFrame.from_dict(df_collection)

Number of companies: 1
Current Company: ABBV


In [96]:
df.head()

Unnamed: 0,ID,Code,Time,d_Negative,d_Positive,d_Uncertainty,d_Litigious,d_StrongModal,d_Constraining,d_Pos_26,d_Neg_26,qa_Negative,qa_Positive,qa_Uncertainty,qa_Litigious,qa_StrongModal,qa_Constraining,qa_Pos_26,qa_Neg_26
0,500,ABBV_500,2021-10-31 12:30:00.000000,20,55,18,15,8,6,414,36,85,110,62,0,31,8,468,138
1,509,ABBV_509,2021-10-31 12:30:00.000000,32,92,40,18,18,7,726,24,69,58,76,9,45,12,378,330
2,679,ABBV_679,2021-10-31 12:30:00.000000,22,121,41,14,18,8,780,36,85,78,58,11,37,5,204,270


In [97]:
X_test = df[features].values
pred = model.predict(X_test)

In [118]:
result = pd.DataFrame({
    "id": df.ID,
    "prediction": pred
})

In [120]:
result.head()

Unnamed: 0,id,prediction
0,500,6.9e-05
1,509,-0.002006
2,679,0.001975


In [121]:
result.to_csv("./test_result.csv")