# Model Evaluator/Model Trainer

This notebook trains our model and evaluates how well it forecasts volatility from summarized transcript inputs.

## Dependencies

### Virtual Environment Dependencies

The model evaluator relies on the Hugging Face libraries and on torchmetrics. The following packages must be installed:

1. transformers
2. datasets
3. evaluate
4. torchmetrics

In [None]:
# Install the packages listed above into the notebook environment.
!pip install transformers datasets evaluate torchmetrics

### Imports

In [None]:
import torch
from datasets import load_dataset, Dataset, DatasetDict
from google.colab import drive, runtime
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
import torchmetrics
from tqdm import tqdm
import os
import datetime
import plotly.express as px
import pandas
import numpy as np
from collections import Counter
import sqlite3 as sq
from dataclasses import dataclass
from typing import Dict, List, Tuple, Callable
import evaluate


In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Mount Google Drive at /content/drive; the project paths used by this
# notebook all live underneath that mount point.
drive.mount('/content/drive')

## Paths

This notebook requires a connection to the database containing all of the transcript data and the labels. The database can be generated by running the FinancialModelPrep [notebook](https://colab.research.google.com/drive/1IPQLXCfiAfVXH-W30vYevktjQZnV6qt6?usp=sharing) to create the database and pull the financial data, followed by the Summary [notebook](https://colab.research.google.com/drive/1Jv0knYxpoCyexYn6yA6TSIxOsX9nJY5L?usp=sharing) to summarize the transcripts.

In [None]:
# Root of the project on the mounted Google Drive.
project_path = "/content/drive/MyDrive/CSCI-5541/Project/"

# Database directory and tokenizer cache sit directly under the project root.
data_path, tokenizer_base_path = (
    os.path.join(project_path, leaf) for leaf in ("data", "tokenizer")
)

# Model artifacts: downloaded base classifiers and their fine-tuned versions.
model_path = os.path.join(project_path, "model")
classification_model_path, fine_tuned_model_base_path = (
    os.path.join(model_path, leaf)
    for leaf in ("classification", "fine_tuned_classification")
)

In [None]:
# SQLite database file holding the transcripts, summaries and labels.
database_path = os.path.join(data_path, "earnings_transcripts_data.db")

In [None]:
# Make sure every output directory exists before anything is saved to it.
# exist_ok=True makes re-running the cell a no-op and avoids the
# check-then-create race of a separate os.path.exists() test.
for output_dir in (
    classification_model_path,
    tokenizer_base_path,
    fine_tuned_model_base_path,
):
    os.makedirs(output_dir, exist_ok=True)

## Data Classes and DB Util

### Data Classes

Objects to hold data from tables

In [None]:
@dataclass
class Company:
    """A company, identified by its ticker symbol."""

    symbol: str

    def to_dict(self):
        """Return the record as a plain ``{"symbol": ...}`` dictionary."""
        return dict(symbol=self.symbol)

@dataclass
class Transcript:
    """A full earnings-call transcript for one company and reporting period."""

    symbol: str
    date: str
    year: int
    quarter: int
    transcript: str

    def to_dict(self):
        """Return the fields as a dictionary, in declaration order."""
        keys = ("symbol", "date", "year", "quarter", "transcript")
        return {key: getattr(self, key) for key in keys}


@dataclass
class Price:
    """Opening and closing price of one symbol on one date."""

    symbol: str
    date: str
    opening_price: float
    closing_price: float

    def to_dict(self):
        """Return the fields as a dictionary, in declaration order."""
        keys = ("symbol", "date", "opening_price", "closing_price")
        return {key: getattr(self, key) for key in keys}

@dataclass
class Label:
    """Volatility label attached to one earnings-call transcript.

    Mirrors the columns this notebook reads from the ``label`` table when it
    joins labels to transcript summaries.
    """

    # Identifier of the label; transcript_prediction.true_label_id refers to it.
    id: int
    # Company symbol; joined against transcript_summary.symbol.
    symbol: str
    # Date of the transcript; joined against transcript_summary.date.
    transcript_date: str
    # Presumably the price on the day of the earnings call (TODO confirm).
    price_day_of_meeting: float
    # Presumably the aggregate the label was derived from (TODO confirm).
    avg_value: float
    # Presumably window parameters used when computing the label (TODO confirm).
    delta_days: int
    total_days: int
    # Labelling scheme; this notebook selects "STANDARDDEVIATION2CLASS".
    avg_type: str
    # Class name, e.g. "VOLATILE" or "NOT VOLATILE" for the two-class scheme.
    label: str


@dataclass
class Predicted_Label:
    """A class predicted for one transcript, linked to its true label.

    ``true_label_id`` was previously declared twice; the redundant second
    declaration is removed. Field order, and therefore the generated
    ``__init__`` signature, is unchanged.
    """

    id: int
    # Identifier of the ground-truth label this prediction is compared with.
    true_label_id: int
    symbol: str
    transcript_date: str
    # Predicted class name.
    label: str
    # Names of the summarization and classification methods that were used.
    summarization_method: str
    classification_method: str

@dataclass
class TranscriptSummary:
    """A summarized transcript together with the method that produced it."""

    symbol: str
    date: str
    year: int
    quarter: int
    summarized_transcript: str
    summarization_method: str

    def to_dict(self):
        """Return the fields as a dictionary, in declaration order."""
        keys = (
            "symbol",
            "date",
            "year",
            "quarter",
            "summarized_transcript",
            "summarization_method",
        )
        return {key: getattr(self, key) for key in keys}

### DB Util
Utility for interacting with the database

In [None]:
class DB_Util:
    """Small convenience wrapper around one ``sqlite3`` connection.

    Failures are reported through return values rather than exceptions: the
    exception is printed and the method returns ``False`` (writes) or ``[]``
    (queries), so a failing statement does not abort the notebook.
    """

    def __init__(self, database_path: str):
        """Open a connection to the SQLite database at ``database_path``."""
        self.con = sq.connect(database_path)

    def _execute_write(self, sql: str, params: List) -> bool:
        """Run one modifying statement, committing only if it succeeds.

        On failure the transaction is rolled back instead of committed, so a
        failed statement never leaves partial work on the shared connection.
        """
        cursor = self.con.cursor()
        try:
            cursor.execute(sql, params)
            self.con.commit()
            return True
        except Exception as e:
            # Best-effort API: report the problem and signal failure.
            print(e)
            self.con.rollback()
            return False
        finally:
            cursor.close()

    def create_or_drop_table(self, sql: str) -> bool:
        """Execute a parameterless DDL statement.

        Returns:
            ``True`` if the statement ran and was committed, ``False`` if it
            raised (the exception is printed).
        """
        return self._execute_write(sql, [])

    def execute_query(self, sql: str, args: List) -> List:
        """Run a query with bound parameters and return all rows.

        Args:
            sql: Statement text, using ``?`` placeholders.
            args: Values bound to the placeholders.

        Returns:
            The list of row tuples, or ``[]`` if the query raised (the
            exception is printed).
        """
        cursor = self.con.cursor()
        try:
            return cursor.execute(sql, args).fetchall()
        except Exception as e:
            print(e)
            return []
        finally:
            cursor.close()

    def insert_data(self, sql: str, data: List) -> bool:
        """Execute one data-modifying statement with bound parameters.

        Returns:
            ``True`` if the statement ran and was committed, ``False`` if it
            raised (the exception is printed and the change rolled back).
        """
        return self._execute_write(sql, data)

    def close_connection(self):
        """Close the connection and drop the reference to it.

        The instance must not be used afterwards: ``self.con`` no longer
        exists.
        """
        self.con.close()
        del self.con

## Database Connection and Operations

### Create Database Connection
In order to interact with the database, simply pass the path to the database to the constructor.

In [None]:
# Single shared connection used by every database cell in this notebook.
db_util = DB_Util(database_path=database_path)

### Create Predictions Table

In [None]:
# DDL for the table this notebook owns. IF NOT EXISTS makes the cell safe to
# re-run. The two FOREIGN KEY constraints and the PRIMARY KEY constraint are
# comma-separated, as standard SQL requires (the comma before PRIMARY KEY was
# previously missing).
table_definitions = [
    '''
    CREATE TABLE IF NOT EXISTS transcript_prediction(
        symbol CHAR[10] NOT NULL,
        year INTEGER,
        quarter INTEGER,
        date CHAR[10],
        summarization_method CHAR[80],
        classification_method CHAR[80],
        predicted_class CHAR[20],
        true_label_id INTEGER,
        FOREIGN KEY (symbol) REFERENCES company(symbol),
        FOREIGN KEY (true_label_id) REFERENCES label(id),
        PRIMARY KEY(symbol, year, quarter, date, summarization_method, classification_method, true_label_id)
    )
    '''
]

# One boolean per definition: True when the statement executed successfully.
table_creation_status = [
    db_util.create_or_drop_table(
        sql=table_definition
    )
    for table_definition in table_definitions
]
print(table_creation_status)

In [None]:
# db_util.create_or_drop_table(
#     '''
#     DROP TABLE IF EXISTS transcript_prediction 
#     '''    
# )

### Queries
Useful queries

In [None]:
def fetch_all_transcripts(db_util: DB_Util) -> List[Transcript]:
    """Load every row of the ``transcript`` table as a ``Transcript``.

    Columns are read by position (symbol, year, quarter, date, transcript),
    so the result depends on the table's column order.
    """
    # ``WHERE ?`` bound to True matches every row; it exists only because
    # execute_query always takes a parameter list.
    rows = db_util.execute_query("SELECT * FROM transcript WHERE ?", [True])
    transcripts = []
    for row in rows:
        transcripts.append(
            Transcript(
                symbol=row[0],
                year=row[1],
                quarter=row[2],
                date=row[3],
                transcript=row[4],
            )
        )
    return transcripts

def fetch_transcript_by_company(db_util: DB_Util, company: str) -> List[Transcript]:
    """Load the transcripts whose ``symbol`` equals ``company``.

    Columns are read by position (symbol, year, quarter, date, transcript),
    so the result depends on the table's column order.
    """
    rows = db_util.execute_query("SELECT * FROM transcript WHERE symbol = ?", [company])
    transcripts = []
    for row in rows:
        transcripts.append(
            Transcript(
                symbol=row[0],
                year=row[1],
                quarter=row[2],
                date=row[3],
                transcript=row[4],
            )
        )
    return transcripts

def fetch_all_summaries(db_util: DB_Util) -> List[TranscriptSummary]:
    """Load every row of ``transcript_summary`` as a ``TranscriptSummary``.

    Columns are read by position (symbol, year, quarter, date, summarized
    transcript, summarization method), so the result depends on the table's
    column order.

    Returns:
        One ``TranscriptSummary`` per row; an empty list if the query fails.
    """
    # ``WHERE ?`` bound to True matches every row; it exists only because
    # execute_query always takes a parameter list.
    transcript_records = db_util.execute_query(
        "SELECT * FROM transcript_summary WHERE ?",
        [True]
    )
    return [
        TranscriptSummary(
            symbol=transcript_record[0],
            year=transcript_record[1],
            quarter=transcript_record[2],
            date=transcript_record[3],
            summarized_transcript=transcript_record[4],
            summarization_method=transcript_record[5],
        )
        for transcript_record in transcript_records
    ]


def fetch_summaries_by_method(db_util: DB_Util, method: str) -> List[TranscriptSummary]:
    """Load the summaries produced by one summarization method.

    Args:
        db_util: Open database helper.
        method: Value matched against the ``summarization_method`` column.

    Returns:
        One ``TranscriptSummary`` per matching row (columns read by
        position); an empty list if nothing matches or the query fails.
    """
    transcript_records = db_util.execute_query(
        "SELECT * FROM transcript_summary WHERE summarization_method = ?",
        [method]
    )
    return [
        TranscriptSummary(
            symbol=transcript_record[0],
            year=transcript_record[1],
            quarter=transcript_record[2],
            date=transcript_record[3],
            summarized_transcript=transcript_record[4],
            summarization_method=transcript_record[5],
        )
        for transcript_record in transcript_records
    ]


def fetch_summaries_by_company_and_method(db_util: DB_Util, company: str, method: str) -> List[TranscriptSummary]:
    """Load one company's summaries produced by one summarization method.

    Args:
        db_util: Open database helper.
        company: Value matched against the ``symbol`` column.
        method: Value matched against the ``summarization_method`` column.

    Returns:
        One ``TranscriptSummary`` per matching row (columns read by
        position); an empty list if nothing matches or the query fails.
    """
    transcript_records = db_util.execute_query(
        "SELECT * FROM transcript_summary WHERE symbol = ? and summarization_method = ?",
        [company, method]
    )
    return [
        TranscriptSummary(
            symbol=transcript_record[0],
            year=transcript_record[1],
            quarter=transcript_record[2],
            date=transcript_record[3],
            summarized_transcript=transcript_record[4],
            summarization_method=transcript_record[5],
        )
        for transcript_record in transcript_records
    ]


## Create Dataset

Huggingface supports conversion of databases into datasets. The cells below are used to construct a dataset by leveraging this functionality.

In [None]:
# List the distinct summarization methods present in transcript_summary.
# Each result row is a 1-tuple containing the method name.
summarization_methods = db_util.execute_query(
    '''
    SELECT DISTINCT(summarization_method) FROM transcript_summary t
    ''', 
    []
)

In [None]:
# Each row is a 1-tuple; unpack it into a dedicated name so the loop does not
# overwrite the module-level `summarization_method` setting with a tuple.
for (available_method,) in summarization_methods:
    print(available_method)

In [None]:
# Summarization method whose summaries are used to build the dataset; it must
# match a value stored in transcript_summary.summarization_method.
summarization_method = "human-centered-summarization/financial-summarization-pegasus-10-stride-10"

In [None]:
# Join every summary with its two-class standard-deviation volatility label.
# The summarization method is passed as a bound parameter rather than
# formatted into the SQL text, and the avg_type literal uses single quotes:
# double-quoted text is an identifier in SQL and is accepted as a string by
# SQLite only as a compatibility quirk.
ds = Dataset.from_sql(
    "SELECT t.symbol, t.summarized_transcript, t.date, t.year, l.label "
    "FROM transcript_summary t, label l "
    "WHERE t.symbol = l.symbol "
    "AND t.date = l.transcript_date "
    "AND l.avg_type = 'STANDARDDEVIATION2CLASS' "
    "AND t.summarization_method = ?",
    con=db_util.con,
    params=[summarization_method],
)


In [None]:
# Display the dataset summary (columns and row count).
ds

## Loading model and tokenizer
The cells below download the model and tokenizer for the experiment and save them locally, so that later runs load the saved copies instead of downloading them again.

In [None]:
# Name of the base checkpoint to fine-tune. The commented-out line is an
# alternative base model; swap the comments to try it.
# base_name = "DunnBC22/distilbert-base-uncased-Financial_Sentiment_Analysis"
base_name = "Svetlana0303/Classfication_longformer"

In [None]:
def load_model_tokenizer(name: str, tokenizer_dir: str, model_dir: str, total_classes: int):
    """Load a tokenizer and a sequence-classification model, caching both.

    Each artifact is loaded from ``<dir>/<name>`` when that directory exists
    and is non-empty; otherwise it is loaded by ``name`` and then saved to
    that directory for the next run.

    Args:
        name: Model name, also used as the sub-directory of both caches.
        tokenizer_dir: Base directory of the tokenizer cache.
        model_dir: Base directory of the model cache.
        total_classes: Number of output labels of the classification head.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """

    def _is_populated(directory: str) -> bool:
        # A cache entry counts only if the directory exists and has content.
        return os.path.exists(directory) and os.listdir(directory) != []

    tokenizer_path = os.path.join(tokenizer_dir, name)
    model_path = os.path.join(model_dir, name)

    tokenizer_cached = _is_populated(tokenizer_path)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path if tokenizer_cached else name)
    if not tokenizer_cached:
        tokenizer.save_pretrained(tokenizer_path)

    model_cached = _is_populated(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path if model_cached else name,
        torchscript=True,
        num_labels=total_classes,
        # Lets the classification head be re-created when the checkpoint's
        # label count differs from total_classes.
        ignore_mismatched_sizes=True,
    )
    if not model_cached:
        model.save_pretrained(model_path)

    return tokenizer, model


In [None]:
# Load (or download and cache) the base model with a 2-class head, matching
# the two entries of label_map.
tokenizer, model = load_model_tokenizer(base_name, tokenizer_base_path, classification_model_path, 2)

In [None]:
# Move the model to the selected device (GPU when available, else CPU).
model = model.to(device)

## Preprocessing Dataset
We are currently running 2 different types of experiments:
1. Testing model generalizability. This approach divides the dataset into training and test splits based on company. If the model accurately forecasts volatility for companies it was not trained on, this supports the claim that the approach generalizes well.
2. Testing the model's ability to forecast for a given company. This approach divides the dataset into training and test splits based on year. If the model accurately forecasts under these conditions, it supports the claim that we may be able to train company-specific models. 

In [None]:
# Integer class id for each label string stored in the database.
label_map = {"VOLATILE": 0, "NOT VOLATILE": 1}

# Companies whose transcripts form the training split of the
# company-based (generalization) experiment.
training_set_companies = [
    "AAPL", "MSFT", "AMZN", "NVDA",
    "GOOGL", "GOOG", "BRK.B", "TSLA",
    "META", "UNH", "XOM", "JNJ",
    "JPM", "V", "PG", "MA",
]

# Years whose transcripts form the training split of the year-based
# experiment: 2013 through 2020 inclusive.
training_set_years = list(range(2013, 2021))

In [None]:
def tokenize_function(batch):
    """Tokenize a batch of summaries and convert its labels to class ids.

    Args:
        batch: Mapping with a ``"summarized_transcript"`` list of texts and a
            ``"label"`` list of label strings that are keys of ``label_map``.

    Returns:
        The tokenizer output for the batch, padded to the maximum length and
        truncated, with ``"label"`` set to the integer class ids.
    """
    encoded = tokenizer(batch["summarized_transcript"], truncation=True, padding="max_length")
    encoded["label"] = list(map(label_map.__getitem__, batch["label"]))
    return encoded

In [None]:
# Tokenize the whole dataset in batches; the result keeps the original
# columns and adds the tokenizer outputs and integer labels.
tokenized_ds = ds.map(tokenize_function, batched=True)

In [None]:
def divide_dataset(dataset, filter_type:str, filter_list: list):
    training_set = tokenized_ds.filter(lambda x: x[filter_type] in filter_list)
    test_set = tokenized_ds.filter(lambda x: x[filter_type] not in filter_list)
    return training_set, test_set

In [None]:
training_set, test_set = divide_dataset(ds, "year", training_set_years)
# training_set, test_set = divide_dataset(ds, "symbol", training_set_companies)


In [None]:
# Display both splits to check their columns and sizes.
training_set, test_set

## Finetune model
Run the cells below to finetune the model. 

In [None]:
# Output directory for this base model's fine-tuning checkpoints.
finetuned_model_path = os.path.join(fine_tuned_model_base_path, base_name)

### Training Args

Arguments for training. Due to memory-related limitations, training batches must be limited to at most 2 instances.

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, then
# reload the best checkpoint when training ends.
training_args = TrainingArguments(
    output_dir=finetuned_model_path,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    # per_device_eval_batch_size=2,
    # Evaluation and checkpoint schedule.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

### Metrics

In [None]:
# Accuracy metric from the `evaluate` library, used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` evaluation pair.

    The predicted class of each example is the index of its largest logit
    along the last axis.
    """
    logits, labels = eval_pred
    return accuracy.compute(
        references=labels,
        predictions=np.argmax(logits, axis=-1),
    )

In [None]:
# Release cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

### Finetuning

The cells below fine-tune the model loaded above. Because our dataset is limited in size, we do not create a validation set. Because the Hugging Face Trainer requires an evaluation dataset, we pass the training set instead.

In [None]:
# The training set doubles as the evaluation set because no separate
# validation split exists.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=training_set,
    eval_dataset=training_set,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured by training_args.
trainer.train()

In [None]:
# Save the fine-tuned model under the "baseline_2" sub-directory.
trainer.save_model(os.path.join(finetuned_model_path,"baseline_2"))

### Evaluation
The cells below are used to assess the model on the test set.

In [None]:
# Run the fine-tuned model on the held-out test split.
results = trainer.predict(test_set)

In [None]:
# Ground-truth class ids of the test examples, as a tensor.
true_labels = torch.tensor(results.label_ids)

In [None]:
# Predicted class id per test example: index of the largest score in each row.
predicted_label = torch.tensor(results.predictions).argmax(dim=1)

In [None]:
# Confusion-matrix metric for the two volatility classes.
confusion_matrix = torchmetrics.ConfusionMatrix(task="multiclass", num_classes=2)

In [None]:
# Display the metrics reported for the test-set prediction run.
results.metrics

In [None]:
# Compute and display the confusion matrix (predictions first, then targets).
confusion_matrix(predicted_label, true_labels)

In [None]:
# Display the tokenizer's configured maximum sequence length.
tokenizer.model_max_length

In [None]:
# Release the Colab runtime once the notebook has finished.
# NOTE(review): presumably this also frees the GPU; confirm against the
# google.colab documentation.
runtime.unassign()