## Setup

In [None]:
#Installing the EDGAR tool
!pip install --user edgartools

#Installing openai and tiktoken
pip install --user openai
!pip install tiktoken

#OpenAI update
!pip install --upgrade openai

#Installing parser library
!pip install lxml

#Installing imbalanced
!pip install imbalanced-learn

In [None]:
#Importing all necessary modules
import pandas as pd
import tiktoken
import re
import datetime
from openai import OpenAI
import numpy as np
import matplotlib.pyplot as plt
import csv
import seaborn as sns
from bs4 import BeautifulSoup
from edgar import *
from ast import literal_eval
import os
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn import svm
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from imblearn.pipeline import Pipeline
from scipy.stats import randint
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [None]:
#Setting identity
# Registers an identity string with edgartools (presumably sent along with
# EDGAR requests — confirm in the edgartools docs). The value below is a
# placeholder and must be replaced before running.
from edgar import set_identity
set_identity("xxxxxxxxxxxxxxxxxxxxxx")

In [None]:
#Connecting to drive
# Colab-only cell: mounts Google Drive under /content/drive, the location the
# '/content/drive/My Drive/...' paths further down point to.
from google.colab import drive
drive.mount('/content/drive')

## Libraries

https://scikit-learn.org/stable/
https://pandas.pydata.org/
https://numpy.org/

## Functions

References:

Parsing Text:
https://medium.com/@jorlugaqui/how-to-strip-html-tags-from-a-string-in-python-7cb81a2bbf44
Tiktoken:
https://github.com/openai/tiktoken

In [None]:
#Parsing Text
def parsing_text(text):
    """Return the text content of ``text`` with all markup removed.

    The input is parsed with BeautifulSoup using the ``lxml`` parser. Each
    text fragment is stripped of surrounding whitespace and the fragments
    are joined with a single space.
    """
    return BeautifulSoup(text, 'lxml').get_text(' ', strip = True)

In [None]:
#Counting Tokens
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens ``string`` encodes to.

    ``encoding_name`` is the name of a tiktoken encoding (this notebook
    passes "cl100k_base").
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

## Tickers to CIK

In [None]:
#Import NYSE tickers
# Collect the first column of the ticker export.
tickersNYSE = []

# newline='' is the mode the csv module documents for files handed to csv.reader.
with open('c:/Users/xxxxx/Downloads/tiNYSE.csv', 'r', newline='') as file:
    for row in csv.reader(file):
        if row:  # a blank line yields an empty row; row[0] would raise IndexError
            tickersNYSE.append(row[0])

# Drop the header cell (raises ValueError if the file has no 'TICKER' header).
tickersNYSE.remove('TICKER')

# Two batches, looked up separately in the next step.
tickersNYSEa = tickersNYSE[:1750]
tickersNYSEb = tickersNYSE[1750:]

print(len(tickersNYSE))
print(len(tickersNYSEa))
print(len(tickersNYSEb))

Matching the tickers to a CIK in the EDGAR database and writing to a ciksA and ciksB file

In [None]:
#Finding CIK numbers

def lookup_ciks(tickers):
    """Return the CIKs of all tickers that resolve to a company.

    Each ticker is looked up once via ``Company``. Tickers for which the
    lookup returns None, or whose company has no CIK, are skipped.
    """
    found = []
    for ticker in tickers:
        company = Company(ticker)
        if company is None:
            continue
        if company.cik is not None:
            found.append(company.cik)
    return found


# Batch a goes to ciksA.csv and batch b to ciksB.csv; the combining step
# reads both files.
ciks_a = lookup_ciks(tickersNYSEa)
ciks_b = lookup_ciks(tickersNYSEb)

for batch, path in (
    (ciks_a, 'c:/Users/xxxxx/Downloads/ciksA.csv'),
    (ciks_b, 'c:/Users/xxxxx/Downloads/ciksB.csv'),
):
    # Each file holds a single row with all CIKs of the batch.
    with open(path, mode='w', newline='') as file:
        csv.writer(file).writerow(batch)

Opening the two csv files and writing to a list. Combining the lists and writing to a new csv file

In [None]:
# Read both CIK files back and merge them into one file.
ciksA = []
ciksB = []

with open('c:/Users/xxxxx/Downloads/ciksA.csv', mode='r', newline='') as file:
    # Every row is flattened into ciksA (the original appended to an
    # undefined name, leaving ciksA empty).
    for row in csv.reader(file):
        ciksA.extend(row)

with open('c:/Users/xxxxx/Downloads/ciksB.csv', mode='r', newline='') as file:
    for row in csv.reader(file):
        ciksB.extend(row)

ciks_combined = ciksA + ciksB

# Single-row output, the layout the CIK import cell reads back.
with open('c:/Users/xxxxx/Downloads/ciks_numbers.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(ciks_combined)

## Getting the filings from the EDGAR Database

References:
https://pypi.org/project/edgartools/

Getting each year separately

In [None]:
#Importing ciks
# Flatten every cell of the CIK file into one list of strings, then build the
# integer version used for matching against filings.
with open('c:/Users/xxxxx/Downloads/ciks_numbers.csv', mode='r') as file:
    ciks_total = [cell for row in csv.reader(file) for cell in row]

ciks_numbers = [int(cik_text) for cik_text in ciks_total]

print(len(ciks_total))


#Get filings per Quarter

def collect_quarter_filings(year, quarter, wanted_ciks):
    """Return ``{"<cik>_<filing_date>": parsed text}`` for one quarter's 8-K filings.

    Only filings whose CIK is in ``wanted_ciks`` are downloaded and parsed.
    If two filings produce the same key (same CIK and filing date), the one
    encountered later replaces the earlier one.
    """
    wanted = set(wanted_ciks)  # constant-time membership test per filing
    filings = get_filings(year, quarter, form=["8-K"])
    parsed = {}
    for i in range(len(filings)):
        filing = filings[i]
        if filing.cik not in wanted:
            continue
        key = str(filing.cik) + '_' + str(filing.filing_date)
        # Keep only what precedes the first closing body tag.
        body = filing.text().split('</body>')[0].lstrip()
        parsed[key] = parsing_text(body)
    return parsed


def save_filings(filings_by_key, path):
    """Write one ``key, text`` row per filing to ``path``.

    The file is opened in a ``with`` block so it is flushed and closed, and
    with ``newline=''`` so no blank line is inserted between rows on Windows.
    """
    with open(path, "w", encoding='utf-8', newline='') as out:
        csv.writer(out).writerows(filings_by_key.items())


# Output file for each quarter of 2018.
QUARTER_FILES_2018 = {
    1: "c:/Users/xxxxx/Downloads/1st2018.csv",
    2: "c:/Users/xxxxx/Downloads/2nd2018.csv",
    3: "c:/Users/xxxxx/Downloads/3rd2018.csv",
    4: "c:/Users/xxxxx/Downloads/4th2018.csv",
}

for quarter, path in QUARTER_FILES_2018.items():
    Dict_Filter = collect_quarter_filings(2018, quarter, ciks_numbers)
    #Saving the output to csv
    save_filings(Dict_Filter, path)

## Getting the filings for transactions

References:
https://pypi.org/project/edgartools/

In [None]:
# Load the CIKs of the companies that appear in the transactions data.
# Both lists are rebuilt from scratch: appending to the lists left over from
# the quarterly download would mix the two CIK sets and duplicate entries.
ciks_total = []
with open('c:/Users/xxxxx/Downloads/Thesis Data Science/Data/SDC/ciks_transactions_NYSE.csv', mode='r', newline='') as file:
    for row in csv.reader(file):
        ciks_total.extend(row)

ciks_numbers = [int(c) for c in ciks_total]

print(len(ciks_total))

# Two batches. NOTE(review): only ciksA is processed below; ciksB presumably
# needs a second run with a different output file — confirm.
ciksA = ciks_numbers[:296]
ciksB = ciks_numbers[296:]

print(len(ciksA))
print(len(ciksB))

# Maps "<cik>_<filing_date>" to parsed filing text; a later filing with the
# same key replaces the earlier one.
Dict_Filter = {}
for cik in ciksA:
    company = Company(cik)
    if company is None:
        continue
    company_p = company.get_filings(form="8-K").filter(date="2017-12-31:2022-12-31")
    if company_p is None:
        continue
    for i in range(len(company_p)):
        filing = company_p[i]
        key_p = str(cik) + '_' + str(filing.filing_date)
        # Keep only what precedes the first closing body tag.
        body = filing.text().split('</body>')[0].lstrip()
        Dict_Filter[key_p] = parsing_text(body)

#Saving the output to csv
# The with-block guarantees the file is flushed and closed; newline='' avoids
# blank lines between rows on Windows.
with open("c:/Users/xxxxx/Downloads/filings_SDC_NYSE_A.csv", "w", encoding='utf-8', newline='') as out:
    csv.writer(out).writerows(Dict_Filter.items())

## Converting to a dataframe

For each year and for the transactions

In [None]:
# Filing texts are far larger than the csv module's default field limit.
csv.field_size_limit(100000000)


def load_filings_csv(path):
    """Read a ``key, text`` CSV into a dict.

    Rows that do not have exactly two cells are ignored. The file is opened
    with ``newline=''`` so line breaks inside quoted fields are parsed as
    part of the field.
    """
    loaded = {}
    with open(path, mode="r", encoding="utf-8", newline="") as file:
        for row in csv.reader(file):
            if len(row) == 2:
                key, text = row
                loaded[key] = text
    return loaded


# One dict per quarter, keyed by "<cik>_<filing_date>".
Dict_1_2018 = load_filings_csv("c:/Users/xxxxx/Downloads/Thesis Data Science/Output/1st2018.csv")
Dict_2_2018 = load_filings_csv("c:/Users/xxxxx/Downloads/Thesis Data Science/Output/2nd2018.csv")
Dict_3_2018 = load_filings_csv("c:/Users/xxxxx/Downloads/Thesis Data Science/Output/3rd2018.csv")
Dict_4_2018 = load_filings_csv("c:/Users/xxxxx/Downloads/Thesis Data Science/Output/4th2018.csv")

In [None]:
def filings_dict_to_frame(filings_by_key):
    """Turn a ``{"<cik>_<date>": text}`` dict into a DataFrame.

    Returns a frame with the columns ``index`` (the original key),
    ``combined`` (the text), ``cik`` (int) and ``date`` (string).
    """
    frame = pd.DataFrame(filings_by_key, index=['combined']).T.reset_index()
    # The key is "<cik>_<date>"; split it back into its two parts.
    frame[['cik','date']] = frame['index'].str.split('_', expand=True)
    frame['cik'] = frame['cik'].astype(int)
    return frame


df_2018_1 = filings_dict_to_frame(Dict_1_2018)
df_2018_2 = filings_dict_to_frame(Dict_2_2018)
df_2018_3 = filings_dict_to_frame(Dict_3_2018)
df_2018_4 = filings_dict_to_frame(Dict_4_2018)


In [None]:
# Merge the quarterly frames into two half-year frames. The input lists are
# built first, so both concatenations see the original quarterly frames.
# Re-running this cell without re-running the previous one stacks
# already-merged frames.
first_half = [df_2018_1, df_2018_2]
second_half = [df_2018_3, df_2018_4]
df_2018_1 = pd.concat(first_half, ignore_index=True)
df_2018_2 = pd.concat(second_half, ignore_index=True)

In [None]:
# Save the half-year frames. The file names use the 'df_NYSE_' prefix, which
# is the name the later loading step reads ('df_NYSE_2018_1.csv').
df_2018_1.to_csv('c:/Users/xxxxx/Downloads/df_NYSE_2018_1.csv', index=False)
df_2018_2.to_csv('c:/Users/xxxxx/Downloads/df_NYSE_2018_2.csv', index=False)

## Getting the transactions

In [None]:
#Get Deals dataframe
df_tickerdeals = pd.read_csv('c:/Users/xxxxx/Downloads/Deals & Dates NYSE.csv')

#Removing where CIK is 0
# .copy() makes this an independent frame, so the DATE assignment below does
# not write into a slice of df_tickerdeals (SettingWithCopyWarning).
df_cik_transactions = df_tickerdeals[df_tickerdeals['cik'] != 0].copy()

#Transforming the date from day/month/year to ISO year-month-day strings
df_cik_transactions['DATE'] = pd.to_datetime(df_cik_transactions['DATE'], format='%d/%m/%Y').dt.strftime('%Y-%m-%d')

#Writing to a file
# The index is written on purpose: it comes back as the 'Unnamed: 0' column
# that the feature-selection step drops by name.
df_cik_transactions.to_csv('c:/Users/xxxxx/Downloads/df_trans_NYSE_adj.csv')
        

## Removing large filings and matching the dataframe on CIKS

In [None]:
#Opening the NYSE Dataframes
# Loads one filings export; the cells below filter it by token count and
# match it with the transactions.
df_combined= pd.read_csv('c:/Users/xxxxx/Downloads/df_NYSE_2018_1.csv')

In [None]:
#Counting tokens
# Token count of every filing text under the cl100k_base encoding.
df_combined['Tokens'] = df_combined['combined'].apply(num_tokens_from_string, encoding_name="cl100k_base")

# omit 8-K files that are too long to embed
condition_tokens = df_combined['Tokens'] > 8000
count_of_8000 = condition_tokens.sum()

print(count_of_8000)

df_combined = df_combined.loc[~condition_tokens]

In [None]:
#Get Deals tickers
df_cik_transactions = pd.read_csv('c:/Users/xxxxx/Downloads/df_trans_NYSE_adj.csv')

#Adding transactions to dataframe
# Left join: every filing is kept; deal columns are empty where no deal matches.
df_combined = pd.merge(df_combined, df_cik_transactions, on='cik', how='left')

#Setting Transaction to 1 where a deal was matched (TICKER present), else 0
has_deal = df_combined['TICKER'].notna()
df_combined['Transaction'] = has_deal.astype('int64')

count_of_ones = df_combined['Transaction'].eq(1).sum()

#Removing after dates
df_combined = df_combined.sort_values(by = ['cik','date'])

# Both date columns hold ISO 'YYYY-MM-DD' strings, so string comparison
# follows chronological order. Filings dated after the deal date are dropped.
filed_after_deal = (df_combined['date'] > df_combined['DATE']) & (df_combined['Transaction'] == 1)
df_combined = df_combined[~filed_after_deal]

#Writing to a file
df_combined.to_csv('c:/Users/xxxxx/Downloads/shortened/df_NYSE_2018_1_short.csv', index=False)

## Combining Files

Creating 4 combined files

In [None]:
# Combine the two shortened half-year files into one.
# The original read the first half into both frames (duplicating every row)
# and had an unbalanced '[' in the concat line.
df_1 = pd.read_csv('c:/Users/xxxxx/Downloads/df_NYSE_2018_1_short.csv')
df_2 = pd.read_csv('c:/Users/xxxxx/Downloads/df_NYSE_2018_2_short.csv')

df_combined = pd.concat([df_1, df_2], ignore_index=True)

df_combined.to_csv('c:/Users/xxxxx/Downloads/df_NYSE_1_short.csv', index=False)

## Embedding the filings

References:
https://pypi.org/project/edgartools/

In [None]:
#Opening the dataframe
df_combined = pd.read_csv('c:/Users/xxxxx/Downloads/df_NYSE_4_short.csv')

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"


#Obtaining the Embeddings
# The key is read from the OPENAI_API_KEY environment variable so it is never
# stored in the notebook; a missing variable raises KeyError right here.
api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI(api_key= api_key)


def get_embedding(text, model=embedding_model):
    """Return the embedding vector of ``text`` from the OpenAI embeddings endpoint.

    Line breaks are replaced by spaces before the request. ``model`` defaults
    to the notebook-wide ``embedding_model``.
    """
    text = text.replace("\n", " ")
    return client.embeddings.create(input = [text], model=model).data[0].embedding


# One request per filing.
df_combined['ada_embedding'] = df_combined.combined.apply(lambda x: get_embedding(x, model=embedding_model))


#Write to a file
df_combined.to_csv('c:/Users/xxxxx/Downloads/Thesis Data Science/Embedded/df_NYSE_embedded_4.csv', index=False)

Upload the embedded files to Google Drive and continue in Google Colab to use its GPUs.

## Cutting down Size

In [None]:
# Load all four embedded parts; the concatenation step needs df_final_1 to
# df_final_4. Parts 2-4 are assumed to follow the naming of part 1.
df_final_1, df_final_2, df_final_3, df_final_4 = (
    pd.read_csv(f'/content/drive/My Drive/Thesis Backup/df_NYSE_embedded_{part}.csv')
    for part in (1, 2, 3, 4)
)

In [None]:
# Stack the four embedded parts into one frame with a fresh index.
# Expects df_final_1 to df_final_4 to be loaded already.
df_final = pd.concat([df_final_1,df_final_2, df_final_3, df_final_4], ignore_index=True)

In [None]:
#Reducing Size
#Getting target and non-target filings
grouped = df_final.groupby('Transaction')

# Group 1 holds filings flagged as transactions, group 0 the rest.
df_acquired, df_not_acquired = (grouped.get_group(label) for label in (1, 0))

In [None]:
#Getting number of Targets and Non-Targets in Final Dataset
# Distinct CIKs overall, among targets, and among non-targets.
uniqueValues, uniqueValues1, uniqueValues2 = (
    frame['cik'].nunique() for frame in (df_final, df_acquired, df_not_acquired)
)
print(uniqueValues)
print(uniqueValues1, uniqueValues2)

## Rebalancing Models

References:
https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
https://cookbook.openai.com/examples/classification_using_embeddings

In [None]:
# Getting X, y and companies
# Everything except the listed bookkeeping columns stays in the feature frame.
columns_to_drop = ['date','Transaction','Tokens','combined', 'DATE','TICKER','Unnamed: 0','index']
X_s = df_final.drop(columns=columns_to_drop)
y = df_final['Transaction']

In [None]:
# Splitting the data based on ciks
# The split is done on companies, not filings, so all filings of one company
# end up on the same side.
unique_ciks = df_final['cik'].unique()
train_ciks, test_ciks = train_test_split(unique_ciks, test_size=0.2, random_state=42)

In [None]:
#Creating boolean masks
# True for every filing whose company is in the respective CIK set.
train_mask = df_final['cik'].isin(train_ciks)
test_mask = df_final['cik'].isin(test_ciks)

In [None]:
#Splitting the Dataset
# Training rows of the features and of the label.
X_train_s, y_train = X_s.loc[train_mask], y.loc[train_mask]

In [None]:
# Test rows of the features and of the label.
X_test_s, y_test = X_s.loc[test_mask], y.loc[test_mask]

In [None]:
# Persist the four split parts without their index.
for split_frame, split_path in (
    (X_train_s, '/content/drive/My Drive/Thesis Backup/X_train_s.csv'),
    (X_test_s, '/content/drive/My Drive/Thesis Backup/X_test_s.csv'),
    (y_train, '/content/drive/My Drive/Thesis Backup/y_train.csv'),
    (y_test, '/content/drive/My Drive/Thesis Backup/y_test.csv'),
):
    split_frame.to_csv(split_path, index = False)

In [None]:
# Reload the saved split parts. After this, y_train and y_test are DataFrames
# (single 'Transaction' column), not Series.
X_train_s, X_test_s, y_train, y_test = (
    pd.read_csv(split_path)
    for split_path in (
        '/content/drive/My Drive/Thesis Backup/X_train_s.csv',
        '/content/drive/My Drive/Thesis Backup/X_test_s.csv',
        '/content/drive/My Drive/Thesis Backup/y_train.csv',
        '/content/drive/My Drive/Thesis Backup/y_test.csv',
    )
)

In [None]:
# Convert to array
# The embeddings come back from CSV as the text of a Python list.
# literal_eval parses that literal without executing arbitrary code, which a
# plain eval on file contents would do.
X_train_np = X_train_s.ada_embedding.apply(literal_eval).apply(np.array)
X_test_np = X_test_s.ada_embedding.apply(literal_eval).apply(np.array)

In [None]:
# Plain lists of embedding arrays, the form handed to the samplers and models.
X_train = X_train_np.tolist()
X_test = X_test_np.tolist()

In [None]:
#Dealing with imbalanced data
# Both samplers are seeded so the resampled training set is the same on every
# run, in line with the fixed seeds used for the split and the classifiers.
# Over-sample the minority class first, then under-sample the majority class.
over = SMOTE(sampling_strategy= 0.55, random_state = 42)
under = RandomUnderSampler(sampling_strategy= 1.00, random_state = 42)

steps= [('o', over),('u',under)]
pipeline = Pipeline(steps=steps)

In [None]:
# Run the over-/under-sampling pipeline on the training data only; the test
# data is left untouched.
X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)

In [None]:
#Original Distribution
# L2-regularised logistic regression with a fixed seed and a raised iteration cap.
classifier = LogisticRegression(random_state = 42, penalty = 'l2', max_iter = 1000)

In [None]:
# Five shuffled, stratified folds with a fixed seed.
stratified_kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

In [None]:
# Evaluate the model using cross-validation
# Use the shuffled, seeded splitter defined above; a bare cv=5 ignores it and
# cuts the resampled data into unshuffled folds.
# NOTE(review): the folds are drawn from already-resampled data, so synthetic
# SMOTE samples can sit in a different fold than their source rows — the
# scores are likely optimistic; consider resampling inside each fold.
scores = cross_val_score(classifier, X_resampled, y_resampled['Transaction'], cv=stratified_kfold, scoring='roc_auc')

In [None]:
# Report the mean fold score with a band of two standard deviations.
print("AUC-ROC Cross-Validation Results:")
print(f"AUC-ROC: {scores.mean():.4f} (±{scores.std() * 2:.4f})")

## Using Final Distribution

Logistic Regression

In [None]:
# Fit the model on the entire resampled training data
# y_resampled is a DataFrame here, hence the explicit 'Transaction' column.
classifier.fit(X_resampled, y_resampled['Transaction'])

In [None]:
# Hard 0/1 class predictions on the untouched test data.
y_pred_lr = classifier.predict(X_test)

In [None]:
#Testing Logistic Regression
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)

# A ROC curve needs a continuous score. Hard 0/1 predictions reduce it to a
# single operating point, so the probability of class 1 is used instead.
y_score_lr = classifier.predict_proba(X_test)[:, 1]
fpr_lr, tpr_lr, thresholds = roc_curve(y_test , y_score_lr)
AUC_lr = auc(fpr_lr, tpr_lr)

print(f"Accuracy: {accuracy_lr:.2f}")
print(f"Precision: {precision_lr:.2f}")
print(f"AUC: {AUC_lr:.2f}")

Linear SVM

In [None]:
#Training SVM-Linear Model (default settings)
# NOTE(review): the next cell rebinds clf and y_pred_svm, so this fit does not
# reach the SVM evaluation when the notebook is run top to bottom.
clf = svm.LinearSVC()
clf.fit(X_resampled, y_resampled['Transaction'])
y_pred_svm = clf.predict(X_test)

In [None]:
#Training SVM-Linear Model (C = 0.01)
# This is the model the SVM evaluation below uses.
clf = svm.LinearSVC(C = 0.01, max_iter = 1000)
clf.fit(X_resampled, y_resampled['Transaction'])
y_pred_svm = clf.predict(X_test)

In [None]:
#Testing SVM
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)

# LinearSVC has no probabilities; its signed distance to the separating
# hyperplane is the continuous score the ROC curve needs. Hard 0/1
# predictions would reduce the curve to a single operating point.
y_score_svm = clf.decision_function(X_test)
fpr_svm, tpr_svm, thresholds = roc_curve(y_test , y_score_svm)
AUC_svm = auc(fpr_svm, tpr_svm)

print(f"Accuracy SVM: {accuracy_svm:.2f}")
print(f"Precision SVM: {precision_svm:.2f}")
print(f"AUC SVM: {AUC_svm:.2f}")

## Random Forest

In [None]:
#Finding the best parameters
# Search space: three forest sizes and a tree depth drawn from randint(10, 44).
param_dist = {'n_estimators': [100,300,500],
              'max_depth': randint(10,44)}
# Seeded like the other models so repeated runs grow the same forests.
rf = RandomForestClassifier(random_state = 42)

In [None]:
# Ten sampled parameter settings, each scored by 5-fold ROC AUC. The seed
# fixes which settings are drawn, so the search is repeatable.
rand_search = RandomizedSearchCV(rf, param_distributions = param_dist, n_iter=10, cv=5, scoring = 'roc_auc', random_state = 42)

In [None]:
# Run the search on the resampled training data.
rand_search.fit(X_resampled, y_resampled['Transaction'])

In [None]:
# Create a variable for the best model
# best_estimator_ is the model the search selected; it is used for the test
# predictions.
best_rf = rand_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

# Print the best hyperparameters
print('Best hyperparameters:',  rand_search.best_params_)

In [None]:
# NOTE(review): rf is constructed with fixed hyperparameters but never fitted
# or used in the visible code; the predictions below still come from best_rf.
# Confirm whether rf was meant to be fitted and evaluated instead.
rf = RandomForestClassifier(n_estimators = 300, max_depth = 10, random_state = 42)
y_pred_rf = best_rf.predict(X_test)

In [None]:
#Testing RF
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)

# The ROC curve is built from the predicted probability of class 1. Hard 0/1
# predictions would reduce it to a single operating point.
# best_rf is the model that produced y_pred_rf.
y_score_rf = best_rf.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, thresholds = roc_curve(y_test , y_score_rf)
AUC_rf = auc(fpr_rf, tpr_rf)

print(f"Accuracy RF: {accuracy_rf}")
print(f"Precision RF: {precision_rf}")
print(f"AUC RF: {AUC_rf}")

## BERT Model

References: 
https://huggingface.co/docs/transformers/model_doc/bert#transformers.TFBertTokenizer

In [None]:
#Checking for a GPU
# torch is imported here rather than in the main import cell so the earlier
# sections still run in an environment without PyTorch.
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
#Setting up the BERT model and tokenizer
# NOTE(review): AutoTokenizer and BertModel are not imported anywhere in the
# visible notebook. They are presumably the Hugging Face `transformers`
# classes — confirm and add the import before this cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
# Move the weights to the device chosen above.
model.to(device)

In [None]:
# Filing texts to embed with BERT (read from the 'combined' column below).
df_model = pd.read_csv('/content/drive/My Drive/Thesis Backup/df_NYSE_4_short.csv')

In [None]:
#Create a list
# All filing texts, in frame order.
texts = list(df_model['combined'])

print(len(texts))

In [None]:
#Running the model
embeddings =[]
# Inference only: no_grad() stops PyTorch from recording the forward pass for
# a backward pass that never happens, which saves memory and time per filing.
with torch.no_grad():
    for text in texts:
        # Texts longer than 512 tokens are truncated.
        inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors="pt").to(device)
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        # Average over dim 1 (assumed to be the token axis — confirm) to get
        # one vector per text, then move it to the CPU as a flat NumPy array.
        flattened_embedding = torch.mean(last_hidden_states, dim=1).flatten().detach().cpu().numpy()
        embeddings.append(flattened_embedding)

In [None]:
# Convert each NumPy vector to a plain Python list so it can be stored in a
# DataFrame column and written to CSV.
embeddings_list = [emb.tolist() for emb in embeddings]

In [None]:
#Adding the embeddings to a dataframe
# One embedding list per row, in the same order as the texts were read.
df_model['Bert']= embeddings_list
print(df_model.head())

In [None]:
# Save the frame including the new 'Bert' column.
df_model.to_csv('/content/drive/My Drive/Thesis Backup/df_NYSE_BERT_4.csv', index = False)

Now follow the same steps as for the text-embedding-ada-002 model.

## Adding a prompt

In [None]:
#Opening the dataframe
df_combined = pd.read_csv('c:/Users/xxxxx/Downloads/df_NYSE_1_short.csv')

# Prefix every filing with the prompt. The trailing space keeps the prompt's
# final period from being fused to the first word of the filing text.
df_combined['prompt'] = "Let's think step by step if this company will be acquired. " + df_combined['combined']

Now follow the same steps as for the text-embedding-ada-002 model.

## Histograms, ROC curves and Confusion Matrices 

In [None]:
#Histogram Targets
# Distribution of token counts over the filings of acquired companies.
fig, ax = plt.subplots()
ax.hist(df_acquired['Tokens'], bins=25, color = 'darkblue')
ax.set_xlabel("Token Size")
ax.set_ylabel("Number of Filings")
fig.savefig("/content/drive/My Drive/Thesis Backup/HistogramA.png")
plt.show()

In [None]:
#ROC Curve
# One curve per model plus the diagonal of a random classifier.
plt.figure()
plt.plot(fpr_svm, tpr_svm, color='blue', lw=2, label='SVM (area = %0.2f)' % AUC_svm)
plt.plot(fpr_lr, tpr_lr, color='green', lw=2, label='Logistic Regression (area = %0.2f)' % AUC_lr)
plt.plot(fpr_rf, tpr_rf, color='red', lw=2, label='Random Forest (area = %0.2f)' % AUC_rf)
plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random Classifier')
# Axis labels and a title so the saved figure can be read on its own.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves")
plt.legend(loc='lower right')
plt.savefig("/content/drive/My Drive/Thesis Backup/ROC_curves_ada.png")
plt.show()

In [None]:
#Confusion Matrix Logistic Regression
# Counts of true versus predicted classes on the test data.
cm_lr = confusion_matrix(y_test, y_pred_lr)

# Heatmap annotated with the integer counts.
sns.heatmap(cm_lr, annot = True, fmt="d", cmap = "Greens")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.savefig("/content/drive/My Drive/Thesis Backup/confusion_matrix_lr_ada.png")
plt.show()