In [3]:


import re
from transformers import AutoTokenizer, AutoModelForMaskedLM
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter
import pandas as pd


# Download the NLTK 'punkt' and 'stopwords' packages (the recorded output shows both
# were already up to date on the machine that last ran this cell).
nltk.download('punkt')
nltk.download('stopwords')

# Load the tokenizer and a masked-LM model for the FinBERT tone checkpoint.
# NOTE(review): neither `tokenizer` nor `model` is used by the word-frequency steps
# that follow in this cell, and the recorded output warns that some BertForMaskedLM
# weights were not initialized from the checkpoint — confirm this model class is the
# intended one (or whether the load is needed here at all).
model_name = 'yiyanghkust/finbert-tone'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)


# Read the whole input document into memory as UTF-8 text.
file_path = 'input_text.txt'  # Replace with your file path if different

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()


# NLTK's English stop-word list, as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Extend the standard list with extra words that should also be dropped by preprocess_text.
custom_stopwords = stop_words.union({"company", "business", "include", "may", "could", "also", 
                                     "would", "page", "see", "us", "used", "result", "ensure", 
                                     "certain", "based", "account", "amounts", "objects", "continue"})


def preprocess_text(text):
    """Lower-case, strip punctuation, tokenise and filter a document.

    Every non-word character is replaced by a space before tokenising with
    NLTK's ``word_tokenize``. Tokens that are a single character long or that
    appear in ``custom_stopwords`` are discarded.

    Args:
        text: Raw document text.

    Returns:
        list[str]: The surviving tokens, in document order.
    """
    normalised = re.sub(r'\W', ' ', text.lower())

    kept_tokens = []
    for token in word_tokenize(normalised):
        # Skip one-character leftovers and anything on the stop list.
        if len(token) <= 1 or token in custom_stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens


# Clean and tokenise the document loaded above.
filtered_words = preprocess_text(text)

# Tally the tokens. Counter.most_common() yields (word, count) pairs from most to
# least frequent, with ties kept in first-seen order.
word_freq = Counter(filtered_words)
sorted_word_freq = dict(word_freq.most_common())

# Tabulate the ranking, show it, and save it as a CSV file without the index column.
word_freq_df = pd.DataFrame(
    list(sorted_word_freq.items()),
    columns=['Word', 'Frequency'],
)
print(word_freq_df)
word_freq_df.to_csv('financial_word_frequencies.csv', index=False)

[nltk_data] Downloading package punkt to /Users/sudarsan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sudarsan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of BertForMaskedLM were not initialized from the model checkpoint at yiyanghkust

                  Word  Frequency
0           operations        137
1            financial        134
2             partners        134
3                india        130
4               equity        121
...                ...        ...
2973          validity          1
2974         fiduciary          1
2975            duties          1
2976  responsibilities          1
2977         asserting          1

[2978 rows x 2 columns]


In [None]:

import re
from collections import Counter
import torch
from transformers import BertTokenizer, BertModel, pipeline
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# BERT encoder and tokenizer used by get_embedding().
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Name the sentiment checkpoint explicitly instead of relying on the library's implicit
# default for the task: compute_net_effect() depends on this model's "POSITIVE" /
# "NEGATIVE" labels, so the choice must not change silently between library versions.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

In [2]:
from transformers import BertTokenizer
# Same call as in the preceding setup cell; running this cell rebinds the module-level
# `tokenizer` (which the first cell had bound to the FinBERT tokenizer) to bert-base-uncased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [2]:
from transformers import BertModel
# Same call as in the earlier setup cell; running this cell rebinds the module-level
# `model` (which the first cell had bound to a FinBERT masked-LM model) to the plain
# bert-base-uncased encoder that get_embedding() reads.
model = BertModel.from_pretrained('bert-base-uncased')

In [1]:

def load_text(filename, encoding='utf-8'):
    """Read an entire text file and return its contents.

    The encoding is stated explicitly so the result does not depend on the
    platform's default locale; UTF-8 matches how the first cell of this
    notebook reads the same input file.

    Args:
        filename: Path of the file to read.
        encoding: Text encoding of the file. Defaults to ``'utf-8'``.

    Returns:
        str: The full contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(filename, 'r', encoding=encoding) as file:
        return file.read()


def tokenize_and_count(text):
    """Count how often each word occurs in ``text``, ignoring case.

    A word is any maximal run of ``\\w`` characters in the lower-cased text.

    Args:
        text: Document text.

    Returns:
        collections.Counter: Mapping of lower-cased word to occurrence count.
    """
    lowered = text.lower()
    tally = Counter()
    for match in re.finditer(r'\b\w+\b', lowered):
        tally[match.group()] += 1
    return tally

In [2]:

def get_embedding(text):
    """Return a single vector for ``text`` by mean-pooling the model's last hidden state.

    The text is tokenised with the module-level ``tokenizer`` (truncated to at
    most 512 tokens) and passed through the module-level ``model``. The token
    vectors of the last hidden state are averaged over the sequence dimension.

    Args:
        text: The word or document to embed.

    Returns:
        numpy.ndarray: The pooled embedding with singleton dimensions squeezed out.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: disable autograd so no computation graph is built and retained
    # for each of the (potentially many) calls made per document.
    with torch.no_grad():
        outputs = model(**inputs)
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embedding


def extract_high_frequency_terms(word_counts, text, threshold=3):
    """Select frequent words and embed them together with the whole document.

    Args:
        word_counts: Mapping of word to occurrence count.
        text: The full document text; embedded via ``get_embedding``.
        threshold: Minimum count (inclusive) for a word to be kept.

    Returns:
        tuple: ``(high_freq_terms, word_embeddings, doc_embedding)`` where
        ``high_freq_terms`` maps each kept word to its count,
        ``word_embeddings`` maps each kept word to ``get_embedding(word)``,
        and ``doc_embedding`` is ``get_embedding(text)``.
    """
    frequent = {}
    for word, count in word_counts.items():
        if count >= threshold:
            frequent[word] = count

    # Embed the kept words first, then the document itself.
    vectors = {}
    for word in frequent:
        vectors[word] = get_embedding(word)
    doc_vector = get_embedding(text)

    return frequent, vectors, doc_vector

In [3]:

def compute_net_effect(text, terms, threshold=0):
    """Label each term by the summed sentiment of the sentences that mention it.

    The text is split into sentences with a regular expression. Every sentence
    that contains a term contributes its sentiment score to that term: positive
    when the analyzer labels the sentence "POSITIVE", negative otherwise.

    Args:
        text: The document to analyse.
        terms: Iterable of terms to look for. They are compared against the
            lower-cased sentences, so they should be lower-case.
        threshold: The net score must exceed ``threshold`` to be "Positive" or
            fall below ``-threshold`` to be "Negative"; anything else is "Neutral".

    Returns:
        dict: Mapping of each term to "Positive", "Negative" or "Neutral".
    """
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    lowered_sentences = [sentence.lower() for sentence in sentences]

    # Signed score per sentence index, filled lazily so each sentence goes through
    # the sentiment model at most once no matter how many terms mention it.
    signed_scores = {}

    def signed_score(index):
        if index not in signed_scores:
            # truncation=True keeps over-long "sentences" within the model's input limit.
            sentiment = sentiment_analyzer(sentences[index], truncation=True)[0]
            score = sentiment['score']
            signed_scores[index] = score if sentiment['label'] == "POSITIVE" else -score
        return signed_scores[index]

    term_effects = {}
    for term in terms:
        # Match the term only as a whole word or phrase, so that e.g. "at" is not
        # counted inside "that".
        whole_term = re.compile(r'(?<!\w)' + re.escape(term) + r'(?!\w)')
        net_effect = sum(
            signed_score(index)
            for index, lowered in enumerate(lowered_sentences)
            if whole_term.search(lowered)
        )

        if net_effect > threshold:
            term_effects[term] = "Positive"
        elif net_effect < -threshold:
            term_effects[term] = "Negative"
        else:
            term_effects[term] = "Neutral"

    return term_effects

In [4]:

def categorize_risks(term_effects, high_freq_terms, word_counts):
    """Split terms into "general" and "specific" risks by how frequent they are.

    A term is general when its count is at or above the 75th percentile of all
    word counts in the document, and specific otherwise.

    Args:
        term_effects: Mapping of term to its net-effect label.
        high_freq_terms: Mapping of term to its count; must contain every key
            of ``term_effects``.
        word_counts: Mapping of every word in the document to its count.

    Returns:
        tuple[list, list]: ``(general_risks, specific_risks)``, each a list of
        ``(term, effect)`` pairs in the iteration order of ``term_effects``.

    Raises:
        KeyError: If a term in ``term_effects`` is missing from ``high_freq_terms``.
    """
    general_risks = []
    specific_risks = []
    if not term_effects:
        # Nothing to classify, so the percentile is not needed.
        return general_risks, specific_risks

    # The cut-off depends only on word_counts, so compute it once rather than per term.
    cutoff = np.percentile(list(word_counts.values()), 75)

    for term, effect in term_effects.items():
        bucket = general_risks if high_freq_terms[term] >= cutoff else specific_risks
        bucket.append((term, effect))

    return general_risks, specific_risks

In [5]:

def process_risk_text_file(filename):
    """Analyse a text file and print its general and specific risk terms.

    Loads the file, counts its words, selects the high-frequency terms, labels
    each with a net sentiment effect, splits them into general and specific
    risks, and prints both groups.

    Args:
        filename: Path of the text file to analyse.

    Returns:
        None. The results are written to standard output.
    """
    document = load_text(filename)
    counts = tokenize_and_count(document)

    # The helper also returns embeddings; they are not needed for the printed report.
    frequent_terms, _word_vectors, _doc_vector = extract_high_frequency_terms(counts, document)
    effects = compute_net_effect(document, frequent_terms)
    general, specific = categorize_risks(effects, frequent_terms, counts)

    sections = (
        ("\nGeneral Risks with Net Effect:", general),
        ("\nSpecific Risks with Net Effect:", specific),
    )
    for heading, entries in sections:
        print(heading)
        for term, effect in entries:
            print(f"{term}: {effect}")

# Run on an example text file
# NOTE(review): the recorded output of this cell is "NameError: name 're' is not defined",
# i.e. the cell that imports `re` (and Counter, torch, transformers, numpy) had not been
# executed in the kernel when this ran — run that import/setup cell first.
process_risk_text_file("input_text.txt")

NameError: name 're' is not defined

In [None]:
import csv


def process_risk_text_file(filename, output_csv="risk_analysis.csv"):
    """Analyse a text file and save its general and specific risk terms to a CSV file.

    Loads the file, counts its words, selects the high-frequency terms, labels
    each with a net sentiment effect, splits them into general and specific
    risks, and writes one row per term with the columns "Risk Type", "Term"
    and "Net Effect". General risks are written before specific risks.

    Note: an earlier cell defines a function with the same name that prints the
    results instead; running this cell rebinds the name to this version.

    Args:
        filename: Path of the text file to analyse.
        output_csv: Path of the CSV file to create or overwrite.

    Returns:
        None. A confirmation message is printed after the file is written.
    """
    text = load_text(filename)
    word_counts = tokenize_and_count(text)

    # The helper also returns embeddings; they are not needed for the CSV report.
    high_freq_terms, _word_embeddings, _doc_embedding = extract_high_frequency_terms(word_counts, text)
    term_effects = compute_net_effect(text, high_freq_terms)
    general_risks, specific_risks = categorize_risks(term_effects, high_freq_terms, word_counts)

    risk_data = [
        {"Risk Type": risk_type, "Term": term, "Net Effect": effect}
        for risk_type, risks in (("General", general_risks), ("Specific", specific_risks))
        for term, effect in risks
    ]

    # Terms come from a Unicode-aware \w tokenizer, so write the report as UTF-8
    # explicitly instead of depending on the platform's default encoding.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["Risk Type", "Term", "Net Effect"])
        writer.writeheader()
        writer.writerows(risk_data)

    print(f"Results have been saved to {output_csv}")

# Analyse input_text.txt and write the resulting risk table to risk_analysis.csv.
process_risk_text_file("input_text.txt", "risk_analysis.csv")