In [2]:
!pip install transformers torch



In [3]:
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
import pickle
docs = pickle.load(open('raw_docs_filtered.pkl', 'rb'))

In [5]:
topics = {
    "credit": ["credit"],
    "performance": ["performance", "long term performance", "returns", "style drift", "systematic risks", "loss", "tail risks", "maximum drawdown", "track record", "long term"],
    "market": ["market"],
    "investment approach": ["investment approach", "sources of alpha", "investment process"],
    "portfolio": ["portfolio", "portfolio implementation", "risk management", "risk management system", "stop-loss limit", "liquidity", "leverage", "hedging", "risk"],
    "stress": ["stress", "performance short term", "stress test", "scenario analysis", "manager's outlook"],
    "team": ["team", "decision making", "staffing", "employees", "investment professionals", "investment teams", "incentive", "compensation", "investment manager"],
    "organization": ["organization", "esg policies", "administrator", "auditor", "prime broker", "trading system", "compliance", "IT infrastructure", "business continuity"],
    "fees": ["fees", "pass through fee", "management fee", "incentive fee"],
    "dealing terms": ["dealing terms", "lock-up", "pay-out ratio", "subscription", "redemption"],
    "asia": ["asia", "china", "csi"]
}


In [6]:
def extract_context(text, topics, window_size=5):
    """
    Extract context of given topics from text using token-based approach.
    
    Parameters:
    - text (str): The input text.
    - topics (dict): Dictionary with topics as keys and lists of keywords as values.
    - window_size (int): Number of tokens to include before and after the topic keyword.
    
    Returns:
    - dict: Dictionary with topics as keys and list of contexts as values.
    """
    # Process the text
    doc = nlp(text)
    tokens = [token.text for token in doc]
    topic_positions = {topic: set() for topic in topics}
    
    for topic, phrases in topics.items():
        for phrase in phrases:
            phrase_tokens = phrase.split()
            n = len(phrase_tokens)
            
            for i in range(len(tokens) - n + 1):
                ngram_tokens = tokens[i:i + n]
                
                if ngram_tokens == phrase_tokens:
                    start = max(0, i - window_size)
                    end = min(len(doc), i + window_size + 1)
                    context = tokens[start:end]
                    context_text = ' '.join(context)
                    topic_positions[topic].add(context_text)
                    
    
    return topic_positions

In [14]:
docs.head()

Unnamed: 0,fund_mf_id,document_mf_id,fund_name,document_type,document_text
0,3253,123779,Candlestick Cayman Feeder Ltd,quarterly_report,Q1 2024\nC A N D L E S T I C K C A P I T A L\n...
1,2014,123730,Davidson Kempner Long-Term Distressed Opportun...,quarterly_report,confidential\nDanuta Neumann\nHedge Pole\nMay ...
2,3113,123731,Davidson Kempner Long-Term Distressed Opportun...,quarterly_report,confidential\nDanuta Neumann\nHedge Pole\nMay ...
3,936,123678,D.E. Shaw Valence International Fund LP,quarterly_report,1166 Avenue of the Americas\nNinth Floor\nNew ...
4,2780,123676,D.E. Shaw Composite International Fund,quarterly_report,SM\nCOMPOSITE\nQUARTERLY PERFORMANCE SUMMARY M...


In [19]:
def add_context(df):
    df['contexts'] = df['document_text'].apply(lambda x: extract_context(x, topics, window_size=5))
    return df

In [29]:
tmp = add_context(docs[:1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['contexts'] = df['document_text'].apply(lambda x: extract_context(x, topics, window_size=5))


In [35]:
tmp['contexts'][0]

{'credit': set(),
 'performance': {'\n allocation . The net returns are calculated based on a',
  '\n indication of the broader performance of the markets among which',
  '\n largest contributors to our performance in the quarter , while',
  '\n r \n QTD \n returns in Q1 equated to 5.1%-5.2',
  '\n t \n L \n returns include “ new issue ”',
  ', struggling to earn economic returns . It became \n clear',
  '- level , unaudited monthly performance information for \n r \n',
  'As such , the net returns include “ new issue ”',
  'Estate sectors detracted from our returns . \n See Notes and',
  'To the extent we quote returns on this index , such',
  'allocation . Furthermore , gross performance is \n i \n r',
  'any \n time . The performance information presented in this summary',
  'are unable to generate economic returns , absent \n a \n',
  'calculate the \n c \n performance information set forth in this',
  'equated to 5.1%-5.2 % net returns for new issue eligible investors',
  'in this

In [20]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax

MODEL = f"soleimanian/financial-roberta-large-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


def get_sentiments(contexts):
    df = pd.DataFrame(columns=['negative', 'neutral', 'positive'])
    for topic, context_list in contexts.items():
        if not context_list:
            continue
        context_list = [f"{context}" for context in context_list]
        encoded_input = tokenizer(context_list, return_tensors='pt', padding=True, truncation=True, max_length=512)
        output = model(**encoded_input)
        scores = [softmax(x.detach().numpy()) for x in output.logits]
        df2 = pd.DataFrame(scores, columns=['negative', 'neutral', 'positive'])
        df2['text'] = context_list
        df2['topic'] = topic
        df = pd.concat([df, df2], ignore_index=True)
    return df



In [41]:
sent = get_sentiments(tmp['contexts'][0])
sent

  df = pd.concat([df, df2], ignore_index=True)


Unnamed: 0,negative,neutral,positive,text,topic
0,0.000193,0.999537,0.00027,water mark . The cumulative returns for a \n i \n,performance
1,0.000208,0.99958,0.000212,\n t \n L \n returns include “ new issue ”,performance
2,0.000756,0.001121,0.998123,s \n u \n blended performance and high - water...,performance
3,0.00017,0.999533,0.000298,"As such , the net returns include “ new issue ”",performance
4,0.000351,0.999497,0.000152,\n indication of the broader performance of th...,performance
5,0.00024,0.999613,0.000148,of \n market activity and performance . It is ...,performance
6,0.000229,0.999618,0.000153,"To the extent we quote returns on this index ,...",performance
7,0.000194,0.999592,0.000213,in this update . The performance information i...,performance
8,0.000183,0.999594,0.000223,individual investor . The cumulative returns f...,performance
9,0.998816,0.000917,0.000267,", struggling to earn economic returns . It bec...",performance


In [39]:
# find the average sentiment for each topic
def average_sentiments(df):
    del df['text']
    df = df.groupby('topic').mean()
    return df

In [40]:
average_sentiments(sent)

Unnamed: 0_level_0,negative,neutral,positive
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fees,0.000253,0.901886,0.097861
market,0.142418,0.857353,0.000229
performance,0.102252,0.812978,0.08477
portfolio,0.168314,0.389982,0.441704
team,0.000215,0.999539,0.000246
