In [3]:
!pip install transformers torch



In [4]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')

In [5]:
import pickle
docs = pickle.load(open('raw_docs_filtered.pkl', 'rb'))

In [6]:
topics = {
    "credit": ["credit"],
    "performance": ["performance", "long term performance", "returns", "style drift", "systematic risks", "loss", "tail risks", "maximum drawdown", "track record", "long term"],
    "market": ["market"],
    "investment approach": ["investment approach", "sources of alpha", "investment process"],
    "portfolio": ["portfolio", "portfolio implementation", "risk management", "risk management system", "stop-loss limit", "liquidity", "leverage", "hedging", "risk"],
    "stress": ["stress", "performance short term", "stress test", "scenario analysis", "manager's outlook"],
    "team": ["team", "decision making", "staffing", "employees", "investment professionals", "investment teams", "incentive", "compensation", "investment manager"],
    "organization": ["organization", "esg policies", "administrator", "auditor", "prime broker", "trading system", "compliance", "IT infrastructure", "business continuity"],
    "fees": ["fees", "pass through fee", "management fee", "incentive fee"],
    "dealing terms": ["dealing terms", "lock-up", "pay-out ratio", "subscription", "redemption"],
    "asia": ["asia", "china", "csi"]
}


In [7]:
def extract_context(text, topics, window_size=5):
    """
    Extract context of given topics from text using token-based approach.
    
    Parameters:
    - text (str): The input text.
    - topics (dict): Dictionary with topics as keys and lists of keywords as values.
    - window_size (int): Number of tokens to include before and after the topic keyword.
    
    Returns:
    - dict: Dictionary with topics as keys and list of contexts as values.
    """
    # Process the text
    doc = nlp(text)
    tokens = [token.text for token in doc]
    topic_positions = {topic: set() for topic in topics}
    
    for topic, phrases in topics.items():
        for phrase in phrases:
            phrase_tokens = phrase.split()
            n = len(phrase_tokens)
            
            for i in range(len(tokens) - n + 1):
                ngram_tokens = tokens[i:i + n]
                
                if ngram_tokens == phrase_tokens:
                    start = max(0, i - window_size)
                    end = min(len(doc), i + window_size + 1)
                    context = tokens[start:end]
                    context_text = ' '.join(context)
                    topic_positions[topic].add(context_text)
                    
    
    return topic_positions

In [8]:
docs.head()

Unnamed: 0,fund_mf_id,document_mf_id,fund_name,document_type,document_text
0,3253,123779,Candlestick Cayman Feeder Ltd,quarterly_report,Q1 2024\nC A N D L E S T I C K C A P I T A L\n...
1,2014,123730,Davidson Kempner Long-Term Distressed Opportun...,quarterly_report,confidential\nDanuta Neumann\nHedge Pole\nMay ...
2,3113,123731,Davidson Kempner Long-Term Distressed Opportun...,quarterly_report,confidential\nDanuta Neumann\nHedge Pole\nMay ...
3,936,123678,D.E. Shaw Valence International Fund LP,quarterly_report,1166 Avenue of the Americas\nNinth Floor\nNew ...
4,2780,123676,D.E. Shaw Composite International Fund,quarterly_report,SM\nCOMPOSITE\nQUARTERLY PERFORMANCE SUMMARY M...


In [9]:
def get_context(s):
    return extract_context(s['document_text'], topics, window_size=5)

def add_context(df):
    df['contexts'] = df['document_text'].apply(lambda x: extract_context(x, topics, window_size=5))
    return df

In [10]:
tmp = add_context(docs[:1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['contexts'] = df['document_text'].apply(lambda x: extract_context(x, topics, window_size=5))


In [11]:
tmp['contexts'][0]

{'credit': set(),
 'performance': {'\n allocation . The net returns are calculated based on a',
  '\n indication of the broader performance of the markets among which',
  '\n largest contributors to our performance in the quarter , while',
  '\n r \n QTD \n returns in Q1 equated to 5.1%-5.2',
  '\n t \n L \n returns include “ new issue ”',
  ', struggling to earn economic returns . It became \n clear',
  '- level , unaudited monthly performance information for \n r \n',
  'As such , the net returns include “ new issue ”',
  'Estate sectors detracted from our returns . \n See Notes and',
  'To the extent we quote returns on this index , such',
  'allocation . Furthermore , gross performance is \n i \n r',
  'any \n time . The performance information presented in this summary',
  'are unable to generate economic returns , absent \n a \n',
  'calculate the \n c \n performance information set forth in this',
  'equated to 5.1%-5.2 % net returns for new issue eligible investors',
  'in this

In [12]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax

MODEL = f"soleimanian/financial-roberta-large-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


def get_sentiments(contexts):
    df = pd.DataFrame(columns=['negative', 'neutral', 'positive'])
    for topic, context_list in contexts.items():
        if not context_list:
            continue
        context_list = [f"{context}" for context in context_list]
        encoded_input = tokenizer(context_list, return_tensors='pt', padding=True, truncation=True, max_length=512)
        output = model(**encoded_input)
        scores = [softmax(x.detach().numpy()) for x in output.logits]
        df2 = pd.DataFrame(scores, columns=['negative', 'neutral', 'positive'])
        df2['text'] = context_list
        df2['topic'] = topic
        df = pd.concat([df, df2], ignore_index=True)
    return df

In [13]:
sent = get_sentiments(tmp['contexts'][0])
sent

Unnamed: 0,negative,neutral,positive,text,topic
0,0.000491,0.998991,0.000518,’ offering documents . Net returns of other in...,performance
1,0.00079,0.998585,0.000625,offering \n documents . Net returns of other i...,performance
2,0.000209,0.999596,0.000195,any \n time . The performance information pres...,performance
3,0.00021,0.999625,0.000165,\n allocation . The net returns are calculated...,performance
4,0.000351,0.999497,0.000152,\n indication of the broader performance of th...,performance
5,0.000277,0.999568,0.000155,"or financial trends or \n performance , which ...",performance
6,0.000197,0.999602,0.0002,provides a breakdown of our performance by sec...,performance
7,0.00024,0.999613,0.000148,of \n market activity and performance . It is ...,performance
8,0.000208,0.99958,0.000212,\n t \n L \n returns include “ new issue ”,performance
9,0.000756,0.001121,0.998123,s \n u \n blended performance and high - water...,performance


In [14]:
# find the average sentiment for each topic
def average_sentiments(df):
    return df[['negative', 'neutral', 'positive', 'topic']].groupby('topic').mean()

def aggregate_sentiment(scores) -> pd.Series:
    return (
        -1 * scores['negative'] +
        0 * scores['neutral'] +
        1 * scores['positive']
    )

In [15]:
average_sentiments(sent)

Unnamed: 0_level_0,negative,neutral,positive
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fees,0.000253,0.901886,0.097861
market,0.142418,0.857353,0.000229
performance,0.102252,0.812978,0.08477
portfolio,0.168314,0.389982,0.441705
team,0.000215,0.999539,0.000246


Get all scores

In [16]:
tmp.head()['contexts'][0]

{'credit': set(),
 'performance': {'\n allocation . The net returns are calculated based on a',
  '\n indication of the broader performance of the markets among which',
  '\n largest contributors to our performance in the quarter , while',
  '\n r \n QTD \n returns in Q1 equated to 5.1%-5.2',
  '\n t \n L \n returns include “ new issue ”',
  ', struggling to earn economic returns . It became \n clear',
  '- level , unaudited monthly performance information for \n r \n',
  'As such , the net returns include “ new issue ”',
  'Estate sectors detracted from our returns . \n See Notes and',
  'To the extent we quote returns on this index , such',
  'allocation . Furthermore , gross performance is \n i \n r',
  'any \n time . The performance information presented in this summary',
  'are unable to generate economic returns , absent \n a \n',
  'calculate the \n c \n performance information set forth in this',
  'equated to 5.1%-5.2 % net returns for new issue eligible investors',
  'in this

In [17]:
from tqdm import tqdm
def topic_sentiment_analysis(documents):
    scores_dict = {}
    for i, doc in tqdm(documents.iterrows(), total=documents.shape[0]):
        context = get_context(doc)
        scores = get_sentiments(context)
        if scores.empty:
            continue
        scores_dict[doc['document_mf_id']] = aggregate_sentiment(average_sentiments(scores))
    return pd.DataFrame(scores_dict)

In [18]:
res = topic_sentiment_analysis(docs)
res

100%|██████████| 877/877 [55:53<00:00,  3.82s/it]  


Unnamed: 0_level_0,123779,123730,123731,123678,123676,123677,123615,123483,123280,123281,...,66920,84387,45581,83759,69773,100780,43397,86265,100779,98672
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
credit,,0.178673,-0.001035,,,,0.283389,0.033357,0.498359,0.289802,...,-0.4988453,,,0.249558,-0.026119,,,,,
dealing terms,,-0.005857,0.214961,,,,,,,,...,,0.000111,0.000326,,-4.1e-05,0.001717,0.00039,0.009697,0.001717,-0.164886
fees,0.097609,0.16625,6.9e-05,,7.4e-05,,2.9e-05,0.001532,0.000131,0.000115,...,0.4956582,0.000237,0.038349,,0.046134,0.000257,9.8e-05,,0.000257,0.996133
investment approach,,,,,,,,,,,...,0.9344968,,,,0.338085,,,,,
market,-0.14219,-0.001087,6.7e-05,,6e-06,,0.047601,-0.23025,0.123491,0.214307,...,0.2016708,,-0.048678,-0.123748,0.041713,3.2e-05,-0.497505,,3.2e-05,0.283777
organization,,,,-2.8e-05,,-2.8e-05,,0.002151,,,...,,,0.22127,,0.18307,,,,,-5.7e-05
performance,-0.017482,0.374337,0.080113,,-0.083941,,0.287031,-0.006213,-0.16648,0.013529,...,0.1576942,,-0.137388,-0.073745,-0.031463,,-0.199276,-2.5e-05,,0.002121
portfolio,0.273391,0.316954,0.246493,,-0.103765,,0.079394,0.078808,0.181571,0.194602,...,-0.01595792,,0.332168,0.006035,0.077594,0.546249,-0.083987,3.5e-05,0.546249,
stress,,,,,,,,-0.618588,,-6.2e-05,...,3.247696e-07,,,,,,,,,
team,3.1e-05,0.411577,0.000324,,0.002166,,0.33298,0.502923,-0.03915,0.993443,...,0.4685752,,0.106267,0.997403,-6e-06,,0.000124,1.2e-05,,1.9e-05


In [19]:
import pickle
with open('topic_sentiment_analysis.pkl', 'wb') as f:
    pickle.dump(res, f)