Gather data from Azure Databricks

In [2]:
import pandas as pd
import pyodbc

# Columns to pull from the fund document table. The same list labels the
# DataFrame columns, so the SELECT order and the labels cannot drift apart.
DOCUMENT_COLUMNS = ['fund_mf_id', 'document_mf_id', 'fund_name', 'document_type', 'document_text']

# Connects through the ODBC data source named "Azure Databricks"; that DSN must
# already be configured on the machine running this notebook.
conn = pyodbc.connect("DSN=Azure Databricks", autocommit=True)
try:
    documents = conn.execute(
        f"select {', '.join(DOCUMENT_COLUMNS)} from fund_document_v3"
    ).fetchall()
finally:
    # fetchall() has already pulled every row into memory, so the connection
    # is released here whether or not the query succeeded.
    conn.close()

# Turn each fetched row into a plain tuple so pandas spreads the fields over
# the named columns.
raw_docs = pd.DataFrame([tuple(row) for row in documents],
                        columns=DOCUMENT_COLUMNS)

SystemError: <built-in function connect> returned a result with an exception set

In [2]:
# Preview the first five rows of the loaded documents
raw_docs.head(n=5)

Unnamed: 0,fund_mf_id,document_mf_id,fund_name,document_type,document_text
0,370,87604,Voleon International Investors Ltd,fund_offering,
1,403,50362,Pelham Global Financials Fund Ltd,fund_offering,OFFERING MEMORANDUM\ne\n1 JANUARY 2018\nm\ni\n...
2,248,87599,The Winton Fund Limited,fund_offering,PROSPECTUS\nTHE WINTON FUND LIMITED\n(the )\n“...
3,259,50442,Third Point Offshore Fund Ltd,fund_offering,HIGHLY CONFIDENTIAL & TRADE SECRET\nCONFIDENTI...
4,414,114800,Sino Vision - Greater China Market Neutral Fund,fund_offering,PRIVATE OFFERING MEMORANDUM Copy No: _________...


In [7]:
# Keep only the rows whose document_text has a length greater than zero; rows
# for which that comparison is not True (e.g. no text at all) are dropped.
raw_docs = raw_docs.loc[raw_docs['document_text'].str.len().gt(0)]

Text preprocessing

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Ensure the NLTK data used below is downloaded.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the word_tokenize model from 'punkt_tab'; without it tokenization raises
# LookupError on a fresh environment.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words as a set, so membership checks in preprocess_text are fast
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lower-case, tokenize, and filter a document into a space-joined string.

    Args:
        text: Raw document text. Non-string values (for example ``None`` or
            ``NaN`` standing in for a missing document) are treated as empty.

    Returns:
        The kept tokens joined by single spaces, or ``''`` when nothing is kept.
    """
    # A missing document has no .lower(); return an empty result instead of
    # raising AttributeError in the middle of a Series.apply.
    if not isinstance(text, str):
        return ''
    # Tokenize the lower-cased text
    tokens = word_tokenize(text.lower())
    # Keep tokens that are purely alphanumeric and are not stop words. The
    # isalnum() check also discards punctuation and any token that contains a
    # non-alphanumeric character.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]
    return ' '.join(kept)

# Run every document's text through preprocess_text, one element at a time
preprocessed_docs = raw_docs['document_text'].map(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\viksu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\viksu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic modeling

In [12]:
from bertopic import BERTopic

# Create a BERTopic model with its default settings
topic_model = BERTopic()

# Fit the model on the documents. A plain list of strings is passed (the input
# type fit_transform documents) so the fit does not depend on the pandas index
# of preprocessed_docs, which has gaps after the earlier filtering of raw_docs.
topics, probabilities = topic_model.fit_transform(list(preprocessed_docs))
# TODO: Bigrams (BERTopic's n_gram_range parameter is one option)
# Display topics
topics_info = topic_model.get_topic_info()
print(topics_info)

    Topic  Count                                    Name  \
0      -1   2461           -1_fund_investment_may_master   
1       0    238      0_performance_index_return_returns   
2       1    176       1_subscriber_person_entity_please   
3       2    152    2_credit_bluebay_performance_returns   
4       3    112         3_icav_may_instruments_relevant   
..    ...    ...                                     ...   
93     92     11        92_common_acomf_partnership_fund   
94     93     11       93_201910_street_state_subscriber   
95     94     10  94_mudrick_kempner_davidson_distressed   
96     95     10                      95_der_die_und_von   
97     96     10     96_hudson_intermediate_bay_investor   

                                       Representation  \
0   [fund, investment, may, master, shares, compan...   
1   [performance, index, return, returns, msci, do...   
2   [subscriber, person, entity, please, investor,...   
3   [credit, bluebay, performance, returns, capita.

Evaluation

In [13]:
# Write the topic summary table to a CSV file, leaving out the row index
topics_info.to_csv(path_or_buf='topics_info.csv', index=False)