Gather data from Azure Databricks

In [3]:
import pandas as pd
import pyodbc

# Open the ODBC connection registered under the "Azure Databricks" DSN and pull
# every row of the fund document table into memory with a single query.
conn = pyodbc.connect("DSN=Azure Databricks", autocommit=True)
try:
    documents = conn.execute("select fund_mf_id, document_mf_id, fund_name, document_type, document_text from fund_document_v3").fetchall()
finally:
    # fetchall() has already materialised the rows, so the connection is no
    # longer needed. Close it even when the query raises, otherwise the ODBC
    # session stays open for the lifetime of the kernel.
    conn.close()

# Convert each fetched row to a plain list before handing it to pandas.
# The column names below must stay in the same order as the SELECT list above.
raw_docs = pd.DataFrame([list(d) for d in documents],
                    columns=['fund_mf_id', 'document_mf_id', 'fund_name', 'document_type', 'document_text'])

In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
raw_docs.head(n=5)

Unnamed: 0,fund_mf_id,document_mf_id,fund_name,document_type,document_text
0,370,87604,Voleon International Investors Ltd,fund_offering,
1,403,50362,Pelham Global Financials Fund Ltd,fund_offering,OFFERING MEMORANDUM\ne\n1 JANUARY 2018\nm\ni\n...
2,248,87599,The Winton Fund Limited,fund_offering,PROSPECTUS\nTHE WINTON FUND LIMITED\n(the )\n“...
3,259,50442,Third Point Offshore Fund Ltd,fund_offering,HIGHLY CONFIDENTIAL & TRADE SECRET\nCONFIDENTI...
4,414,114800,Sino Vision - Greater China Market Neutral Fund,fund_offering,PRIVATE OFFERING MEMORANDUM Copy No: _________...


In [5]:
# Keep only documents that actually contain text. Surrounding whitespace is
# stripped before measuring the length, so entries consisting solely of spaces
# or newlines are dropped together with empty ones; missing values compare as
# False and are dropped as well.
raw_docs = raw_docs[raw_docs['document_text'].str.strip().str.len() > 0]

Text preprocessing

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on: the stop-word corpus and
# the 'punkt' tokenizer data. Resources are requested in this fixed order.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

# English stop words held in a set for fast membership checks during filtering.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a document into a space-separated string of kept tokens.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only if it is purely alphanumeric (``str.isalnum``) and is not in the
    module-level ``stop_words`` set; punctuation and any token containing a
    non-alphanumeric character are therefore discarded along with stop words.

    Args:
        text: The raw document text (a string).

    Returns:
        The surviving tokens joined by single spaces, in their original order.
        The result is an empty string when no token survives.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation and mixed tokens such as "risk-adjusted".
        if not token.isalnum():
            continue
        # Skip common English function words.
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run the cleaning function over every document; the result is a Series that
# shares raw_docs' index.
preprocessed_docs = raw_docs.document_text.apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\viksu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\viksu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic modeling

In [7]:
from bertopic import BERTopic

# Build a BERTopic model whose topic keywords may be single words or two-word
# phrases (n-grams of length 1 to 2).
# NOTE(review): no random seed is fixed anywhere in this cell, so topic ids and
# counts may differ between runs — confirm whether reproducibility is required.
topic_model = BERTopic(n_gram_range=(1, 2))

# Fit the model and assign a topic to every document. The Series is converted
# to a plain list of strings because that is the input type fit_transform
# documents; it also keeps the filtered (gapped) pandas index out of the model.
topics, probabilities = topic_model.fit_transform(list(preprocessed_docs))

# Summary table with one row per topic (the captured output shows Topic, Count,
# Name and Representation columns). Left as the last expression so the notebook
# renders it as a rich table instead of truncated plain text.
topics_info = topic_model.get_topic_info()
topics_info

    Topic  Count                                               Name  \
0      -1   2183                      -1_fund_investment_may_shares   
1       0    685                0_credit_performance_market_returns   
2       1    201                     1_may_icav_investment_relevant   
3       2    187                2_subscriber_person_entity_investor   
4       3    127               3_capital_investment_management_risk   
..    ...    ...                                                ...   
79     78     11      78_fund_master_master fund_investment manager   
80     79     11  79_adviser_segregated_segregated portfolio_shares   
81     80     11                   80_company_shares_may_segregated   
82     81     11              81_wace_29 sep_marshall wace_marshall   
83     82     11              82_subscriber_entity_lackstone_person   

                                       Representation  \
0   [fund, investment, may, shares, master, manage...   
1   [credit, performance, market,

In [None]:
# import pickle
# pickle.dump(topics_info, open('topics_info.pkl', 'wb'))

Evaluation

In [None]:
# Write the topic summary table to a CSV file in the working directory,
# leaving the DataFrame index out of the file.
topics_info.to_csv(path_or_buf='topics_info.csv', index=False)