In [1]:
import string
from datetime import datetime
from functools import wraps
from multiprocessing.pool import Pool
from multiprocessing.spawn import freeze_support
from pathlib import Path

import nltk
import numpy as np
from nltk import word_tokenize, PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.api import StemmerI

# Fetch the 'punkt' and 'stopwords' NLTK packages; the cells below use
# word_tokenize and stopwords.words, which read from NLTK's downloaded data.
nltk.download('punkt')
nltk.download('stopwords')

# NOTE(review): freeze_support() is presumably here for the multiprocessing Pool
# used further down; the guard limits it to runs where this code is the main
# module — confirm it is actually needed inside a notebook.
if __name__ == '__main__':
    freeze_support()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\calga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\calga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def print_dict_ordered_on_keys(d: dict, reverse: bool = True) -> None:
    """
    Prints a dictionary with its entries ordered by value.

    Note: despite the function's name, the sort key is each entry's *value*
    (``item[1]``), not its key. The name is kept so existing calls keep working.

    Args:
        d: the dictionary to print
        reverse: if True (the default) the largest values are printed first,
            otherwise the smallest values are printed first

    Returns:
        None
    """
    by_value = sorted(d.items(), key=lambda item: item[1], reverse=reverse)
    # dict() preserves insertion order, so the printed dict follows the sorted order
    print(dict(by_value))

In [3]:
def log_time(func):
    """
    Decorator that prints how long each call to ``func`` took.

    The elapsed time is the difference between two ``datetime.now()`` readings
    taken immediately before and after the call. The wrapped function's return
    value is passed through unchanged; nothing is printed if the call raises.

    Args:
        func: the callable to time

    Returns:
        A wrapper carrying ``func``'s metadata (via ``functools.wraps``)
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        began = datetime.now()
        outcome = func(*args, **kwargs)
        elapsed = datetime.now() - began
        print(f'Function {func.__name__} Took {elapsed}')
        return outcome

    return timed_call

In [4]:
# Load one report from the corpus folder as a worked example and show its raw text
with open(
        "../../../FilestoreRepo/FTSE100Info/txtReports/3i_group/20220628_JP_Morgan_III-LN_3i-_Exclusive_talks_to_sell_Havea_with_proceeds_expected.tei.xml.txt",
        "r", encoding="utf-8") as f:
    document = f.read()
print(document)


Europe Equity Research
23 Jun 2022
/J PMorgan
ChristopherBrown
christopher.brown@jpmorgan.com
CFAAdamKelly
adam.kelly@jpmorgan.com
JP MorganSecurities
LiamMacdonald-Raggett -Sales
JPMorgan
Investment Companies
UK
Investment Industry Regulatory Organization
Canada
Europe Equity Research
23 Jun 2022
37FDCB7B9D122FDA48C78B46395D668F
GROBID - A machine learning software for extracting information from scholarly documents
J.P. Morgan does and seeks to do business with companies covered in its research reports. As a result, investors should be aware that the firm may have a conflict of interest that could affect the objectivity of this report. Investors should consider this report as only a single factor in making their investment decision.
3i has announced that it, and its co-investor Cathay Capital, has entered into exclusive negotiations to sell Havea Group to BC Partners, alongside Havea's management team. Closing is expected in Q4 22 and proceeds are expected to be at around a 50% upli

In [5]:
# Map every character of string.punctuation to a single space, then lower-case the text
document = document.translate(
    str.maketrans(dict.fromkeys(string.punctuation, " "))
).lower()

In [6]:
# Split into tokens
# Wrapping the token list in set() keeps each distinct token once, so
# occurrence counts are dropped at this point.
doc_tokens = set(word_tokenize(document))
print(doc_tokens)

{'2242', 'sale', '25', 'encouraging', 'obligations', 'back', 'recommendations', 'banking', 'outcome', 'subsidiaries', 'jun', 'ministry', 'contrary', 'within', 'promotion', 'jurisdictions', 'at', 'directed', 'engaged', 'note', 'grobid', 'tax', 'gain', 'recipients', 'bureau', 'differ', 'announced', 'sanctioned', 'east', 'accordingly', 'if', 'defined', 'actual', 'funds', '£456m', 'sipc', 'governmental', 'central', 'silom', 'uncertainties', 'add', 'can', 'recession', 'estimates123', 'prevention', 'rates', 'since', 'prudential', 'indicative', 'nyse', 'collectively', 'premium', 'please', 'assigned', 'j', 'jpmss', 'businesses', '3990', 'discount', 'coverage', 'firms', 'intervention', 'entire', 'according', 'take', 'conflicts', 'professional', 'do', 'fundamental', '06', 'less', 'would', 'disclosures', 'england', 'nfa', 'change', 'top', 'changes', 'documents', 'connection', 'objectives', 'h1', 'exchange', '09', 'africa', 'event', 'accurate', 'notice', 'its', 'account', '45', 'finance', 'will', 

In [7]:
# Number of distinct tokens in the document (doc_tokens is a set)
print(len(doc_tokens))

941


In [8]:
# Reduce every token to its Porter stem. Rebuilding the set merges tokens that
# share a stem, which is why the count printed below is smaller than before.
ps = PorterStemmer()
doc_tokens = {ps.stem(doc_token) for doc_token in doc_tokens}
# print(doc_tokens)
print(len(doc_tokens))

799


In [9]:
# Drop English stop words from the token set.
# The stop-word list is loaded once into a set instead of evaluating
# stopwords.words("english") again for every token.
# NOTE(review): the tokens were already stemmed in the previous cell, so a stop
# word is only removed here when its stem is itself in the stop-word list;
# preprocess_text filters before stemming instead — confirm which order is intended.
english_stop_words = set(stopwords.words("english"))
doc_tokens = {doc_token for doc_token in doc_tokens if doc_token not in english_stop_words}
print(len(doc_tokens))
# print(doc_tokens)

733


In [10]:
def preprocess_text(text_or_filepath: str | Path, stemmer: StemmerI = PorterStemmer(), as_list: bool = False)\
        -> list[str] | set[str]:
    """
    Pre-processes text by:
        replacing punctuation with white space and lower-casing
        tokenizing the string
        removing stop words
        stemming tokens to common roots

    Args:
        text_or_filepath: the text to pre-process, or a Path to a utf-8 file containing the text.
            A plain str is always treated as the text itself, never as a path.
        stemmer: the stemmer to use if not given a default stemmer of the nltk.stem.porter.PorterStemmer is used
        as_list: if true will calculate and return values as a list else will calculate and return as a set

    Returns:
        Either a list or a set of all pre-processed tokens in the text
        If a list it keeps the token order of the text and has the number of duplicate terms as occur in the text

    Raises:
        FileNotFoundError: if text_or_filepath is a Path that doesn't point to a file
    """
    if isinstance(text_or_filepath, Path):
        if not text_or_filepath.is_file():
            # Fail with the explanatory message instead of printing it and then crashing inside open()
            raise FileNotFoundError(f"filepath {text_or_filepath.resolve()} doesn't point to a file")
        text = text_or_filepath.read_text(encoding="utf-8")
    else:
        text = text_or_filepath

    # Replace all punctuation with white space and make lower case
    punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    text = text.translate(punctuation_to_space).lower()

    # Load the stop words once per call; a set gives constant-time membership tests
    # (previously stopwords.words("english") was evaluated again for every token)
    stop_words = frozenset(stopwords.words("english"))

    # Split into tokens
    tokens = word_tokenize(text)
    if not as_list:
        # De-duplicate up front so each distinct token is filtered and stemmed only once
        tokens = set(tokens)

    # Remove stop words first, then stem what is left
    stems = (stemmer.stem(token) for token in tokens if token not in stop_words)
    return list(stems) if as_list else set(stems)

In [11]:
# Build the document-frequency table: for every term, the number of .txt files
# beneath the folder whose pre-processed token set contains it
root_folder_path = Path("../../../FilestoreRepo/FTSE100Info/txtReports/3i_group")
term_doc_count = dict()
num_docs = 0
for filepath in root_folder_path.rglob("*.txt"):
    print(f"{filepath.resolve()}")
    if not filepath.is_file():
        continue
    num_docs += 1
    with open(filepath, "r", encoding="utf-8") as f:
        document = f.read()
    # as_list=False yields a set, so a term is counted at most once per document
    doc_tokens = preprocess_text(document, as_list=False)
    for token in doc_tokens:
        term_doc_count[token] = term_doc_count.get(token, 0) + 1

C:\Users\calga\Documents\GitHubRepos\FilestoreRepo\FTSE100Info\txtReports\3i_group\20220524_Barclays_III-LN_3i_Group_PLC-_Management_presentation_highlights_that_th.tei.xml.txt
C:\Users\calga\Documents\GitHubRepos\FilestoreRepo\FTSE100Info\txtReports\3i_group\20220525_JP_Morgan_III-LN_3i-_11-_fall_yesterday_on_poor_US_retail_sentiment_is_an.tei.xml.txt
C:\Users\calga\Documents\GitHubRepos\FilestoreRepo\FTSE100Info\txtReports\3i_group\20220607__-EGAN_JONES_on_3i_Group_plc.tei.xml.txt
C:\Users\calga\Documents\GitHubRepos\FilestoreRepo\FTSE100Info\txtReports\3i_group\20220628_JP_Morgan_III-LN_3i-_Exclusive_talks_to_sell_Havea_with_proceeds_expected.tei.xml.txt
C:\Users\calga\Documents\GitHubRepos\FilestoreRepo\FTSE100Info\txtReports\3i_group\20220706_JP_Morgan_III-LN_3i-_Portfolio_update_reassures_no_changes_to_estimates.tei.xml.txt
C:\Users\calga\Documents\GitHubRepos\FilestoreRepo\FTSE100Info\txtReports\3i_group\20220707_Barclays_III-LN_3i_Group_PLC-_AGM_statement_reiterates_portfolio_r

In [12]:
# Show the raw document-frequency table: term -> number of documents containing it
print(term_doc_count)

{'roll': 3, '202214': 4, '2242': 9, 'sale': 15, 'judg': 2, '506504': 6, 'demand': 5, '25': 10, 'evalu': 6, 'besant': 5, 'difc': 9, 'tokyo': 9, 'dw': 5, 'back': 14, 'powder': 1, 'villag': 6, 'includ': 15, 'jul': 7, '400': 5, 'track': 7, 'jun': 7, 'hous': 5, 'compound': 3, 'start': 8, 'within': 15, 'perform': 15, 'month': 14, 'west': 9, 'full': 3, '1002': 6, 'anniversari': 1, 'discuss': 14, 'iiroc': 6, 'austria': 6, 'intellectu': 3, 'oblig': 10, 'note': 12, 'tax': 8, 'grobid': 16, 'transact': 11, 'gain': 7, 'execut': 7, 'differ': 15, 'relev': 14, 'east': 13, 'understand': 6, 'boulevard': 6, 'gate': 9, 'split': 6, 'fbk': 6, 'malta': 6, 'spend': 4, 'liquid': 14, 'actual': 13, 'sipc': 11, 'central': 14, 'lost': 6, 'hatchley': 1, 'user': 6, 'furthermor': 14, 'abu': 6, 'healthcar': 8, 'equal': 6, 'unauthor': 6, 'mediolanum': 5, 'unit': 9, 'emg': 6, 'premium': 12, 'accordingli': 14, 'j': 14, '106': 6, 'regist': 14, 'discount': 14, 'standard': 12, 'histor': 8, 'servic': 15, 'therewith': 6, 'lia

In [13]:
# Inverse document frequency: idf(term) = ln(num_docs / number of documents containing the term).
# A term found in every document scores ln(1) = 0; terms found in fewer documents score higher.
idf = {term: np.log(num_docs / frequency) for term, frequency in term_doc_count.items()}

In [14]:
# Despite its name, the helper sorts on the dictionary values, so the terms with
# the highest idf (found in the fewest documents) are printed first.
print_dict_ordered_on_keys(idf)

{'powder': 2.772588722239781, 'anniversari': 2.772588722239781, 'hatchley': 2.772588722239781, 'host': 2.772588722239781, 'withdrawn': 2.772588722239781, 'restart': 2.772588722239781, 'frame': 2.772588722239781, 'justifi': 2.772588722239781, 'perceiv': 2.772588722239781, 'eurofund': 2.772588722239781, 'julia': 2.772588722239781, 'reach': 2.772588722239781, 'zaimukyokuchoasia': 2.772588722239781, 'focal': 2.772588722239781, 'gradual': 2.772588722239781, 'ten23': 2.772588722239781, 'pilot': 2.772588722239781, '4338a27f44e32d6e7a0fd342f3cddc82': 2.772588722239781, 'situat': 2.772588722239781, 'meaning': 2.772588722239781, 'wilson': 2.772588722239781, 'sander': 2.772588722239781, 'overweightamundi': 2.772588722239781, 'lengthen': 2.772588722239781, 'positivebanca': 2.772588722239781, 'crisi': 2.772588722239781, '3914': 2.772588722239781, '1735': 2.772588722239781, 'salesforc': 2.772588722239781, 'demograph': 2.772588722239781, 'poland': 2.772588722239781, 'aggress': 2.772588722239781, '241

In [15]:
# Term frequencies for a single test document taken from the corpus folder
test_file_path: Path = Path(
    "../../../FilestoreRepo/FTSE100Info/txtReports/3i_group/20220628_JP_Morgan_III-LN_3i-_Exclusive_talks_to_sell_Havea_with_proceeds_expected.tei.xml.txt")

with open(test_file_path, "r", encoding="utf-8") as f:
    test_document = f.read()

# Get the tokens from the test document (as a list, so repeated terms are kept)
query_tokens: list = preprocess_text(test_document, as_list=True)

# Generate the unique list of words
unique_tokens = set(query_tokens)

# Count every token in a single pass over the list rather than calling
# query_tokens.count() once per unique token, which rescans the whole list each time.
# Seeding the dict from unique_tokens keeps the same key order as iterating the set.
wordcounts = dict.fromkeys(unique_tokens, 0)
for token in query_tokens:
    wordcounts[token] += 1
print(wordcounts)

{'2242': 1, 'sale': 6, 'need': 1, '25': 1, 'back': 1, 'suffici': 2, 'includ': 4, 'navig': 1, 'jun': 2, 'illiquid': 1, 'aggreg': 1, 'within': 11, 'month': 7, 'perform': 1, 'discuss': 3, 'oblig': 2, 'note': 4, 'grobid': 1, 'tax': 1, 'transact': 2, 'gain': 2, 'bureau': 1, 'differ': 3, 'relev': 10, 'prevent': 1, 'websit': 3, 'east': 1, 'actual': 1, '£456m': 1, 'sipc': 1, 'central': 1, 'silom': 1, 'unbundl': 2, 'furthermor': 1, 'add': 1, 'estimates123': 1, 'healthcar': 1, 'undu': 1, 'nyse': 1, 'premium': 3, 'jpmss': 4, 'accordingli': 2, 'j': 35, '3990': 1, 'comp': 1, 'regist': 4, 'discount': 5, 'servic': 3, 'liabil': 2, 'specif': 5, 'take': 4, 'activ': 3, 'condit': 1, 'europ': 4, '06': 1, 'less': 3, 'would': 1, 'england': 1, 'nfa': 1, 'top': 1, 'import': 1, 'h1': 1, '09': 2, 'director': 1, 'chang': 2, 'africa': 2, 'event': 3, 'account': 5, '45': 1, '23': 3, 'sanction': 3, 'australia': 2, 'overli': 1, 'canada': 1, 'cfaadamkelli': 1, 'make': 4, 'unless': 6, 'disclosurest': 1, 'continu': 1, '2

In [16]:
# tf-idf per term: raw count of the term in the test document times its corpus idf.
# idf[term] raises KeyError for a term that has no idf entry; the test document sits in
# the folder the idf was built from — NOTE(review): guard this lookup before using other query documents.
tfidf = {term: frequency * idf[term] for term, frequency in wordcounts.items()}
print(tfidf)

{'2242': 0.5753641449035618, 'sale': 0.387231126825427, 'need': 1.1631508098056809, '25': 0.47000362924573563, 'back': 0.13353139262452257, 'suffici': 1.3862943611198906, 'includ': 0.25815408455028466, 'navig': 2.772588722239781, 'jun': 1.6533571463689358, 'illiquid': 2.772588722239781, 'aggreg': 0.9808292530117262, 'within': 0.7099237325132828, 'month': 0.934719748371658, 'perform': 0.06453852113757116, 'discuss': 0.4005941778735677, 'oblig': 0.9400072584914713, 'note': 1.1507282898071234, 'grobid': 0.0, 'tax': 0.6931471805599453, 'transact': 0.7493868988828214, 'gain': 1.6533571463689358, 'bureau': 1.3862943611198906, 'differ': 0.1936155634127135, 'relev': 1.3353139262452256, 'prevent': 1.1631508098056809, 'websit': 1.7260924347106854, 'east': 0.20763936477824455, 'actual': 0.20763936477824455, '£456m': 2.772588722239781, 'sipc': 0.3746934494414107, 'central': 0.13353139262452257, 'silom': 0.6931471805599453, 'unbundl': 3.347952867143343, 'furthermor': 0.13353139262452257, 'add': 1.6

In [17]:
@log_time
def calculate_idf_for_corpus(root_folder_path: Path) -> tuple[dict[str, float], dict[str, int], int]:
    """
    Calculates the idf of terms from all .txt files beneath the folder path given

    The idf of a term is ln(number of documents / number of documents containing the term).

    Args:
        root_folder_path: the Path object to the folder that contains the txt files to be analyzed

    Returns:
        A tuple of 3 objects containing:
        1. A dictionary from term to inverse document frequency
        2. A dictionary from term to count of number of documents it occurs in
        3. The count of number of documents analyzed
        Both dictionaries are empty and the count is 0 when no .txt file is found
    """
    term_doc_count: dict[str, int] = {}
    num_docs = 0
    for filepath in root_folder_path.rglob("*.txt"):
        if not filepath.is_file():
            continue
        num_docs += 1
        # preprocess_text reads the file itself when handed a Path; as_list=False gives a set,
        # so a term is counted at most once per document
        for token in preprocess_text(filepath, as_list=False):
            term_doc_count[token] = term_doc_count.get(token, 0) + 1

    idf = {term: np.log(num_docs / frequency) for term, frequency in term_doc_count.items()}
    return idf, term_doc_count, num_docs

In [18]:
# Run the reusable, timed function on the corpus folder; it returns the idf table,
# the document-frequency table and the number of documents, in that order
root_folder_path = Path("../../../FilestoreRepo/FTSE100Info/txtReports/3i_group")
idf, term_count, num_docs = calculate_idf_for_corpus(root_folder_path)

Function calculate_idf_for_corpus Took 0:00:04.438132


In [19]:
# Sanity check: the ranking printed here matches the one from the step-by-step cells above
print_dict_ordered_on_keys(idf)

{'powder': 2.772588722239781, 'anniversari': 2.772588722239781, 'hatchley': 2.772588722239781, 'host': 2.772588722239781, 'withdrawn': 2.772588722239781, 'restart': 2.772588722239781, 'frame': 2.772588722239781, 'justifi': 2.772588722239781, 'perceiv': 2.772588722239781, 'eurofund': 2.772588722239781, 'julia': 2.772588722239781, 'reach': 2.772588722239781, 'zaimukyokuchoasia': 2.772588722239781, 'focal': 2.772588722239781, 'gradual': 2.772588722239781, 'ten23': 2.772588722239781, 'pilot': 2.772588722239781, '4338a27f44e32d6e7a0fd342f3cddc82': 2.772588722239781, 'situat': 2.772588722239781, 'meaning': 2.772588722239781, 'wilson': 2.772588722239781, 'sander': 2.772588722239781, 'overweightamundi': 2.772588722239781, 'lengthen': 2.772588722239781, 'positivebanca': 2.772588722239781, 'crisi': 2.772588722239781, '3914': 2.772588722239781, '1735': 2.772588722239781, 'salesforc': 2.772588722239781, 'demograph': 2.772588722239781, 'poland': 2.772588722239781, 'aggress': 2.772588722239781, '241

In [20]:
@log_time
def calculate_idf_for_corpus_parrallel(root_folder_path: Path, max_num_processes: int = 16) -> tuple[
    dict[str, float], dict[str, int], int]:
    """
    Calculates the idf of terms from all .txt files beneath the folder path given, pre-processing
    the files in parallel with a multiprocessing Pool

    Args:
        root_folder_path: the Path object to the folder that contains the txt files to be analyzed
        max_num_processes: upper bound on the number of worker processes; the pool is never given
            more workers than there are files. Must be at least 1.

    Returns:
        A tuple of 3 objects containing:
        1. A dictionary from term to inverse document frequency
        2. A dictionary from term to count of number of documents it occurs in
        3. The count of number of documents analyzed
        Both dictionaries are empty and the count is 0 when no .txt file is found

    Raises:
        ValueError: if max_num_processes is less than 1
    """
    if max_num_processes < 1:
        raise ValueError(f"max_num_processes must be at least 1, got {max_num_processes}")

    filepaths = [filepath for filepath in root_folder_path.rglob("*.txt") if filepath.is_file()]
    num_docs = len(filepaths)
    num_processes = min(max_num_processes, num_docs)

    # Start from an empty result so a folder without .txt files yields empty dictionaries
    # instead of referencing a variable that was never assigned
    doc_tokens_list: list[set[str]] = []
    if num_processes > 0:
        # Each worker receives a Path, so preprocess_text reads the file inside the worker
        # and returns that document's set of tokens.
        # NOTE(review): worker processes must be able to import preprocess_text; with functions
        # defined in an interactive notebook this can fail or hang on spawn-based platforms
        # (the cells calling this function have no execution count) — confirm, and consider
        # moving preprocess_text into an importable module.
        with Pool(num_processes) as pool:
            doc_tokens_list = pool.map(preprocess_text, filepaths)

    term_doc_count: dict[str, int] = {}
    for doc_tokens in doc_tokens_list:
        for token in doc_tokens:
            term_doc_count[token] = term_doc_count.get(token, 0) + 1

    idf = {term: np.log(num_docs / frequency) for term, frequency in term_doc_count.items()}
    return idf, term_doc_count, num_docs

In [None]:
# Run the multiprocessing variant on the same corpus folder
# NOTE(review): this cell has no execution count or output — confirm it completes in this environment
root_folder_path = Path("../../../FilestoreRepo/FTSE100Info/txtReports/3i_group")
idf, term_count, num_docs = calculate_idf_for_corpus_parrallel(root_folder_path)

In [None]:
# Print the idf table from the parallel run, highest idf first, for comparison with the sequential result
print_dict_ordered_on_keys(idf)

In [None]:
def calculate_idf_for_corpus(root_):
    """
    Calculates the idf of terms from all .txt files beneath the given folder, printing the
    resolved path of every match as it goes.

    NOTE(review): this re-uses the name of the calculate_idf_for_corpus defined earlier in the
    notebook and replaces it once this cell runs, yet it returns only the idf dictionary rather
    than the (idf, term_doc_count, num_docs) tuple — callers that unpack three values will break.
    Consider renaming or deleting this draft.

    Args:
        root_: the Path object to the folder that contains the txt files to be analyzed

    Returns:
        A dictionary from term to inverse document frequency (empty when no .txt file is found)
    """
    term_doc_count = dict()
    num_docs = 0
    # Walk the folder that was passed in; previously the argument was ignored and the
    # notebook-level root_folder_path variable was read instead
    for filepath in root_.rglob("*.txt"):
        print(f"{filepath.resolve()}")
        if filepath.is_file():
            num_docs += 1
            with open(filepath, "r", encoding="utf-8") as f:
                document = f.read()
            # as_list=False yields a set, so a term is counted at most once per document
            doc_tokens = preprocess_text(document, as_list=False)
            for token in doc_tokens:
                term_doc_count[token] = term_doc_count.get(token, 0) + 1

    return {term: np.log(num_docs/frequency) for term, frequency in term_doc_count.items()}


In [None]:
# NOTE(review): with the single-argument calculate_idf_for_corpus defined just above in effect,
# this call returns only the idf dictionary (no term counts or document count)
root_folder_path = Path("../../../FilestoreRepo/FTSE100Info/txtReports/3i_group")
idf = calculate_idf_for_corpus(root_folder_path)