In [20]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import math

In [9]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swoye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swoye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

TF-IDF

In [11]:
text = """Welcome to the heart of the financial world — the trading floor.

Here, every tick of the market tells a story. Every second counts. Every decision echoes through numbers and charts.

Before we dive in, let me walk you through what makes this space more than just flashing screens — it’s where instinct meets data, and strategy meets speed."""

documents = [p.strip() for p in text.split('\n\n') if p.strip()]

stop_words = set(stopwords.words('english'))

for each_doc in documents:
    print(each_doc)

Welcome to the heart of the financial world — the trading floor.
Here, every tick of the market tells a story. Every second counts. Every decision echoes through numbers and charts.
Before we dive in, let me walk you through what makes this space more than just flashing screens — it’s where instinct meets data, and strategy meets speed.


In [12]:
def tokenize(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    return tokens

finalized_docs = [tokenize(doc) for doc in documents]
    
for each_doc in finalized_docs:
    print(each_doc)

['Welcome', 'heart', 'financial', 'world', '—', 'trading', 'floor', '.']
['Here', ',', 'every', 'tick', 'market', 'tells', 'story', '.', 'Every', 'second', 'counts', '.', 'Every', 'decision', 'echoes', 'numbers', 'charts', '.']
['Before', 'dive', ',', 'let', 'walk', 'makes', 'space', 'flashing', 'screens', '—', '’', 'instinct', 'meets', 'data', ',', 'strategy', 'meets', 'speed', '.']


TF

In [21]:
def comput_tf(tokens):
    count_dic = Counter(tokens)
    total_tokens = len(tokens)
    tokens_tf = {term: count/total_tokens for term, count in count_dic.items()}
    return tokens_tf

finalized_tf = [comput_tf(doc) for doc in finalized_docs]

for tf in finalized_tf:
    print(tf)

{'Welcome': 0.125, 'heart': 0.125, 'financial': 0.125, 'world': 0.125, '—': 0.125, 'trading': 0.125, 'floor': 0.125, '.': 0.125}
{'Here': 0.05555555555555555, ',': 0.05555555555555555, 'every': 0.05555555555555555, 'tick': 0.05555555555555555, 'market': 0.05555555555555555, 'tells': 0.05555555555555555, 'story': 0.05555555555555555, '.': 0.16666666666666666, 'Every': 0.1111111111111111, 'second': 0.05555555555555555, 'counts': 0.05555555555555555, 'decision': 0.05555555555555555, 'echoes': 0.05555555555555555, 'numbers': 0.05555555555555555, 'charts': 0.05555555555555555}
{'Before': 0.05263157894736842, 'dive': 0.05263157894736842, ',': 0.10526315789473684, 'let': 0.05263157894736842, 'walk': 0.05263157894736842, 'makes': 0.05263157894736842, 'space': 0.05263157894736842, 'flashing': 0.05263157894736842, 'screens': 0.05263157894736842, '—': 0.05263157894736842, '’': 0.05263157894736842, 'instinct': 0.05263157894736842, 'meets': 0.10526315789473684, 'data': 0.05263157894736842, 'strateg

IDF

In [32]:
def compute_idf(tokens):
    doc_len = len(finalized_docs)
    idf_doc = {}
    final_tokens = set(tokens)
    for term in final_tokens:
        term_freq = 0
        for doc in finalized_docs:
            if term in doc:
                term_freq += 1
        idf_doc[term] = math.log((doc_len + 1) / (term_freq + 1)) + 1
    return idf_doc

finalized_idf = [compute_idf(doc) for doc in finalized_docs]

for idf in finalized_idf:
    print(idf)

{'trading': 1.6931471805599454, '.': 1.0, 'financial': 1.6931471805599454, '—': 1.2876820724517808, 'floor': 1.6931471805599454, 'Welcome': 1.6931471805599454, 'heart': 1.6931471805599454, 'world': 1.6931471805599454}
{'Here': 1.6931471805599454, 'tells': 1.6931471805599454, '.': 1.0, ',': 1.2876820724517808, 'numbers': 1.6931471805599454, 'market': 1.6931471805599454, 'Every': 1.6931471805599454, 'story': 1.6931471805599454, 'every': 1.6931471805599454, 'second': 1.6931471805599454, 'tick': 1.6931471805599454, 'echoes': 1.6931471805599454, 'charts': 1.6931471805599454, 'decision': 1.6931471805599454, 'counts': 1.6931471805599454}
{'strategy': 1.6931471805599454, '’': 1.6931471805599454, '.': 1.0, 'space': 1.6931471805599454, 'let': 1.6931471805599454, 'dive': 1.6931471805599454, 'flashing': 1.6931471805599454, 'walk': 1.6931471805599454, ',': 1.2876820724517808, 'instinct': 1.6931471805599454, 'speed': 1.6931471805599454, 'screens': 1.6931471805599454, 'makes': 1.6931471805599454, '—'

TF-IDF

In [44]:
def compute_tfidf(tf, idf):
    tfidf_score = {term: tf_val * idf.get(term, 0) for term, tf_val in tf.items()}
    return tfidf_score

tfidf_doc = [compute_tfidf(tf, finalized_idf[x]) for x, tf in enumerate(finalized_tf)]

for tfidf in tfidf_doc:
    print(tfidf)

{'Welcome': 0.21164339756999317, 'heart': 0.21164339756999317, 'financial': 0.21164339756999317, 'world': 0.21164339756999317, '—': 0.1609602590564726, 'trading': 0.21164339756999317, 'floor': 0.21164339756999317, '.': 0.125}
{'Here': 0.0940637322533303, ',': 0.07153789291398782, 'every': 0.0940637322533303, 'tick': 0.0940637322533303, 'market': 0.0940637322533303, 'tells': 0.0940637322533303, 'story': 0.0940637322533303, '.': 0.16666666666666666, 'Every': 0.1881274645066606, 'second': 0.0940637322533303, 'counts': 0.0940637322533303, 'decision': 0.0940637322533303, 'echoes': 0.0940637322533303, 'numbers': 0.0940637322533303, 'charts': 0.0940637322533303}
{'Before': 0.08911300950315501, 'dive': 0.08911300950315501, ',': 0.13554548131071376, 'let': 0.08911300950315501, 'walk': 0.08911300950315501, 'makes': 0.08911300950315501, 'space': 0.08911300950315501, 'flashing': 0.08911300950315501, 'screens': 0.08911300950315501, '—': 0.06777274065535688, '’': 0.08911300950315501, 'instinct': 0.0

ORDERS OF WORDS IN DOCS

In [50]:
def high_order(ti_docs):
    for i, doc in enumerate(ti_docs):
        print(f"\nFor Doc {i + 1}: \n")
        sorted_doc = sorted(doc.items(), key=lambda x: x[1], reverse=True)[:10]

        for term, score in sorted_doc:
            print(f"{term}: {score:.5f}")

high_order(tfidf_doc)


For Doc 1: 

Welcome: 0.21164
heart: 0.21164
financial: 0.21164
world: 0.21164
trading: 0.21164
floor: 0.21164
—: 0.16096
.: 0.12500

For Doc 2: 

Every: 0.18813
.: 0.16667
Here: 0.09406
every: 0.09406
tick: 0.09406
market: 0.09406
tells: 0.09406
story: 0.09406
second: 0.09406
counts: 0.09406

For Doc 3: 

meets: 0.17823
,: 0.13555
Before: 0.08911
dive: 0.08911
let: 0.08911
walk: 0.08911
makes: 0.08911
space: 0.08911
flashing: 0.08911
screens: 0.08911
