Load the dataset and import libraries

In [5]:
import pandas as pd
# Read the merged financial-sentiment CSV into a DataFrame.
df = pd.read_csv('/content/financial_sentiment_merged.csv')
# Show the first five rows of the 'Cleaned' column (kept as a one-column frame).
print(df.loc[:, ['Cleaned']].head(5))


                                             Cleaned
0  geosolutions technology leverage benefon gps s...
1                           esi low real possibility
2  last quarter 2010 componenta net sale doubled ...
3  according finnishrussian chamber commerce majo...
4  swedish buyout firm sold remaining percent sta...


# Tokenize the cleaned sentences and build word frequencies

In [6]:
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer data used by word_tokenize (no-op if already present).
nltk.download('punkt_tab')

# Tokenize each sentence. Missing values are replaced with '' first: a bare
# .astype(str) would turn NaN into the literal string 'nan', which would then
# be tokenized and counted as if it were a real word in the corpus.
tokenized_sentences = df['Cleaned'].fillna('').astype(str).apply(word_tokenize)

# Accumulate corpus-wide token counts, one sentence at a time. Counter keeps
# first-seen order, which later determines the vocabulary index order.
word_freq = Counter()
for tokens in tokenized_sentences:
    word_freq.update(tokens)

# Sanity check: the ten most frequent tokens.
print(word_freq.most_common(10))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


[('eeenf', 3328), ('tsla', 3015), ('price', 1862), ('company', 1763), ('share', 1758), ('new', 1735), ('buy', 1709), ('today', 1670), ('day', 1427), ('time', 1307)]


##  Create word2idx and idx2word dictionaries

In [9]:
import numpy as np

# Tunable settings for the vocabulary and the placeholder embedding matrix.
MIN_WORD_FREQ = 1     # keep words seen at least this many times (1 keeps every word)
EMBEDDING_DIM = 100   # width of each embedding vector
EMBEDDING_SEED = 42   # fixed seed so the random embeddings are the same on every run

# Reserved tokens are listed first, so '<PAD>' gets index 0 and '<UNK>' index 1.
special_tokens = ['<PAD>', '<UNK>']
all_words = special_tokens + [
    word for word, freq in word_freq.items() if freq >= MIN_WORD_FREQ
]

# Forward (word -> index) and reverse (index -> word) lookup tables.
word2idx = {word: idx for idx, word in enumerate(all_words)}
idx2word = {idx: word for word, idx in word2idx.items()}

print(f"Vocabulary size (including special tokens): {len(word2idx)}")

# Placeholder embeddings: one row per vocabulary entry, drawn uniformly from
# [0, 1). These are random, not trained, so any similarity computed from them
# carries no linguistic meaning. A seeded generator is used so that re-running
# the notebook reproduces the same matrix.
embedding_rng = np.random.default_rng(EMBEDDING_SEED)
embeddings = embedding_rng.random((len(word2idx), EMBEDDING_DIM))

def find_nearest_neighbors(embeddings, target_word_index, num_neighbors=5):
    """
    Finds the nearest neighbors of a given word based on cosine similarity.

    The target row itself is never returned, even when other rows are
    identical or parallel to it (similarity exactly 1.0). Rows with zero norm
    have no defined cosine similarity and are ranked as the least similar.

    Args:
        embeddings: A 2-D NumPy array (or array-like) with one embedding per row.
        target_word_index: The row index of the target word in `embeddings`.
        num_neighbors: The maximum number of nearest neighbors to retrieve.
            Fewer are returned when there are not enough other rows.

    Returns:
        A 1-D NumPy array of row indices, ordered from most to least similar.

    Raises:
        IndexError: If `target_word_index` is out of range for `embeddings`.
    """
    embeddings = np.asarray(embeddings, dtype=float)
    # Indexing first lets NumPy raise IndexError for an out-of-range target.
    target_embedding = embeddings[target_word_index]
    n_words = embeddings.shape[0]
    # Normalise a negative index so it can be compared against argsort output.
    target_idx = int(target_word_index) % n_words

    dots = embeddings @ target_embedding
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(target_embedding)

    # Divide only where the denominator is non-zero; everything else stays at
    # -inf so it sorts last instead of producing NaN (NaN would sort *after*
    # the best match and corrupt the ranking).
    similarities = np.full(dots.shape, -np.inf)
    np.divide(dots, norms, out=similarities, where=norms > 0)

    # Rank by descending similarity, then drop the target by index rather than
    # assuming it occupies the top position.
    ranked = np.argsort(-similarities, kind="stable")
    ranked = ranked[ranked != target_idx]
    return ranked[:max(int(num_neighbors), 0)]

# Keep the query word in a single variable so the lookup and the printed label
# always refer to the same word (previously 'bull' was queried but the message
# said 'stock').
target_word = 'bull'
nearest_neighbors = find_nearest_neighbors(embeddings, word2idx[target_word])
print(f"Nearest neighbors of '{target_word}': {[idx2word[idx] for idx in nearest_neighbors]}")


Vocabulary size (including special tokens): 36212
Nearest neighbors of 'stock': ['gogol', 'slumping', 'modelsplaid', 'jacked', 'metric']


# Save the vocabulary to vocabulary.json

In [10]:
import json

# Bundle both lookup tables into a single JSON document.
# Note: JSON object keys are always strings, so the integer keys of idx2word
# are written as "0", "1", ... and come back as strings when the file is loaded.
vocab = dict(word2idx=word2idx, idx2word=idx2word)

with open('vocabulary.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab))

print("Vocabulary saved as 'vocabulary.json'")


Vocabulary saved as 'vocabulary.json'
