In [2]:
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

# --- 1. Custom Dataset Creation ---
# Let's create a simple dataset of sentences.
# We'll include sentences from different topics to see how the embeddings
# might group related words together (e.g., sports, finance, weather).
dataset = [
    "The stock market saw a significant rally today.",
    "Investors are optimistic about the new trade deal.",
    "Heavy rain is expected to cause flooding in the city.",
    "The football team won the championship in the final game.",
    "A new smartphone was launched with advanced features.",
    "Economic growth shows signs of slowing down.",
    "The star player scored the winning goal.",
    "A massive storm is approaching the coast.",
    "Technology stocks are leading the market surge.",
    "The coach celebrated the victory with his team.",
    "Heavy snowfall is causing travel disruptions.",
    "Financial analysts predict a market correction.",
    "A new software update includes bug fixes.",
    "The quarterback threw a touchdown pass.",
    "The weather forecast predicts sunny skies.",
    "The company's stock price soared.",
]

# --- 2. Data Tokenization and Padding ---
# Tokenization converts words to integers. We use a Tokenizer from Keras.
# We will use a smaller vocabulary size to keep the visualization manageable.
max_words = 100
tokenizer = Tokenizer(num_words=max_words, oov_token="<unk>")
tokenizer.fit_on_texts(dataset)

# Convert sentences to sequences of integers.
sequences = tokenizer.texts_to_sequences(dataset)

# Pad the sequences to ensure they all have the same length.
# This is a requirement for feeding data into a neural network.
max_sequence_length = max(len(s) for s in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post', truncating='post')

# We'll use a dummy binary classification task to train the model.
# The model doesn't need to be good at this task; the goal is just
# to force the embedding layer to learn meaningful representations.
labels = np.array([1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1])

# --- 3. Model Training with Embedding Layer ---
# Define the parameters for our embedding layer.
embedding_dim = 16  # We'll choose a small dimension for simplicity.
vocab_size = len(tokenizer.word_index) + 1  # Total number of unique words + 1 for padding/OOV.

# Build a simple sequential model with an Embedding layer.
model = Sequential()
# The Embedding layer takes integer inputs and maps them to dense vectors.
# The weights are initialized randomly and trained during the process.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid')) # A simple classification layer.

# Compile the model.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model. The embedding layer's weights are adjusted here.
print("Training the model...")
model.fit(padded_sequences, labels, epochs=50, verbose=0)
print("Model training complete.")

# --- 4. Extract Learned Embedding Vectors ---
# The embedding vectors are the weights of the first layer in our model.
# We can access them directly.
embedding_layer = model.layers[0]
weights = embedding_layer.get_weights()[0]

# --- 5. Visualize Embeddings with t-SNE ---
# t-SNE is a non-linear dimensionality reduction technique.
# It's excellent for visualizing high-dimensional data in 2D or 3D.
print("Applying t-SNE for dimensionality reduction...")
tsne = TSNE(n_components=2, perplexity=5, n_iter=1000, random_state=42)
# We'll apply t-SNE to all vectors except the padding/OOV token at index 0.
tsne_vectors = tsne.fit_transform(weights[1:])

# Create a dictionary to map integer index back to words for plotting labels.
reverse_word_index = dict([(value, key) for (key, value) in tokenizer.word_index.items()])

# Plot the results.
plt.figure(figsize=(10, 8))
for i in range(len(tsne_vectors)):
    word_index = i + 1
    # Check if the word exists in our reverse index.
    if word_index in reverse_word_index:
        word = reverse_word_index[word_index]
        # Plot the point.
        plt.scatter(tsne_vectors[i, 0], tsne_vectors[i, 1])
        # Annotate the point with the word.
        plt.annotate(word,
                     xy=(tsne_vectors[i, 0], tsne_vectors[i, 1]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')

plt.title('t-SNE Visualization of Custom Word Embeddings')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.grid(True)
plt.show()



Training the model...
Model training complete.
Applying t-SNE for dimensionality reduction...


NameError: name 'TSNE' is not defined