In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import random
from sklearn.decomposition import TruncatedSVD

# Build a synthetic bag-of-words corpus, store it in both dense and sparse
# form, project both into the same 2-D TruncatedSVD space, and plot them.

# Corpus dimensions: 100 documents of 1000 tokens over a 10000-word vocabulary
vocab_size = 10000
num_docs = 100
doc_len = 1000

# Create a vocabulary of 10000 placeholder words (not read by the code below)
vocab = [f'word{i}' for i in range(vocab_size)]

# Dense term-count matrix: row i holds the per-word counts of document i
dense_vectors = np.zeros((num_docs, vocab_size))
for i in range(num_docs):
    # Sample doc_len word ids with replacement and tally them in one call
    # rather than incrementing one matrix cell per token.
    word_indices = np.random.choice(vocab_size, doc_len)
    dense_vectors[i] = np.bincount(word_indices, minlength=vocab_size)

# Convert the dense vectors to sparse (CSR) format. Building the CSR matrix
# directly from the dense array keeps both representations of the *same*
# corpus and avoids element-wise writes into a CSR matrix, which are slow and
# raise SparseEfficiencyWarning.
sparse_vectors = csr_matrix(dense_vectors)

# Fit TruncatedSVD once on the dense matrix, then project the sparse matrix
# with the same fitted components so both point sets share one 2-D space.
svd = TruncatedSVD(n_components=2)
dense_vectors_svd = svd.fit_transform(dense_vectors)
sparse_vectors_svd = svd.transform(sparse_vectors)

# Plot both projections on one scatter plot. The sparse points are expected to
# land on (nearly) the same coordinates as the dense ones, so they are drawn
# with an 'x' marker to keep the dense markers underneath visible.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(dense_vectors_svd[:, 0], dense_vectors_svd[:, 1], c='b', label='Dense vectors')
ax.scatter(sparse_vectors_svd[:, 0], sparse_vectors_svd[:, 1], c='r', marker='x', label='Sparse vectors')
ax.set_title('Dense and sparse vector scatter plot')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
ax.legend()
plt.show()
